├── .rspec
├── Gemfile
├── .gitignore
├── data
    └── display_width.marshal.gz
├── .editorconfig
├── lib
    └── unicode
    │   ├── display_width
    │       ├── string_ext.rb
    │       ├── constants.rb
    │       ├── no_string_ext.rb
    │       ├── reline_ext.rb
    │       ├── index.rb
    │       └── emoji_support.rb
    │   └── display_width.rb
├── misc
    └── terminal-emoji-width.rb
├── MIT-LICENSE.txt
├── .github
    └── workflows
    │   └── test.yml
├── unicode-display_width.gemspec
├── Rakefile
├── CODE_OF_CONDUCT.md
├── CHANGELOG.md
├── README.md
└── spec
    └── display_width_spec.rb


/.rspec:
--------------------------------------------------------------------------------
1 | --color
2 | --format documentation
3 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 | 
3 | gemspec
4 | 
5 | gem "irb"
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | Gemfile.lock
2 | *.swp
3 | *~
4 | pkg
5 | /data/EastAsianWidth.txt
6 | /.bundle
7 | 


--------------------------------------------------------------------------------
/data/display_width.marshal.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/janlelis/unicode-display_width/HEAD/data/display_width.marshal.gz


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*]
 4 | indent_style = space
 5 | indent_size = 2
 6 | end_of_line = lf
 7 | insert_final_newline = true
 8 | trim_trailing_whitespace = true
 9 | 
10 | [*.{md,rdoc,txt}]
11 | indent_size = 4
12 | 
13 | 


--------------------------------------------------------------------------------
/lib/unicode/display_width/string_ext.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require_relative "../display_width"
 4 | 
 5 | class String
 6 |   def display_width(ambiguous = nil, overwrite = nil, old_options = {}, **options)
 7 |     Unicode::DisplayWidth.of(self, ambiguous, overwrite, old_options = {}, **options)
 8 |   end
 9 | end
10 | 


--------------------------------------------------------------------------------
/lib/unicode/display_width/constants.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | module Unicode
 4 |   class DisplayWidth
 5 |     VERSION = "3.2.0"
 6 |     UNICODE_VERSION = "17.0.0"
 7 |     DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../../data/")
 8 |     INDEX_FILENAME = DATA_DIRECTORY + "/display_width.marshal.gz"
 9 |   end
10 | end
11 | 


--------------------------------------------------------------------------------
/lib/unicode/display_width/no_string_ext.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
# Tell users that this opt-out file is obsolete: requiring the gem no longer
# loads the string extension by itself.
warn [
  "You are loading 'unicode-display_width/no_string_ext'",
  "Beginning with version 2.0, this is not necessary anymore",
  "You can just require 'unicode-display_width' now and no",
  "string extension will be loaded",
].join("\n")
7 | 
8 | require_relative "../display_width"
9 | 


--------------------------------------------------------------------------------
/lib/unicode/display_width/reline_ext.rb:
--------------------------------------------------------------------------------
 1 | # Experimental
 2 | # Patches Reline's get_mbchar_width to use Unicode::DisplayWidth
 3 | 
 4 | require "reline"
 5 | require "reline/unicode"
 6 | 
 7 | require_relative "../display_width"
 8 | 
 9 | class Reline::Unicode
10 |   def self.get_mbchar_width(mbchar)
11 |     Unicode::DisplayWidth.of(mbchar, Reline.ambiguous_width)
12 |   end
13 | end
14 | 
15 | 


--------------------------------------------------------------------------------
/misc/terminal-emoji-width.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | RULER = "123456789\n"
 4 | ABC   = "abcdefg\n\n"
 5 | 
 6 | puts "1) TEXT-DEFAULT EMOJI"
 7 | puts
 8 | puts RULER + "⛹" + ABC
 9 | 
10 | puts "1B) TEXT-DEFAULT EMOJI + VS16"
11 | puts
12 | puts RULER + "⛹️" + ABC
13 | 
14 | puts "1C) BASE EMOJI CHARACTER + MODIFIER"
15 | puts
16 | puts RULER + "🏃🏽" + ABC
17 | 
18 | puts "1D) MODIFIER IN ISOLATION"
19 | puts
20 | puts RULER + "Z🏽" + ABC
21 | 
22 | puts "2) RGI EMOJI SEQ"
23 | puts
24 | puts RULER + "🏃🏼‍♀‍➡" + ABC
25 | 
26 | puts "2B) RGI EMOJI SEQ (TEXT-DEFAULT FIRST)"
27 | puts
28 | puts RULER + "⛹️‍♂️" + ABC
29 | 
30 | puts "2C) RGI EMOJI SEQ (TEXT-DEFAULT FIRST + UQE)"
31 | puts
32 | puts RULER + "⛹‍♂️" + ABC
33 | 
34 | puts "3) NON-RGI VALID EMOJI"
35 | puts
36 | puts RULER + "🤠‍🤢" + ABC
37 | 
38 | puts "4) NOT WELL-FORMED EMOJI SEQ"
39 | puts
40 | puts RULER + "🚄🏾‍🔆" + ABC
41 | 


--------------------------------------------------------------------------------
/lib/unicode/display_width/index.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require "zlib"
 4 | require_relative "constants"
 5 | 
 6 | module Unicode
 7 |   class DisplayWidth
 8 |     File.open(INDEX_FILENAME, "rb") do |file|
 9 |       serialized_data = Zlib::GzipReader.new(file).read
10 |       serialized_data.force_encoding Encoding::BINARY
11 |       INDEX = Marshal.load(serialized_data)
12 |     end
13 | 
14 |     def self.decompress_index(index, level)
15 |       index.flat_map{ |value|
16 |         if level > 0
17 |           if value.instance_of?(Array)
18 |             value[15] ||= nil
19 |             decompress_index(value, level - 1)
20 |           else
21 |             decompress_index([value] * 16, level - 1)
22 |           end
23 |         else
24 |           if value.instance_of?(Array)
25 |             value[15] ||= nil
26 |             value
27 |           else
28 |             [value] * 16
29 |           end
30 |         end
31 |       }
32 |     end
33 |   end
34 | end
35 | 


--------------------------------------------------------------------------------
/MIT-LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT LICENSE
 2 | 
 3 | Copyright (c) 2011, 2015-2024 Jan Lelis
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining
 6 | a copy of this software and associated documentation files (the
 7 | "Software"), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Test
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   test:
 7 |     name: Ruby ${{ matrix.ruby }} (${{ matrix.os }})
 8 |     if: "!contains(github.event.head_commit.message, '[skip ci]')"
 9 |     strategy:
10 |       matrix:
11 |         ruby:
12 |         - '3.4'
13 |         - '3.3'
14 |         - '3.2'
15 |         - '3.1'
16 |         - '3.0'
17 |         - '2.7'
18 |         - jruby
19 |         - truffleruby
20 |         os:
21 |         - ubuntu-latest
22 |         - macos-latest
23 |     runs-on: ${{matrix.os}}
24 |     steps:
25 |     - uses: actions/checkout@v4
26 |     - name: Set up Ruby
27 |       uses: ruby/setup-ruby@v1
28 |       with:
29 |         ruby-version: ${{matrix.ruby}}
30 |         bundler-cache: true
31 |     - name: Run tests
32 |       run: bundle exec rake
33 | 
34 |   test-windows:
35 |     name: Ruby ${{ matrix.ruby }} (windows-latest)
36 |     if: "!contains(github.event.head_commit.message, '[skip ci]')"
37 |     strategy:
38 |       matrix:
39 |         ruby:
40 |         - '3.4'
41 |         - '3.3'
42 |         - '3.2'
43 |         - '3.1'
44 |         - '3.0'
45 |         - '2.7'
46 |         - jruby
47 |     runs-on: windows-latest
48 |     steps:
49 |     - uses: actions/checkout@v4
50 |     - name: Set up Ruby
51 |       uses: ruby/setup-ruby@v1
52 |       with:
53 |         ruby-version: ${{matrix.ruby}}
54 |         bundler-cache: true
55 |     - name: Run tests
56 |       run: bundle exec rake
57 | 


--------------------------------------------------------------------------------
/unicode-display_width.gemspec:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | require File.dirname(__FILE__) + "/lib/unicode/display_width/constants"
 3 | 
 4 | Gem::Specification.new do |s|
 5 |   s.name        = "unicode-display_width"
 6 |   s.version     = Unicode::DisplayWidth::VERSION
 7 |   s.authors     = ["Jan Lelis"]
 8 |   s.email       = ["hi@ruby.consulting"]
 9 |   s.homepage    = "https://github.com/janlelis/unicode-display_width"
10 |   s.summary     = "Determines the monospace display width of a string in Ruby."
11 |   s.description =  "[Unicode #{Unicode::DisplayWidth::UNICODE_VERSION}] Determines the monospace display width of a string using EastAsianWidth.txt, Unicode general category, Emoji specification, and other data."
12 |   s.files = Dir.glob(%w[{lib,data}/**/*])
13 |   s.extra_rdoc_files = ["README.md", "MIT-LICENSE.txt", "CHANGELOG.md"]
14 |   s.license = 'MIT'
15 |   s.required_ruby_version = '>= 2.5.0'
16 |   s.add_dependency 'unicode-emoji', '~> 4.1'
17 |   s.add_development_dependency 'rspec', '~> 3.4'
18 |   s.add_development_dependency 'rake', '~> 13.0'
19 | 
20 |   if s.respond_to?(:metadata)
21 |     s.metadata['changelog_uri'] = "https://github.com/janlelis/unicode-display_width/blob/main/CHANGELOG.md"
22 |     s.metadata['source_code_uri'] = "https://github.com/janlelis/unicode-display_width"
23 |     s.metadata['bug_tracker_uri'] = "https://github.com/janlelis/unicode-display_width/issues"
24 |     s.metadata['rubygems_mfa_required'] = "true"
25 |   end
26 | end
27 | 


--------------------------------------------------------------------------------
/lib/unicode/display_width/emoji_support.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | module Unicode
 4 |   class DisplayWidth
 5 |     module EmojiSupport
 6 |       # Tries to find out which terminal emulator is used to
 7 |       # set emoji: config to best suiting value
 8 |       #
 9 |       # Please also see section in README.md and
10 |       # misc/terminal-emoji-width.rb
11 |       #
12 |       # Please note: Many terminals do not set any ENV vars,
13 |       # maybe CSI queries can help?
14 |       def self.recommended
15 |         @recommended ||= _recommended
16 |       end
17 | 
18 |       def self._recommended
19 |         if ENV["CI"]
20 |           return :rqi
21 |         end
22 | 
23 |         case ENV["TERM_PROGRAM"]
24 |         when "iTerm.app"
25 |           return :all
26 |         when "Apple_Terminal"
27 |           return :rgi_at
28 |         when "WezTerm"
29 |           return :all_no_vs16
30 |         end
31 | 
32 |         case ENV["TERM"]
33 |         when "contour","foot"
34 |           # konsole: all, how to detect?
35 |           return :all
36 |         when /kitty/
37 |           return :vs16
38 |         end
39 | 
40 |         if ENV["WT_SESSION"] # Windows Terminal
41 |           return :vs16
42 |         end
43 | 
44 |         # As of last time checked: gnome-terminal, vscode, alacritty
45 |         :none
46 |       end
47 | 
48 |       # Maybe: Implement something like https://github.com/jquast/ucs-detect
49 |       #        which uses the terminal cursor to check for best support level
50 |       #        at runtime
51 |       # def self.detect!
52 |       # end
53 |     end
54 |   end
55 | end
56 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | # # #
 2 | # Get gemspec info
 3 | 
 4 | gemspec_file = Dir['*.gemspec'].first
 5 | gemspec = eval File.read(gemspec_file), binding, gemspec_file
 6 | info = "#{gemspec.name} | #{gemspec.version} | " \
 7 |        "#{gemspec.runtime_dependencies.size} dependencies | " \
 8 |        "#{gemspec.files.size} files"
 9 | 
10 | 
11 | # # #
12 | # Gem build and install task
13 | 
14 | desc info
15 | task :gem do
16 |   puts info + "\n\n"
17 |   print "  "; sh "gem build #{gemspec_file}"
18 |   FileUtils.mkdir_p 'pkg'
19 |   FileUtils.mv "#{gemspec.name}-#{gemspec.version}.gem", 'pkg'
20 |   puts; sh %{gem install --no-document pkg/#{gemspec.name}-#{gemspec.version}.gem}
21 | end
22 | 
23 | 
24 | # # #
25 | # Start an IRB session with the gem loaded
26 | 
27 | desc "#{gemspec.name} | IRB"
28 | task :irb do
29 |   sh "irb -I ./lib -r #{gemspec.name.gsub '-','/'}"
30 | end
31 | 
32 | # # #
33 | # Run all specs
34 | 
35 | 
36 | desc "#{gemspec.name} | Test"
37 | task :test do
38 |   sh "rspec spec"
39 | end
40 | task :spec => :test
41 | task :default => :test
42 | 
43 | # # #
44 | # Update index table
45 | 
46 | namespace :update do
47 |   desc "#{gemspec.name} | Update index"
48 |   task :index do
49 |     require File.dirname(__FILE__) + '/lib/unicode/display_width/index_builder'
50 |     Unicode::DisplayWidth::IndexBuilder.build!
51 |   end
52 | end
53 | 
54 | # # #
55 | # Update data file
56 | 
57 | namespace :update do
58 |   desc "#{gemspec.name} | Update unicode data"
59 |   task :data do
60 |     require File.dirname(__FILE__) + '/lib/unicode/display_width/index_builder'
61 |     Unicode::DisplayWidth::IndexBuilder.fetch!
62 |   end
63 | end
64 | 
65 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, gender identity and expression, level of experience,
 9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at opensource@janlelis.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at [https://contributor-covenant.org/version/1/4][version]
72 | 
73 | [homepage]: https://contributor-covenant.org
74 | [version]: https://contributor-covenant.org/version/1/4/
75 | 


--------------------------------------------------------------------------------
/lib/unicode/display_width.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | require "unicode/emoji"
  4 | 
  5 | require_relative "display_width/constants"
  6 | require_relative "display_width/index"
  7 | require_relative "display_width/emoji_support"
  8 | 
  9 | module Unicode
 10 |   class DisplayWidth
 11 |     DEFAULT_AMBIGUOUS = 1
 12 |     INITIAL_DEPTH = 0x10000
 13 |     ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n-\x0F]/
 14 |     ASCII_NON_ZERO_STRING = "\0\x05\a\b\n-\x0F"
 15 |     ASCII_BACKSPACE = "\b"
 16 |     AMBIGUOUS_MAP = {
 17 |       1 => :WIDTH_ONE,
 18 |       2 => :WIDTH_TWO,
 19 |     }
 20 |     FIRST_AMBIGUOUS = {
 21 |       WIDTH_ONE: 768,
 22 |       WIDTH_TWO: 161,
 23 |     }
 24 |     NOT_COMMON_NARROW_REGEX = {
 25 |      WIDTH_ONE: /[^\u{10}-\u{2FF}]/m,
 26 |      WIDTH_TWO: /[^\u{10}-\u{A1}]/m,
 27 |     }
 28 |     FIRST_4096 = {
 29 |       WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1),
 30 |       WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1),
 31 |     }
 32 |     EMOJI_SEQUENCES_REGEX_MAPPING = {
 33 |       rgi: :REGEX_INCLUDE_MQE_UQE,
 34 |       rgi_at: :REGEX_INCLUDE_MQE_UQE,
 35 |       possible: :REGEX_WELL_FORMED,
 36 |     }
 37 |     REGEX_EMOJI_VS16 = Regexp.union(
 38 |       Regexp.compile(
 39 |         Unicode::Emoji::REGEX_TEXT_PRESENTATION.source +
 40 |         "(?<![#*0-9])" +
 41 |         "\u{FE0F}"
 42 |       ),
 43 |       Unicode::Emoji::REGEX_EMOJI_KEYCAP
 44 |     )
 45 | 
 46 |     # ebase = Unicode::Emoji::REGEX_PROP_MODIFIER_BASE.source
 47 |     REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+|.[\u{1F3FB}-\u{1F3FF}]/, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
 48 |     REGEX_EMOJI_ALL_SEQUENCES_AND_VS16 = Regexp.union(REGEX_EMOJI_ALL_SEQUENCES, REGEX_EMOJI_VS16)
 49 | 
 50 |     # Returns monospace display width of string
 51 |     def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
 52 |       # Binary strings don't make much sense when calculating display width.
 53 |       # Assume it's valid UTF-8
 54 |       if string.encoding == Encoding::BINARY && !string.force_encoding(Encoding::UTF_8).valid_encoding?
 55 |         # Didn't work out, go back to binary
 56 |         string.force_encoding(Encoding::BINARY)
 57 |       end
 58 | 
 59 |       string = string.encode(Encoding::UTF_8, invalid: :replace, undef: :replace) unless string.encoding == Encoding::UTF_8
 60 |       options = normalize_options(string, ambiguous, overwrite, old_options, **options)
 61 | 
 62 |       width = 0
 63 | 
 64 |       unless options[:overwrite].empty?
 65 |         width, string = width_custom(string, options[:overwrite])
 66 |       end
 67 | 
 68 |       if string.ascii_only?
 69 |         return width + width_ascii(string)
 70 |       end
 71 | 
 72 |       ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
 73 | 
 74 |       unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
 75 |         return width + string.size
 76 |       end
 77 | 
 78 |       # Retrieve Emoji width
 79 |       if options[:emoji] != :none
 80 |         e_width, string = emoji_width(
 81 |           string,
 82 |           options[:emoji],
 83 |           options[:ambiguous],
 84 |         )
 85 |         width += e_width
 86 | 
 87 |         unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
 88 |           return width + string.size
 89 |         end
 90 |       end
 91 | 
 92 |       index_full = INDEX[ambiguous_index_name]
 93 |       index_low = FIRST_4096[ambiguous_index_name]
 94 |       first_ambiguous = FIRST_AMBIGUOUS[ambiguous_index_name]
 95 | 
 96 |       string.each_codepoint{ |codepoint|
 97 |         if codepoint > 15 && codepoint < first_ambiguous
 98 |           width += 1
 99 |         elsif codepoint < 0x1001
100 |           width += index_low[codepoint] || 1
101 |         else
102 |           d = INITIAL_DEPTH
103 |           w = index_full[codepoint / d]
104 |           while w.instance_of? Array
105 |             w = w[(codepoint %= d) / (d /= 16)]
106 |           end
107 | 
108 |           width += w || 1
109 |         end
110 |       }
111 | 
112 |       # Return result + prevent negative lengths
113 |       width < 0 ? 0 : width
114 |     end
115 | 
116 |     # Returns width of custom overwrites and remaining string
117 |     def self.width_custom(string, overwrite)
118 |       width = 0
119 | 
120 |       string = string.each_codepoint.select{ |codepoint|
121 |         if overwrite[codepoint]
122 |           width += overwrite[codepoint]
123 |           nil
124 |         else
125 |           codepoint
126 |         end
127 |       }.pack("U*")
128 | 
129 |       [width, string]
130 |     end
131 | 
132 |     # Returns width for ASCII-only strings. Will consider zero-width control symbols.
133 |     def self.width_ascii(string)
134 |       if string.match?(ASCII_NON_ZERO_REGEX)
135 |         res = string.delete(ASCII_NON_ZERO_STRING).bytesize - string.count(ASCII_BACKSPACE)
136 |         return res < 0 ? 0 : res
137 |       end
138 | 
139 |       string.bytesize
140 |     end
141 | 
142 |     # Returns width of all considered Emoji and remaining string
143 |     def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
144 |       res = 0
145 | 
146 |       if emoji_set_regex = EMOJI_SEQUENCES_REGEX_MAPPING[mode]
147 |         emoji_width_via_possible(
148 |           string,
149 |           Unicode::Emoji.const_get(emoji_set_regex),
150 |           mode == :rgi_at,
151 |           ambiguous,
152 |         )
153 | 
154 |       elsif mode == :all_no_vs16
155 |         no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES){ res += 2; "" }
156 |         [res, no_emoji_string]
157 | 
158 |       elsif mode == :vs16
159 |         no_emoji_string = string.gsub(REGEX_EMOJI_VS16){ res += 2; "" }
160 |         [res, no_emoji_string]
161 | 
162 |       elsif mode == :all
163 |         no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ res += 2; "" }
164 |         [res, no_emoji_string]
165 | 
166 |       else
167 |         [0, string]
168 | 
169 |       end
170 |     end
171 | 
172 |     # Match possible Emoji first, then refine
173 |     def self.emoji_width_via_possible(string, emoji_set_regex, strict_eaw = false, ambiguous = DEFAULT_AMBIGUOUS)
174 |       res = 0
175 | 
176 |       # For each string possibly an emoji
177 |       no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ |emoji_candidate|
178 |         # Check if we have a combined Emoji with width 2 (or EAW an Apple Terminal)
179 |         if emoji_candidate == emoji_candidate[emoji_set_regex]
180 |           if strict_eaw
181 |             res += self.of(emoji_candidate[0], ambiguous, emoji: false)
182 |           else
183 |             res += 2
184 |           end
185 |           ""
186 | 
187 |         # We are dealing with a default text presentation emoji or a well-formed sequence not matching the above Emoji set
188 |         else
189 |           if !strict_eaw
190 |             # Ensure all explicit VS16 sequences have width 2
191 |             emoji_candidate.gsub!(REGEX_EMOJI_VS16){ res += 2; "" }
192 |           end
193 | 
194 |           emoji_candidate
195 |         end
196 |       }
197 | 
198 |       [res, no_emoji_string]
199 |     end
200 | 
201 |     def self.normalize_options(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
202 |       unless old_options.empty?
203 |         warn "Unicode::DisplayWidth: Please migrate to keyword arguments - #{old_options.inspect}"
204 |         options.merge! old_options
205 |       end
206 | 
207 |       options[:ambiguous] = ambiguous if ambiguous
208 |       options[:ambiguous] ||= DEFAULT_AMBIGUOUS
209 | 
210 |       if options[:ambiguous] != 1 && options[:ambiguous] != 2
211 |         raise ArgumentError, "Unicode::DisplayWidth: Ambiguous width must be 1 or 2"
212 |       end
213 | 
214 |       if overwrite && !overwrite.empty?
215 |         warn "Unicode::DisplayWidth: Please migrate to keyword arguments - overwrite: #{overwrite.inspect}"
216 |         options[:overwrite] = overwrite
217 |       end
218 |       options[:overwrite] ||= {}
219 | 
220 |       if [nil, true, :auto].include?(options[:emoji])
221 |         options[:emoji] = EmojiSupport.recommended
222 |       elsif options[:emoji] == false
223 |         options[:emoji] = :none
224 |       end
225 | 
226 |       options
227 |     end
228 | 
229 |     def initialize(ambiguous: DEFAULT_AMBIGUOUS, overwrite: {}, emoji: true)
230 |       @ambiguous = ambiguous
231 |       @overwrite = overwrite
232 |       @emoji     = emoji
233 |     end
234 | 
235 |     def get_config(**kwargs)
236 |       {
237 |         ambiguous: kwargs[:ambiguous] || @ambiguous,
238 |         overwrite: kwargs[:overwrite] || @overwrite,
239 |         emoji:     kwargs[:emoji]     || @emoji,
240 |       }
241 |     end
242 | 
243 |     def of(string, **kwargs)
244 |       self.class.of(string, **get_config(**kwargs))
245 |     end
246 |   end
247 | end
248 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # CHANGELOG
  2 | 
  3 | ## 3.2.0
  4 | 
  5 | - Unicode 17.0
  6 | 
  7 | ## 3.1.5
  8 | 
  9 | - Cache Emoji support level for performance reasons #30, patch by @Earlopain
 10 | 
 11 | ## 3.1.4
 12 | 
 13 | - Fix that skin tone modifiers were ignored when used in a non-ZWJ sequence
 14 |   context (= single emoji char + modifier) #29
 15 | - Add more docs and specs about modifier handling
 16 | 
 17 | ## 3.1.3
 18 | 
 19 | Better handling of non-UTF-8 strings, patch by @Earlopain:
 20 | 
 21 | - Data with *BINARY* encoding is interpreted as UTF-8, if possible
 22 | - Use `invalid: :replace` and `undef: :replace` options when converting to UTF-8
 23 | 
 24 | ## 3.1.2
 25 | 
 26 | - Performance improvements
 27 | 
 28 | ## 3.1.1
 29 | 
 30 | - Performance improvements
 31 | 
 32 | ## 3.1.0
 33 | 
 34 | **Improve Emoji support:**
 35 | 
 36 | - Emoji modes: Differentiate between well-formed Emoji (`:possible`) and any
 37 |   ZWJ/modifier sequence (`:all`). The latter is more common and more efficient
 38 |   to implement.
 39 | - Unify `:rgi_{fqe,mqe,uqe}` options to just `:rgi` to keep things simpler (corresponds to
 40 |   the former `:rgi_uqe` option). Most terminals that want to support the RGI set
 41 |   will probably want to catch Emoji sequences with missing VS16s.
 42 | - Add new `:all_no_vs16` and `:rgi_at` modes to be able to support some terminals
 43 |   that need these quirks
 44 | - Add alias `emoji: :auto` for `emoji: true` and `emoji: :none` for `emoji: false`
 45 | - `:auto` mode: Only consider terminal cells when recommending Emoji support level
 46 |   (Emoji themselves might display differently)
 47 | - `:auto` mode: Set default Emoji mode for unknown/unsupported terminals to `:none`
 48 | - Rename `:basic` mode to `:vs16`
 49 | 
 50 | ## 3.0.1
 51 | 
 52 | 
 53 | - Add WezTerm and foot as good Emoji terminals
 54 | 
 55 | ## 3.0.0
 56 | 
 57 | **Rework Emoji support:**
 58 | 
 59 | - Emoji widths are now enabled by default
 60 | - Only reduce Emoji width to 2 when RGI Emoji detected (configurable)
 61 | - VS16 turns Emoji characters of width 1 into full-width
 62 | - Please note that Emoji parsing has a notable impact on performance.
 63 |   You can use the `emoji: false` option to disable Emoji adjustments
 64 | - Tries to detect terminal's Emoji support level automatically (from ENV vars)
 65 | 
 66 | **Index fixes and updates:**
 67 | 
 68 | - Private-use characters are considered ambiguous (were given width 1 before)
 69 | - Fix that a few zero-width ignorable codepoints from recent Unicode were missing
 70 | - Consider the following separators to be zero-width:
 71 |   - U+2028 - LINE SEPARATOR - Zl
 72 |   - U+2029 - PARAGRAPH SEPARATOR - Zp
 73 | 
 74 | **Other:**
 75 | 
 76 | - Add keyword arguments to `Unicode::DisplayWidth.of`. If you are using a hash
 77 |   with overwrite values as third parameter, be sure to put it in curly braces.
 78 | - Using third parameter or explicit hash as fourth parameter is deprecated,
 79 |   please migrate to the keyword arguments API
 80 | - Gem raises `ArgumentError` for ambiguous values other than 1 or 2
 81 | - Performance optimizations
 82 | - Require Ruby 2.5
 83 | 
 84 | ## 2.6.0
 85 | 
 86 | - Unicode 16
 87 | 
 88 | ## 2.5.0
 89 | 
 90 | - Unicode 15.1
 91 | 
 92 | ## 2.4.2
 93 | 
 94 | More performance improvements:
 95 | 
 96 | - Optimize lookup of first 4096 codepoints
 97 | - Avoid overwrite lookup if no overwrites are set
 98 | 
 99 | ## 2.4.1
100 | 
101 | - Improve general performance!
102 | - Further improve performance for ASCII strings
103 | 
104 | *You should really upgrade - it's much faster now!*
105 | 
106 | ## 2.4.0
107 | - Improve performance for ASCII-only strings, by @fatkodima
108 | - Require Ruby 2.4
109 | 
110 | ## 2.3.0
111 | 
112 | - Unicode 15.0
113 | 
114 | ## 2.2.0
115 | 
116 | - Add *Hangul Jamo Extended-B* block to zero-width chars, thanks @ninjalj #22
117 | 
118 | ## 2.1.0
119 | 
120 | - Unicode 14.0
121 | 
122 | ## 2.0.0
123 | 
124 | Add Support for Ruby 3.0
125 | 
126 | ### Breaking Changes
127 | 
128 | Some features of this library were marked deprecated for a long time and have been removed with Version 2.0:
129 | 
130 | - Aliases of display\_width (…\_size, …\_length) have been removed
131 | - Auto-loading of string core extension has been removed:
132 | 
133 | If you are relying on the `String#display_width` string extension to be automatically loaded (old behavior), please load it explicitly now:
134 | 
135 | ```ruby
136 | require "unicode/display_width/string_ext"
137 | ```
138 | 
139 | You could also change your `Gemfile` line to achieve this:
140 | 
141 | ```ruby
142 | gem "unicode-display_width", require: "unicode/display_width/string_ext"
143 | ```
144 | 
145 | ## 2.0.0.pre2
146 | 
147 | - Update 2.0 branch to Unicode 13
148 | 
149 | ## 2.0.0.pre1
150 | 
151 | Will be published as non-pre version on rubygems.org when Ruby 3.0 is released (December 2020)
152 | 
153 | - Introduce new class-based API, which remembers your string-width configuration. See README for details.
154 | - Remove auto-loading of string extension
155 |   - You can: `require "unicode/display_width/string_ext"` to continue to use the string extension
156 |   - The manual opt-out `require "unicode/display_width/no_string_ext"` is not needed anymore and will
157 |     issue a warning in the future
158 | - Remove (already deprecated) String#display_size and String#display_length aliases
159 | 
160 | Refactorings / Internal Changes:
161 | 
162 | - Freeze string literals
163 | - Unicode::DisplayWidth is now a class instead of a module, which enables the new config-object API
164 | 
165 | ## 1.8.0
166 | 
167 | - Unicode 14.0 (last release of 1.x)
168 | 
169 | ## 1.7.0
170 | 
171 | - Unicode 13
172 | 
173 | ## 1.6.1
174 | 
175 | - Fix that ambiguous and overwrite options were ignored for emoji-measuring
176 | 
177 | ## 1.6.0
178 | 
179 | - Unicode 12.1
180 | 
181 | ## 1.5.0
182 | 
183 | - Unicode 12
184 | 
185 | ## 1.4.1
186 | 
187 | - Only bundle required lib/* and data/* files in actual rubygem, patch by @tas50
188 | 
189 | ## 1.4.0
190 | 
191 | - Unicode 11
192 | 
193 | ## 1.3.3
194 | 
195 | - Replace Gem::Util.gunzip with direct zlib implementation
196 |   This removes the dependency on rubygems, fixes #17
197 | 
198 | ## 1.3.2
199 | 
200 | - Explicitly load rubygems/util, fixes regression in 1.3.1 (autoload issue)
201 | 
202 | ## 1.3.1
203 | 
204 | - Use `Gem::Util` for `gunzip`, removes deprecation warning, patch by @Schwad
205 | 
206 | ## 1.3.0
207 | 
208 | - Unicode 10
209 | 
210 | ## 1.2.1
211 | 
212 | - Fix bug that `emoji: true` would fail for emoji without modifier
213 | 
214 | ## 1.2.0
215 | 
216 | - Add zero-width codepoint ranges: U+2060..U+206F, U+FFF0..U+FFF8, U+E0000..U+E0FFF
217 | - Add full-width codepoint ranges: U+3400..U+4DBF, U+4E00..U+9FFF, U+F900..U+FAFF, U+20000..U+2FFFD, U+30000..U+3FFFD
218 | - Experimental emoji support using the [unicode-emoji](https://github.com/janlelis/unicode-emoji) gem
219 | - Fix minor bug in index compression scheme
220 | 
221 | ## 1.1.3
222 | 
223 | - Fix that non-UTF-8 encodings do not throw errors, patch by @windwiny
224 | 
225 | ## 1.1.2
226 | 
227 | - Reduce memory consumption and increase performance, patch by @rrosenblum
228 | 
229 | ## 1.1.1
230 | 
231 | - Always load index into memory, fixes #9
232 | 
233 | ## 1.1.0
234 | 
235 | - Support Unicode 9.0
236 | 
237 | ## 1.0.5
238 | 
239 | - Actually include new index from 1.0.4
240 | 
241 | ## 1.0.4
242 | 
243 | - New index format (much smaller) and internal API changes
244 | - Move index generation to a builder plugin for the unicoder gem
245 | - No public API changes
246 | 
247 | ## 1.0.3
248 | 
249 | - Avoid circular dependency warning
250 | 
251 | ## 1.0.2
252 | 
253 | - Fix error that gemspec might be invalid under some circumstances (see gh#6)
254 | 
255 | ## 1.0.1
256 | 
257 | - Unofficially allow Ruby 1.9
258 | 
259 | ## 1.0.0
260 | 
261 | - Faster than 0.3.1
262 | - Advanced determination of character width
263 | - This includes: Treat width of most chars of general categories (Mn, Me, Cf) as 0
264 | - This includes: Introduce list of characters with special widths
265 | - Allow custom overrides for specific codepoints
266 | - Set required Ruby version to 2.0
267 | - Add NO_STRING_EXT mode to disable monkey patching
268 | - Internal API & index format changed drastically
269 | - Remove require 'unicode/display_size' (use 'unicode/display_width' instead)
270 | 
271 | ## 0.3.1
272 | 
273 | - Faster than 0.3.0
274 | - Deprecate usage of aliases: String#display_size and String#display_length
275 | - Eliminate Ruby warnings (@amatsuda)
276 | 
277 | ## 0.3.0
278 | 
279 | - Update EastAsianWidth from 7.0 to 8.0
280 | - Add rake task to update EastAsianWidth.txt
281 | - Move code to generate index from library to Rakefile
282 | - Update project's meta files
283 | - Deprecate requiring 'unicode-display_size'
284 | 
285 | ## 0.2.0
286 | 
287 | - Update EastAsianWidth from 6.0 to 7.0
288 | - Don't build index table automatically when not available
289 | - Don't include EastAsianWidth.txt in gem (only index)
290 | 
291 | 
292 | ## 0.1.0
293 | 
294 | - Fix github issue #1
295 | 
296 | 
297 | ## 0.1.0
298 | 
299 | - Initial release
300 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Unicode::DisplayWidth [![[version]](https://badge.fury.io/rb/unicode-display_width.svg)](https://badge.fury.io/rb/unicode-display_width) [<img src="https://github.com/janlelis/unicode-display_width/workflows/Test/badge.svg" />](https://github.com/janlelis/unicode-display_width/actions?query=workflow%3ATest)
  2 | 
  3 | Determines the monospace display width of a string in Ruby, which is useful for all kinds of terminal-based applications. The implementation is based on [EastAsianWidth.txt](https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt), the [Emoji specification](https://www.unicode.org/reports/tr51/) and other data, 100% in Ruby. It does not rely on the OS vendor ([wcwidth](https://github.com/janlelis/wcswidth-ruby)) to provide an up-to-date method for measuring string width in terminals.
  4 | 
  5 | Unicode version: **17.0.0** (September 2025)
  6 | 
  7 | ## Gem Version 3 — Improved Emoji Support
  8 | 
  9 | **Emoji support is now enabled by default.** See below for description and configuration possibilities.
 10 | 
 11 | **Unicode::DisplayWidth.of now takes keyword arguments:** { ambiguous:, emoji:, overwrite: }
 12 | 
 13 | See [CHANGELOG](/CHANGELOG.md) for details.
 14 | 
 15 | ## Gem Version 2.4.2 — Performance Updates
 16 | 
 17 | **If you use this gem, you should really upgrade to 2.4.2 or newer. It's often 100x faster, sometimes even 1000x and more!**
 18 | 
 19 | This is possible because the gem now detects if you use very basic (and common) characters, like ASCII characters. Furthermore, the character width lookup code has been optimized, so even when the string involves full-width or ambiguous characters, the gem is much faster now.
 20 | 
 21 | ## Introduction to Character Widths
 22 | 
 23 | Guessing the correct space a character will consume on terminals is not easy. There is no single standard. Most implementations combine data from [East Asian Width](https://www.unicode.org/reports/tr11/), some [General Categories](https://en.wikipedia.org/wiki/Unicode_character_property#General_Category), and hand-picked adjustments.
 24 | 
 25 | ### How this Library Handles Widths
 26 | 
 27 | Further at the top means higher precedence. Please expect changes to this algorithm with every MINOR version update (the X in 1.X.0)!
 28 | 
 29 | Width  | Characters                   | Comment
 30 | -------|------------------------------|--------------------------------------------------
 31 | ?      | (user defined)               | Overwrites any other values
 32 | ?      | Emoji                        | See "How this Library Handles Emoji Width" below
 33 | -1     | `"\b"`                       | Backspace (total width never below 0)
 34 | 0      | `"\0"`, `"\x05"`, `"\a"`, `"\n"`, `"\v"`, `"\f"`, `"\r"`, `"\x0E"`, `"\x0F"` | [C0 control codes](https://en.wikipedia.org/wiki/C0_and_C1_control_codes#C0_.28ASCII_and_derivatives.29) which do not change horizontal width
 35 | 1      | `"\u{00AD}"`                 | SOFT HYPHEN
 36 | 2      | `"\u{2E3A}"`                 | TWO-EM DASH
 37 | 3      | `"\u{2E3B}"`                 | THREE-EM DASH
 38 | 0      | General Categories: Mn, Me, Zl, Zp, Cf (non-arabic)| Excludes ARABIC format characters
 39 | 0      | Derived Property: Default_Ignorable_Code_Point     | Ignorable ranges
 40 | 0      | `"\u{1160}".."\u{11FF}"`, `"\u{D7B0}".."\u{D7FF}"` | HANGUL JUNGSEONG
 41 | 2      | East Asian Width: F, W       | Full-width characters
 42 | 2      | `"\u{3400}".."\u{4DBF}"`, `"\u{4E00}".."\u{9FFF}"`, `"\u{F900}".."\u{FAFF}"`, `"\u{20000}".."\u{2FFFD}"`, `"\u{30000}".."\u{3FFFD}"` | Full-width ranges
 43 | 1 or 2 | East Asian Width: A          | Ambiguous characters, user defined, default: 1
 44 | 1      | All other codepoints         | -
 45 | 
 46 | ## Install
 47 | 
 48 | Install the gem with:
 49 | 
 50 |     $ gem install unicode-display_width
 51 | 
 52 | Or add to your Gemfile:
 53 | 
 54 |     gem 'unicode-display_width'
 55 | 
 56 | ## Usage
 57 | 
 58 | ```ruby
 59 | require 'unicode/display_width'
 60 | 
 61 | Unicode::DisplayWidth.of("⚀") # => 1
 62 | Unicode::DisplayWidth.of("一") # => 2
 63 | ```
 64 | 
 65 | ### Ambiguous Characters
 66 | 
 67 | The second parameter defines the value returned by characters defined as ambiguous:
 68 | 
 69 | ```ruby
 70 | Unicode::DisplayWidth.of("·", 1) # => 1
 71 | Unicode::DisplayWidth.of("·", 2) # => 2
 72 | ```
 73 | 
 74 | ### Encoding Notes
 75 | 
 76 | - Data with *BINARY* encoding is interpreted as UTF-8, if possible
 77 | - Non-UTF-8 strings are converted to UTF-8 before measuring, using the [`{invalid: :replace, undef: :replace}` options](https://ruby-doc.org/3.3.5/encodings_rdoc.html#label-Encoding+Options)
 78 | 
 79 | ### Custom Overwrites
 80 | 
 81 | You can overwrite how to handle specific code points by passing a hash (or even a proc) as `overwrite:` parameter:
 82 | 
 83 | ```ruby
 84 | Unicode::DisplayWidth.of("a\tb", 1, overwrite: { "\t".ord => 10 }) # => TAB counted as 10, result is 12
 85 | ```
 86 | 
 87 | Please note that using overwrites disables some performance optimizations of this gem.
 88 | 
 89 | ### Emoji
 90 | 
 91 | If your terminal supports it, the gem detects Emoji and Emoji sequences and adjusts the width of the measured string. This can be disabled by passing `emoji: false` as an argument:
 92 | 
 93 | ```ruby
 94 | Unicode::DisplayWidth.of "🤾🏽‍♀️", emoji: :all # => 2
 95 | Unicode::DisplayWidth.of "🤾🏽‍♀️", emoji: false # => 5
 96 | ```
 97 | 
 98 | #### How this Library Handles Emoji Width
 99 | 
100 | There are many Emoji which get constructed by combining other Emoji in a sequence. This makes measuring the width complicated, since terminals might either display the combined Emoji or the separate parts of the Emoji individually.
101 | 
102 | Another aspect where terminals disagree is whether Emoji characters which have a text presentation by default (width 1) should be turned into full-width (width 2) when combined with Variation Selector 16 (*U+FE0F*).
103 | 
104 | Finally, it varies if Skin Tone Modifiers can be applied to all characters or just to those with the "Emoji Base" property.
105 | 
106 | Emoji Type  | Width / Comment
107 | ------------|----------------
108 | Basic/Single Emoji character without Variation Selector   | No special handling
109 | Basic/Single Emoji character with VS15 (Text)             | No special handling
110 | Basic/Single Emoji character with VS16 (Emoji)            | 2 or East Asian Width (see table below)
111 | Single Emoji character with Skin Tone Modifier            | 2 unless Emoji mode is `:none` or `:vs16`
112 | Skin Tone Modifier used in isolation or with invalid base | 2 if Emoji mode is `:rgi` / `:rgi_at`
113 | Emoji Sequence                                            | 2 if Emoji belongs to configured Emoji set (see table below)
114 | 
115 | #### Emoji Modes
116 | 
117 | The `emoji:` option can be used to configure which type of Emoji should be considered to have a width of 2 and if VS16-Emoji should be widened. Other sequences are treated as non-combined Emoji, so the widths of all partial Emoji add up (e.g. width of one basic Emoji + one skin tone modifier + another basic Emoji). The following Emoji settings can be used:
118 | 
119 | `emoji:` Option | VS16-Emoji Width | Emoji Sequences Width / Comment | Example Terminals
120 | ----------------|------------------|---------------------------------|------------------
121 | `true` or `:auto`  | - | Automatically use recommended Emoji setting for your terminal | -
122 | `:all`     | 2                | 2 for all ZWJ/modifier/keycap sequences, even if they are not well-formed Emoji sequences | iTerm, foot
123 | `:all_no_vs16` | EAW (1 or 2) | 2 for all ZWJ/modifier/keycap sequences, even if they are not well-formed Emoji sequences | WezTerm
124 | `:possible`| 2                | 2 for all possible/well-formed Emoji sequences | ?
125 | `:rgi`     | 2                | 2 for all [RGI Emoji](https://www.unicode.org/reports/tr51/#def_rgi_set) sequences | ?
126 | `:rgi_at`  | EAW (1 or 2)     | 1 or 2: Like `:rgi`, but Emoji sequences starting with a default-text Emoji have EAW | Apple Terminal
127 | `:vs16`    | 2                | 2 * number of partial Emoji (sequences never considered to represent a combined Emoji) | kitty?
128 | `false` or  `:none` | EAW (1 or 2) | No Emoji adjustments | gnome-terminal, many older terminals
129 | 
130 | - *EAW:* East Asian Width
131 | - *RGI Emoji:* Emoji Recommended for General Interchange
132 | - *ZWJ:* Zero-width Joiner: Codepoint `U+200D`, used in many Emoji sequences
133 | 
134 | #### Emoji Support in Terminals
135 | 
136 | Unfortunately, the level of Emoji support varies a lot between terminals. While some of them are able to display (almost) all Emoji sequences correctly, others fall back to displaying sequences of basic Emoji. When `emoji: true` or `emoji: :auto` is used, the gem will attempt to set the best fitting Emoji setting for you (e.g. `:rgi_at` on "Apple_Terminal" or `false` on Gnome's terminal widget).
137 | 
138 | Please note that Emoji display and number of terminal columns used might differ a lot. For example, it might be the case that a terminal does not understand which Emoji to display, but still manages to calculate the proper amount of terminal cells. The automatic Emoji support level per terminal only considers the latter (cursor position), not the actual Emoji image(s) displayed. Please [open an issue](https://github.com/janlelis/unicode-display_width/issues/new) if you notice your terminal application could use a better default value. Also see the [ucs-detect project](https://ucs-detect.readthedocs.io/results.html), which is a great resource that compares various terminals' Unicode/Emoji capabilities. You can visually check how your terminal renders different kinds of Emoji with the [terminal-emoji-width.rb script](https://github.com/janlelis/unicode-display_width/blob/main/misc/terminal-emoji-width.rb).
139 | 
140 | **To terminal implementors reading this:** Although the practice of giving all Emoji/ZWJ sequences a width of 2 (`:all` mode described above) has some advantages, it does not lead to a particularly good developer experience. Since there is always the possibility of well-formed Emoji that are currently not supported (non-RGI / future Unicode) appearing, those sequences will take more cells. Instead of overflowing, cutting off sequences or displaying placeholder-Emoji, could it be worthwhile to implement the `:rgi` option (only known Emoji get width 2) and give those unknown Emoji the space they need? This would support the idea that the meaning of an unknown Emoji sequence can still be conveyed (without messing up the terminal at the same time). Just a thought…
141 | 
142 | ### Usage with String Extension
143 | 
144 | ```ruby
145 | require 'unicode/display_width/string_ext'
146 | 
147 | "⚀".display_width # => 1
148 | '一'.display_width # => 2
149 | ```
150 | 
151 | ### Usage with Config Object
152 | 
153 | You can use a config object that allows you to save your configuration for later reuse. This requires an extra line of code, but has the advantage that you'll need to define your string-width options only once:
154 | 
155 | ```ruby
156 | require 'unicode/display_width'
157 | 
158 | display_width = Unicode::DisplayWidth.new(
159 |   # ambiguous: 1,
160 |   overwrite: { "A".ord => 100 },
161 |   emoji: :all,
162 | )
163 | 
164 | display_width.of "⚀" # => 1
165 | display_width.of "🤠‍🤢" # => 2
166 | display_width.of "A" # => 100
167 | ```
168 | 
169 | ### Usage from the Command-Line
170 | 
171 | Use this one-liner to print out display widths for strings from the command-line:
172 | 
173 | ```
174 | $ gem install unicode-display_width
175 | $ ruby -r unicode/display_width -e 'puts Unicode::DisplayWidth.of $*[0]' -- "一"
176 | ```
177 | Replace "一" with the actual string to measure
178 | 
179 | ## Other Implementations & Discussion
180 | 
181 | - Python: https://github.com/jquast/wcwidth
182 | - JavaScript: https://github.com/mycoboco/wcwidth.js
183 | - C: https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
184 | - C for Julia: https://github.com/JuliaLang/utf8proc/issues/2
185 | - Golang: https://github.com/rivo/uniseg
186 | 
187 | See [unicode-x](https://github.com/janlelis/unicode-x) for more Unicode related micro libraries.
188 | 
189 | ## Copyright & Info
190 | 
191 | - Copyright (c) 2011, 2015-2025 Jan Lelis, https://janlelis.com, released under the MIT
192 | license
193 | - Early versions based on runpaint's unicode-data interface: Copyright (c) 2009 Run Paint Run Run
194 | - Unicode data: https://www.unicode.org/copyright.html#Exhibit1
195 | 


--------------------------------------------------------------------------------
/spec/display_width_spec.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | require_relative '../lib/unicode/display_width/string_ext'
  4 | 
  5 | describe 'Unicode::DisplayWidth.of' do
  6 |   describe '[east asian width]' do
  7 |     it 'returns 2 for F' do
  8 |       expect( '！'.display_width ).to eq 2
  9 |     end
 10 | 
 11 |     it 'returns 2 for W' do
 12 |       expect( '一'.display_width ).to eq 2
 13 |     end
 14 | 
 15 |     it 'returns 2 for W (which are currently unassigned)' do
 16 |       expect( "\u{3FFFD}".display_width ).to eq 2
 17 |     end
 18 | 
 19 |     it 'returns 1 for N' do
 20 |       expect( 'À'.display_width ).to eq 1
 21 |     end
 22 | 
 23 |     it 'returns 1 for Na' do
 24 |       expect( 'A'.display_width ).to eq 1
 25 |     end
 26 | 
 27 |     it 'returns 1 for H' do
 28 |       expect( '｡'.display_width ).to eq 1
 29 |     end
 30 | 
 31 |     it 'returns first argument of display_width for A' do
 32 |       expect( '·'.display_width(1) ).to eq 1
 33 |     end
 34 | 
 35 |     it 'returns first argument of display_width for A' do
 36 |       expect( '·'.display_width(2) ).to eq 2
 37 |     end
 38 | 
 39 |     it 'returns 1 for A if no argument given' do
 40 |       expect( '·'.display_width ).to eq 1
 41 |     end
 42 |   end
 43 | 
 44 |   describe '[zero width]' do
 45 |     it 'returns 0 for Mn chars' do
 46 |       expect( 'ֿ'.display_width ).to eq 0
 47 |     end
 48 | 
 49 |     it 'returns 0 for Me chars' do
 50 |       expect( '҈'.display_width ).to eq 0
 51 |     end
 52 | 
 53 |     it 'returns 0 for Cf chars' do
 54 |       expect( '​'.display_width ).to eq 0
 55 |     end
 56 | 
 57 |     it 'returns 0 for HANGUL JUNGSEONG chars' do
 58 |       expect( 'ᅠ'.display_width ).to eq 0
 59 |       expect( 'ힰ'.display_width ).to eq 0
 60 |     end
 61 | 
 62 |     it 'returns 0 for U+2060..U+206F' do
 63 |       expect( "\u{2060}".display_width ).to eq 0
 64 |     end
 65 | 
 66 |     it 'returns 0 for U+FFF0..U+FFF8' do
 67 |       expect( "\u{FFF0}".display_width ).to eq 0
 68 |     end
 69 | 
 70 |     it 'returns 0 for U+E0000..U+E0FFF' do
 71 |       expect( "\u{E0000}".display_width ).to eq 0
 72 |     end
 73 |   end
 74 | 
 75 |   describe '[special characters]' do
 76 |     it 'returns 0 for ␀' do
 77 |       expect( "\0".display_width ).to eq 0
 78 |     end
 79 | 
 80 |     it 'returns 0 for ␅' do
 81 |       expect( "\x05".display_width ).to eq 0
 82 |     end
 83 | 
 84 |     it 'returns 0 for ␇' do
 85 |       expect( "\a".display_width ).to eq 0
 86 |     end
 87 | 
 88 |     it 'returns -1 for ␈' do
 89 |       expect( "aaaa\b".display_width ).to eq 3
 90 |     end
 91 | 
 92 |     it 'returns -1 for ␈, but at least 0' do
 93 |       expect( "\b".display_width ).to eq 0
 94 |     end
 95 | 
 96 |     it 'returns 0 for ␊' do
 97 |       expect( "\n".display_width ).to eq 0
 98 |     end
 99 | 
100 |     it 'returns 0 for ␋' do
101 |       expect( "\v".display_width ).to eq 0
102 |     end
103 | 
104 |     it 'returns 0 for ␌' do
105 |       expect( "\f".display_width ).to eq 0
106 |     end
107 | 
108 |     it 'returns 0 for ␍' do
109 |       expect( "\r".display_width ).to eq 0
110 |     end
111 | 
112 |     it 'returns 0 for ␎' do
113 |       expect( "\x0E".display_width ).to eq 0
114 |     end
115 | 
116 |     it 'returns 0 for ␏' do
117 |       expect( "\x0F".display_width ).to eq 0
118 |     end
119 | 
120 |     it 'returns 1 for other C0 characters' do
121 |       expect( "\x01".display_width ).to eq 1
122 |       expect( "\x02".display_width ).to eq 1
123 |       expect( "\x03".display_width ).to eq 1
124 |       expect( "\x04".display_width ).to eq 1
125 |       expect( "\x06".display_width ).to eq 1
126 |       expect( "\x10".display_width ).to eq 1
127 |       expect( "\x11".display_width ).to eq 1
128 |       expect( "\x12".display_width ).to eq 1
129 |       expect( "\x13".display_width ).to eq 1
130 |       expect( "\x14".display_width ).to eq 1
131 |       expect( "\x15".display_width ).to eq 1
132 |       expect( "\x16".display_width ).to eq 1
133 |       expect( "\x17".display_width ).to eq 1
134 |       expect( "\x18".display_width ).to eq 1
135 |       expect( "\x19".display_width ).to eq 1
136 |       expect( "\x1a".display_width ).to eq 1
137 |       expect( "\x1b".display_width ).to eq 1
138 |       expect( "\x1c".display_width ).to eq 1
139 |       expect( "\x1d".display_width ).to eq 1
140 |       expect( "\x1e".display_width ).to eq 1
141 |       expect( "\x1f".display_width ).to eq 1
142 |       expect( "\x7f".display_width ).to eq 1
143 |     end
144 | 
145 |     it 'returns 0 for LINE SEPARATOR' do
146 |       expect( "\u{2028}".display_width ).to eq 0
147 |     end
148 | 
149 |     it 'returns 0 for PARAGRAPH SEPARATOR' do
150 |       expect( "\u{2029}".display_width ).to eq 0
151 |     end
152 | 
153 |     it 'returns 1 for SOFT HYPHEN' do
154 |       expect( "­".display_width ).to eq 1
155 |     end
156 | 
157 |     it 'returns 2 for THREE-EM DASH' do
158 |       expect( "⸺".display_width ).to eq 2
159 |     end
160 | 
161 |     it 'returns 3 for THREE-EM DASH' do
162 |       expect( "⸻".display_width ).to eq 3
163 |     end
164 | 
165 |     it 'returns ambiguous for private-use' do
166 |       expect( "󰀀".display_width(1) ).to eq 1
167 |       expect( "󰀀".display_width(2) ).to eq 2
168 |     end
169 |   end
170 | 
171 | 
172 |   describe '[overwrite]' do
173 |     it 'can be passed a 3rd parameter with overwrites (old format)' do
174 |       expect( "\t".display_width(1, { 0x09 => 12 }) ).to eq 12
175 |     end
176 | 
177 |     it 'can be passed as :overwrite option' do
178 |       expect( "\t".display_width(overwrite: { 0x09 => 12 }) ).to eq 12
179 |     end
180 |   end
181 | 
182 |   describe '[encoding]' do
183 |     it 'works with non-utf8 Unicode encodings' do
184 |       expect( 'À'.encode("UTF-16LE").display_width ).to eq 1
185 |     end
186 | 
187 |     it 'works with a string that is invalid in its encoding' do
188 |       s = "\x81\x39".dup.force_encoding(Encoding::SHIFT_JIS)
189 | 
190 |       # Would print as �9 on the terminal
191 |       expect( s.display_width ).to eq 2
192 |     end
193 | 
194 |     it 'works with a binary encoded string that is valid in UTF-8' do
195 |       expect( '€'.b.display_width ).to eq 1
196 |     end
197 |   end
198 | 
199 |   describe '[emoji]' do
200 |     describe '(basic emoji / text emoji)' do
201 |       it 'counts default-text presentation Emoji according to EAW (example: 1)' do
202 |         expect( "❣".display_width(emoji: :all) ).to eq 1
203 |       end
204 | 
205 |       it 'counts default-text presentation Emoji according to EAW (example: ambiguous)' do
206 |         expect( "♀".display_width(1, emoji: :all) ).to eq 1
207 |         expect( "♀".display_width(2, emoji: :all) ).to eq 2
208 |       end
209 |      
210 |       it 'counts default-text presentation Emoji with Emoji Presentation (VS16) as 2' do
211 |         expect( "❣️".display_width(emoji: :all) ).to eq 2
212 |       end
213 | 
214 |       it 'counts default-text presentation Emoji with Emoji Presentation (VS16) as 2 (in a sequence)' do
215 |         expect( "❣️‍❣️".display_width(emoji: :rgi) ).to eq 4
216 |       end
217 | 
218 |       it 'counts default-emoji presentation Emoji according to EAW (always 2)' do
219 |         expect( "💚".display_width(emoji: :all) ).to eq 2
220 |       end
221 |     end
222 | 
223 |     describe '(special emoji / emoji sequences)' do
224 |       it 'works with flags: width 2' do
225 |         expect( "🇵🇹".display_width(emoji: :all) ).to eq 2
226 |       end
227 | 
228 |       it 'works with subdivision flags: width 2' do
229 |         expect( "🏴󠁧󠁢󠁥󠁮󠁧󠁿".display_width(emoji: :all) ).to eq 2
230 |       end
231 | 
232 |       it 'works with keycaps: width 2' do
233 |         expect( "1️⃣".display_width(emoji: :all) ).to eq 2
234 |       end
235 |     end
236 | 
237 |     describe '(modifiers and zwj sequences)' do
238 |       it 'applies simple skin tone modifiers' do
239 |         expect( "👏🏽".display_width(emoji: :rgi) ).to eq 2
240 |       end
241 | 
242 |       it 'counts RGI Emoji ZWJ sequence as width 2' do
243 |         expect( "🤾🏽‍♀️".display_width(emoji: :rgi) ).to eq 2
244 |       end
245 | 
246 |       it 'works for emoji involving characters which are east asian ambiguous' do
247 |         expect( "🤾🏽‍♀️".display_width(2, emoji: :rgi) ).to eq 2
248 |       end
249 |     end
250 | 
251 |     describe '(modes)' do
252 |       describe 'false / :none' do
253 |         it 'does no Emoji adjustments when emoji suport is disabled' do
254 |           expect( "🤾🏽‍♀️".display_width(emoji: false) ).to eq 5
255 |           expect( "❣️".display_width(emoji: :none) ).to eq 1
256 |           expect( "👏🏽".display_width(emoji: :none) ).to eq 4
257 |         end
258 |       end
259 | 
260 |       describe ':vs16' do
261 |         it 'will ignore shorter width of all Emoji sequences' do
262 |           # Please note that this is different from emoji: false / emoji: :none
263 |           # -> Basic Emoji with VS16 still get normalized
264 |           expect( "🤾🏽‍♀️".display_width(emoji: :vs16) ).to eq 6
265 |         end
266 | 
267 |         it 'counts default-text presentation Emoji with Emoji Presentation (VS16) as 2' do
268 |           expect( "❣️".display_width(emoji: :vs16) ).to eq 2
269 |         end
270 | 
271 |         it 'works with keycaps: width 2' do
272 |           expect( "1️⃣".display_width(emoji: :vs16) ).to eq 2
273 |         end
274 |       end
275 | 
276 |       describe ':rgi' do
277 |         it 'will ignore shorter width of non-RGI sequences' do
278 |           expect( "🤾🏽‍♀️".display_width(emoji: :rgi) ).to eq 2 # FQE
279 |           expect( "🤾🏽‍♀".display_width(emoji: :rgi) ).to eq 2 # MQE
280 |           expect( "❤‍🩹".display_width(emoji: :rgi) ).to eq 2 # UQE
281 |           expect( "👏🏽".display_width(emoji: :rgi) ).to eq 2 # Modifier
282 |           expect( "J🏽".display_width(emoji: :rgi) ).to eq 3 # Modifier with invalid base
283 |           expect( "🤠‍🤢".display_width(emoji: :rgi) ).to eq 4 # Non-RGI/well-formed
284 |           expect( "🚄🏾‍▶️".display_width(emoji: :rgi) ).to eq 6 # Invalid/non-Emoji sequence
285 |         end
286 | 
287 |         it 'counts default-text presentation Emoji with Emoji Presentation (VS16) as 2' do
288 |           expect( "❣️".display_width(emoji: :rgi) ).to eq 2
289 |         end
290 |       end
291 | 
292 |       describe ':rgi_at' do
293 |         it 'will assign width based on EAW of first partial Emoji to whole sequence' do
294 |           expect( "🤾🏽‍♀️".display_width(emoji: :rgi_at) ).to eq 2
295 |           expect( "⛹️‍♀️".display_width(emoji: :rgi_at) ).to eq 1
296 |           expect( "❤‍🩹".display_width(emoji: :rgi_at) ).to eq 1
297 |         end
298 | 
299 |         it 'will count partial emoji for non-RGI sequences' do
300 |           expect( "🤠‍🤢".display_width(emoji: :rgi_at) ).to eq 4 # Non-RGI/well-formed
301 |           expect( "🚄🏾‍▶️".display_width(emoji: :rgi_at) ).to eq 5 # Invalid/non-Emoji sequence
302 |         end
303 | 
304 |         it 'uses EAW for default-text presentation Emoji with Emoji Presentation (VS16)' do
305 |           expect( "❣️".display_width(emoji: :rgi_at) ).to eq 1
306 |         end
307 |       end
308 | 
309 |       describe ':possible' do
310 |         it 'will treat possible/well-formed Emoji sequence as width 2' do
311 |           expect( "🤾🏽‍♀️".display_width(emoji: :possible) ).to eq 2 # FQE
312 |           expect( "🤾🏽‍♀".display_width(emoji: :possible) ).to eq 2 # MQE
313 |           expect( "❤‍🩹".display_width(emoji: :possible) ).to eq 2 # UQE
314 |           expect( "👏🏽".display_width(emoji: :possible) ).to eq 2 # Modifier
315 |           expect( "J🏽".display_width(emoji: :possible) ).to eq 3 # Modifier with invalid base
316 |           expect( "🤠‍🤢".display_width(emoji: :possible) ).to eq 2 # Non-RGI/well-formed
317 |           expect( "🚄🏾‍▶️".display_width(emoji: :possible) ).to eq 6 # Invalid/non-Emoji sequence
318 |         end
319 | 
320 |         it 'counts default-text presentation Emoji with Emoji Presentation (VS16) as 2' do
321 |           expect( "❣️".display_width(emoji: :possible) ).to eq 2
322 |         end
323 |       end
324 | 
325 |       describe ':all' do
326 |         it 'will treat any ZWJ/modifier/keycap sequences sequence as width 2' do
327 |           expect( "🤾🏽‍♀️".display_width(emoji: :all) ).to eq 2 # FQE
328 |           expect( "🤾🏽‍♀".display_width(emoji: :all) ).to eq 2 # MQE
329 |           expect( "❤‍🩹".display_width(emoji: :all) ).to eq 2 # UQE
330 |           expect( "👏🏽".display_width(emoji: :all) ).to eq 2 # Modifier
331 |           expect( "👏🏽".display_width(emoji: :all) ).to eq 2 # Modifier
332 |           expect( "J🏽".display_width(emoji: :all) ).to eq 2 # Modifier with invalid base
333 |           expect( "🤠‍🤢".display_width(emoji: :all) ).to eq 2 # Non-RGI/well-formed
334 |           expect( "🚄🏾‍▶️".display_width(emoji: :all) ).to eq 2 # Invalid/non-Emoji sequence
335 |         end
336 | 
337 |         it 'counts default-text presentation Emoji with Emoji Presentation (VS16) as 2' do
338 |           expect( "❣️".display_width(emoji: :all) ).to eq 2
339 |         end
340 |       end
341 | 
342 |       describe ':all_no_vs16' do
343 |         it 'will treat any ZWJ/modifier/keycap sequences sequence as width 2' do
344 |           expect( "🤾🏽‍♀️".display_width(emoji: :all_no_vs16) ).to eq 2 # FQE
345 |           expect( "🤾🏽‍♀".display_width(emoji: :all_no_vs16) ).to eq 2 # MQE
346 |           expect( "❤‍🩹".display_width(emoji: :all_no_vs16) ).to eq 2 # UQE
347 |           expect( "👏🏽".display_width(emoji: :all_no_vs16) ).to eq 2 # Modifier
348 |           expect( "J🏽".display_width(emoji: :all_no_vs16) ).to eq 2 # Modifier with wrong base
349 |           expect( "🤠‍🤢".display_width(emoji: :all_no_vs16) ).to eq 2 # Non-RGI/well-formed
350 |           expect( "🚄🏾‍▶️".display_width(emoji: :all_no_vs16) ).to eq 2 # Invalid/non-Emoji sequence
351 |         end
352 | 
353 |         it 'uses EAW for default-text presentation Emoji with Emoji Presentation (VS16)' do
354 |           expect( "❣️".display_width(emoji: :all_no_vs16) ).to eq 1
355 |         end
356 |       end
357 |     end
358 |   end
359 | end
360 | 
361 | describe "Config object based API" do
362 |   let :display_width do
363 |     Unicode::DisplayWidth.new(
364 |       # ambiguous: 1,
365 |       overwrite: { "A".ord => 100 },
366 |       emoji: :all,
367 |     )
368 |   end
369 | 
370 |   it "will respect given overwrite option" do
371 |     expect( display_width.of "A" ).to eq 100
372 |   end
373 | 
374 |   it "will respect given emoji option" do
375 |     expect( display_width.of "🤠‍🤢" ).to eq 2
376 |   end
377 | end
378 | 


--------------------------------------------------------------------------------