├── .github └── workflows │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── Gemfile ├── MIT-LICENSE.txt ├── README.md ├── Rakefile ├── data └── blocks.marshal.gz ├── lib └── unicode │ ├── blocks.rb │ └── blocks │ ├── constants.rb │ ├── index.rb │ └── string_ext.rb ├── spec └── unicode_blocks_spec.rb └── unicode-blocks.gemspec /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | name: Ruby ${{ matrix.ruby }} (${{ matrix.os }}) 8 | if: "!contains(github.event.head_commit.message, '[skip ci]')" 9 | strategy: 10 | matrix: 11 | ruby: 12 | - '3.3' 13 | - '3.2' 14 | - '3.1' 15 | - '3.0' 16 | - jruby 17 | - truffleruby 18 | os: 19 | - ubuntu-latest 20 | - macos-latest 21 | runs-on: ${{matrix.os}} 22 | steps: 23 | - uses: actions/checkout@v2 24 | - name: Set up Ruby 25 | uses: ruby/setup-ruby@v1 26 | with: 27 | ruby-version: ${{matrix.ruby}} 28 | bundler-cache: true 29 | - name: Run tests 30 | run: bundle exec rake 31 | 32 | test-windows: 33 | name: Ruby ${{ matrix.ruby }} (windows-latest) 34 | if: "!contains(github.event.head_commit.message, '[skip ci]')" 35 | strategy: 36 | matrix: 37 | ruby: 38 | - '3.3' 39 | - '3.2' 40 | - '3.1' 41 | - '3.0' 42 | - jruby 43 | runs-on: windows-latest 44 | steps: 45 | - uses: actions/checkout@v2 46 | - name: Set up Ruby 47 | uses: ruby/setup-ruby@v1 48 | with: 49 | ruby-version: ${{matrix.ruby}} 50 | bundler-cache: true 51 | - name: Run tests 52 | run: bundle exec rake 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Gemfile.lock 2 | /pkg 3 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## CHANGELOG 2 | 3 | ### 1.10.0 4 | 5 | - Unicode 16.0 6 | 7 | ### 1.9.0 8 | 9 | - Unicode 15.1 10 | 11 | ### 1.8.0 12 | 13 | - Unicode 15.0 14 | 15 | ### 1.7.0 16 | 17 | - Unicode 14.0 18 | 19 | ### 1.6.0 20 | 21 | * Unicode 13.0 22 | 23 | ### 1.5.0 24 | 25 | * Unicode 12.1 26 | 27 | ### 1.4.0 28 | 29 | * Unicode 12 30 | 31 | ### 1.3.0 32 | 33 | * Unicode 11 34 | * Do not depend on rubygems (only use zlib stdlib for unzipping) 35 | 36 | ### 1.2.2 37 | 38 | * Explicitly load rubygems/util, fixes regression in 1.2.1 39 | 40 | ### 1.2.1 41 | 42 | * Use `Gem::Util` for `gunzip`, removes deprecation warning 43 | 44 | ### 1.2.0 45 | 46 | * Unicode 10.0 47 | 48 | ### 1.1.0 49 | 50 | * Support Unicode 9.0 51 | 52 | ### 1.0.0 53 | 54 | * Initial release 55 | 56 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at opensource@janlelis.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gemspec 4 | 5 | gem 'minitest' 6 | gem 'rake' 7 | gem 'irb' unless RUBY_ENGINE == "jruby" 8 | -------------------------------------------------------------------------------- /MIT-LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-2024 Jan Lelis, https://janlelis.com 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unicode::Blocks [![[version]](https://badge.fury.io/rb/unicode-blocks.svg)](https://badge.fury.io/rb/unicode-blocks) [![[ci]](https://github.com/janlelis/unicode-blocks/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-blocks/actions?query=workflow%3ATest) 2 | 3 | Each Unicode character belongs to a [block](https://en.wikipedia.org/wiki/Unicode_block). This gem returns the all blocks associated with the given string. 4 | 5 | Unicode version: **16.0.0** (September 2024) 6 | 7 | Supported Rubies: **3.3**, **3.2**, **3.1**, **3.0** 8 | 9 | Old Rubies which might still work: **2.7**, **2.6**, **2.5**, **2.4**, **2.3**, **2.X** 10 | 11 | ## Gemfile 12 | 13 | ```ruby 14 | gem "unicode-blocks" 15 | ``` 16 | 17 | ## Usage 18 | 19 | ```ruby 20 | require "unicode/blocks" 21 | 22 | # All blocks of a string 23 | Unicode::Blocks.blocks("Abc") # => ["Basic Latin"] 24 | Unicode::Blocks.blocks("СC") # => ["Cyrillic", "Basic Latin"] 25 | Unicode::Blocks.blocks("⧉⪥⟤") # => ["Miscellaneous Mathematical Symbols-A", 26 | "Miscellaneous Mathematical Symbols-B", 27 | "Supplemental Mathematical Operators"] 28 | 29 | # Also aliased as .of 30 | Unicode::Blocks.of("🃉🂹") # => ["Playing Cards"] 31 | Unicode::Blocks.of("\u{10c50}") # => ["No_Block"] 32 | 33 | # Single character 34 | Unicode::Blocks.block("☼") # => "Miscellaneous Symbols" 35 | ``` 36 | 37 | The list of blocks is always sorted alphabetically. 38 | 39 | ## Hints 40 | 41 | ### Regex Matching 42 | 43 | If you have a string and want to match a substring/character from a specific Unicode block, you actually won't need this gem. Instead, you can use the [Regexp Unicode Property Syntax `\p{}`](https://ruby-doc.org/core/Regexp.html#class-Regexp-label-Character+Properties) with blocks by prefixing the block name with "In": 44 | 45 | ```ruby 46 | "⧉⪥⟤".scan(/\p{In Miscellaneous Mathematical Symbols-B}/) # => ["⧉"] 47 | ``` 48 | 49 | See [Idiosyncratic Ruby: Proper Unicoding](https://idiosyncratic-ruby.com/41-proper-unicoding.html) for more info. 50 | 51 | ### Block Names 52 | 53 | You can retrieve all block names (except for **No_Block**) like this: 54 | 55 | ```ruby 56 | require "unicode/blocks" 57 | puts Unicode::Blocks.names 58 | 59 | # # # Output # # # 60 | 61 | Basic Latin 62 | Latin-1 Supplement 63 | Latin Extended-A 64 | Latin Extended-B 65 | IPA Extensions 66 | Spacing Modifier Letters 67 | Combining Diacritical Marks 68 | Greek and Coptic 69 | Cyrillic 70 | Cyrillic Supplement 71 | Armenian 72 | Hebrew 73 | Arabic 74 | Syriac 75 | Arabic Supplement 76 | Thaana 77 | NKo 78 | Samaritan 79 | Mandaic 80 | Syriac Supplement 81 | Arabic Extended-B 82 | Arabic Extended-A 83 | Devanagari 84 | Bengali 85 | Gurmukhi 86 | Gujarati 87 | Oriya 88 | Tamil 89 | Telugu 90 | Kannada 91 | Malayalam 92 | Sinhala 93 | Thai 94 | Lao 95 | Tibetan 96 | Myanmar 97 | Georgian 98 | Hangul Jamo 99 | Ethiopic 100 | Ethiopic Supplement 101 | Cherokee 102 | Unified Canadian Aboriginal Syllabics 103 | Ogham 104 | Runic 105 | Tagalog 106 | Hanunoo 107 | Buhid 108 | Tagbanwa 109 | Khmer 110 | Mongolian 111 | Unified Canadian Aboriginal Syllabics Extended 112 | Limbu 113 | Tai Le 114 | New Tai Lue 115 | Khmer Symbols 116 | Buginese 117 | Tai Tham 118 | Combining Diacritical Marks Extended 119 | Balinese 120 | Sundanese 121 | Batak 122 | Lepcha 123 | Ol Chiki 124 | Cyrillic Extended-C 125 | Georgian Extended 126 | Sundanese Supplement 127 | Vedic Extensions 128 | Phonetic Extensions 129 | Phonetic Extensions Supplement 130 | Combining Diacritical Marks Supplement 131 | Latin Extended Additional 132 | Greek Extended 133 | General Punctuation 134 | Superscripts and Subscripts 135 | Currency Symbols 136 | Combining Diacritical Marks for Symbols 137 | Letterlike Symbols 138 | Number Forms 139 | Arrows 140 | Mathematical Operators 141 | Miscellaneous Technical 142 | Control Pictures 143 | Optical Character Recognition 144 | Enclosed Alphanumerics 145 | Box Drawing 146 | Block Elements 147 | Geometric Shapes 148 | Miscellaneous Symbols 149 | Dingbats 150 | Miscellaneous Mathematical Symbols-A 151 | Supplemental Arrows-A 152 | Braille Patterns 153 | Supplemental Arrows-B 154 | Miscellaneous Mathematical Symbols-B 155 | Supplemental Mathematical Operators 156 | Miscellaneous Symbols and Arrows 157 | Glagolitic 158 | Latin Extended-C 159 | Coptic 160 | Georgian Supplement 161 | Tifinagh 162 | Ethiopic Extended 163 | Cyrillic Extended-A 164 | Supplemental Punctuation 165 | CJK Radicals Supplement 166 | Kangxi Radicals 167 | Ideographic Description Characters 168 | CJK Symbols and Punctuation 169 | Hiragana 170 | Katakana 171 | Bopomofo 172 | Hangul Compatibility Jamo 173 | Kanbun 174 | Bopomofo Extended 175 | CJK Strokes 176 | Katakana Phonetic Extensions 177 | Enclosed CJK Letters and Months 178 | CJK Compatibility 179 | CJK Unified Ideographs Extension A 180 | Yijing Hexagram Symbols 181 | CJK Unified Ideographs 182 | Yi Syllables 183 | Yi Radicals 184 | Lisu 185 | Vai 186 | Cyrillic Extended-B 187 | Bamum 188 | Modifier Tone Letters 189 | Latin Extended-D 190 | Syloti Nagri 191 | Common Indic Number Forms 192 | Phags-pa 193 | Saurashtra 194 | Devanagari Extended 195 | Kayah Li 196 | Rejang 197 | Hangul Jamo Extended-A 198 | Javanese 199 | Myanmar Extended-B 200 | Cham 201 | Myanmar Extended-A 202 | Tai Viet 203 | Meetei Mayek Extensions 204 | Ethiopic Extended-A 205 | Latin Extended-E 206 | Cherokee Supplement 207 | Meetei Mayek 208 | Hangul Syllables 209 | Hangul Jamo Extended-B 210 | High Surrogates 211 | High Private Use Surrogates 212 | Low Surrogates 213 | Private Use Area 214 | CJK Compatibility Ideographs 215 | Alphabetic Presentation Forms 216 | Arabic Presentation Forms-A 217 | Variation Selectors 218 | Vertical Forms 219 | Combining Half Marks 220 | CJK Compatibility Forms 221 | Small Form Variants 222 | Arabic Presentation Forms-B 223 | Halfwidth and Fullwidth Forms 224 | Specials 225 | Linear B Syllabary 226 | Linear B Ideograms 227 | Aegean Numbers 228 | Ancient Greek Numbers 229 | Ancient Symbols 230 | Phaistos Disc 231 | Lycian 232 | Carian 233 | Coptic Epact Numbers 234 | Old Italic 235 | Gothic 236 | Old Permic 237 | Ugaritic 238 | Old Persian 239 | Deseret 240 | Shavian 241 | Osmanya 242 | Osage 243 | Elbasan 244 | Caucasian Albanian 245 | Vithkuqi 246 | Todhri 247 | Linear A 248 | Latin Extended-F 249 | Cypriot Syllabary 250 | Imperial Aramaic 251 | Palmyrene 252 | Nabataean 253 | Hatran 254 | Phoenician 255 | Lydian 256 | Meroitic Hieroglyphs 257 | Meroitic Cursive 258 | Kharoshthi 259 | Old South Arabian 260 | Old North Arabian 261 | Manichaean 262 | Avestan 263 | Inscriptional Parthian 264 | Inscriptional Pahlavi 265 | Psalter Pahlavi 266 | Old Turkic 267 | Old Hungarian 268 | Hanifi Rohingya 269 | Garay 270 | Rumi Numeral Symbols 271 | Yezidi 272 | Arabic Extended-C 273 | Old Sogdian 274 | Sogdian 275 | Old Uyghur 276 | Chorasmian 277 | Elymaic 278 | Brahmi 279 | Kaithi 280 | Sora Sompeng 281 | Chakma 282 | Mahajani 283 | Sharada 284 | Sinhala Archaic Numbers 285 | Khojki 286 | Multani 287 | Khudawadi 288 | Grantha 289 | Tulu-Tigalari 290 | Newa 291 | Tirhuta 292 | Siddham 293 | Modi 294 | Mongolian Supplement 295 | Takri 296 | Myanmar Extended-C 297 | Ahom 298 | Dogra 299 | Warang Citi 300 | Dives Akuru 301 | Nandinagari 302 | Zanabazar Square 303 | Soyombo 304 | Unified Canadian Aboriginal Syllabics Extended-A 305 | Pau Cin Hau 306 | Devanagari Extended-A 307 | Sunuwar 308 | Bhaiksuki 309 | Marchen 310 | Masaram Gondi 311 | Gunjala Gondi 312 | Makasar 313 | Kawi 314 | Lisu Supplement 315 | Tamil Supplement 316 | Cuneiform 317 | Cuneiform Numbers and Punctuation 318 | Early Dynastic Cuneiform 319 | Cypro-Minoan 320 | Egyptian Hieroglyphs 321 | Egyptian Hieroglyph Format Controls 322 | Egyptian Hieroglyphs Extended-A 323 | Anatolian Hieroglyphs 324 | Gurung Khema 325 | Bamum Supplement 326 | Mro 327 | Tangsa 328 | Bassa Vah 329 | Pahawh Hmong 330 | Kirat Rai 331 | Medefaidrin 332 | Miao 333 | Ideographic Symbols and Punctuation 334 | Tangut 335 | Tangut Components 336 | Khitan Small Script 337 | Tangut Supplement 338 | Kana Extended-B 339 | Kana Supplement 340 | Kana Extended-A 341 | Small Kana Extension 342 | Nushu 343 | Duployan 344 | Shorthand Format Controls 345 | Symbols for Legacy Computing Supplement 346 | Znamenny Musical Notation 347 | Byzantine Musical Symbols 348 | Musical Symbols 349 | Ancient Greek Musical Notation 350 | Kaktovik Numerals 351 | Mayan Numerals 352 | Tai Xuan Jing Symbols 353 | Counting Rod Numerals 354 | Mathematical Alphanumeric Symbols 355 | Sutton SignWriting 356 | Latin Extended-G 357 | Glagolitic Supplement 358 | Cyrillic Extended-D 359 | Nyiakeng Puachue Hmong 360 | Toto 361 | Wancho 362 | Nag Mundari 363 | Ol Onal 364 | Ethiopic Extended-B 365 | Mende Kikakui 366 | Adlam 367 | Indic Siyaq Numbers 368 | Ottoman Siyaq Numbers 369 | Arabic Mathematical Alphabetic Symbols 370 | Mahjong Tiles 371 | Domino Tiles 372 | Playing Cards 373 | Enclosed Alphanumeric Supplement 374 | Enclosed Ideographic Supplement 375 | Miscellaneous Symbols and Pictographs 376 | Emoticons 377 | Ornamental Dingbats 378 | Transport and Map Symbols 379 | Alchemical Symbols 380 | Geometric Shapes Extended 381 | Supplemental Arrows-C 382 | Supplemental Symbols and Pictographs 383 | Chess Symbols 384 | Symbols and Pictographs Extended-A 385 | Symbols for Legacy Computing 386 | CJK Unified Ideographs Extension B 387 | CJK Unified Ideographs Extension C 388 | CJK Unified Ideographs Extension D 389 | CJK Unified Ideographs Extension E 390 | CJK Unified Ideographs Extension F 391 | CJK Unified Ideographs Extension I 392 | CJK Compatibility Ideographs Supplement 393 | CJK Unified Ideographs Extension G 394 | CJK Unified Ideographs Extension H 395 | Tags 396 | Variation Selectors Supplement 397 | Supplementary Private Use Area-A 398 | Supplementary Private Use Area-B 399 | ``` 400 | 401 | See [unicode-x](https://github.com/janlelis/unicode-x) for more Unicode related micro libraries. 402 | 403 | ## MIT License 404 | 405 | - Copyright (C) 2016-2024 Jan Lelis . Released under the MIT license. 406 | - Unicode data: https://www.unicode.org/copyright.html#Exhibit1 407 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # # # 2 | # Get gemspec info 3 | 4 | gemspec_file = Dir['*.gemspec'].first 5 | gemspec = eval File.read(gemspec_file), binding, gemspec_file 6 | info = "#{gemspec.name} | #{gemspec.version} | " \ 7 | "#{gemspec.runtime_dependencies.size} dependencies | " \ 8 | "#{gemspec.files.size} files" 9 | 10 | # # # 11 | # Gem build and install task 12 | 13 | desc info 14 | task :gem do 15 | puts info + "\n\n" 16 | print " "; sh "gem build #{gemspec_file}" 17 | FileUtils.mkdir_p 'pkg' 18 | FileUtils.mv "#{gemspec.name}-#{gemspec.version}.gem", 'pkg' 19 | puts; sh %{gem install --no-document pkg/#{gemspec.name}-#{gemspec.version}.gem} 20 | end 21 | 22 | # # # 23 | # Start an IRB session with the gem loaded 24 | 25 | desc "#{gemspec.name} | IRB" 26 | task :irb do 27 | sh "irb -I ./lib -r #{gemspec.name.gsub '-','/'}" 28 | end 29 | 30 | # # # 31 | # Run Specs 32 | 33 | desc "#{gemspec.name} | Spec" 34 | task :spec do 35 | if RbConfig::CONFIG['host_os'] =~ /mswin|mingw/ 36 | sh "for %f in (spec/\*.rb) do ruby spec/%f" 37 | else 38 | sh "for file in spec/*.rb; do ruby $file; done" 39 | end 40 | end 41 | task default: :spec 42 | -------------------------------------------------------------------------------- /data/blocks.marshal.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/janlelis/unicode-blocks/bda440465d24af0f734fbfef1847bef19c44294b/data/blocks.marshal.gz -------------------------------------------------------------------------------- /lib/unicode/blocks.rb: -------------------------------------------------------------------------------- 1 | require_relative "blocks/constants" 2 | 3 | module Unicode 4 | module Blocks 5 | def self.blocks(string) 6 | res = [] 7 | string.each_char{ |char| 8 | block_name = block(char) 9 | res << block_name unless res.include?(block_name) 10 | } 11 | res.sort 12 | end 13 | class << self; alias of blocks; end 14 | 15 | def self.block(char) 16 | require_relative 'blocks/index' unless defined? ::Unicode::Blocks::INDEX 17 | codepoint = char.unpack("U")[0] or raise(ArgumentError, "Unicode::Blocks.block must be given a valid char") 18 | block_info = INDEX.bsearch{ |block_info| codepoint <= block_info[1] } 19 | codepoint >= block_info[0] ? block_info[2] : "No_Block" 20 | end 21 | 22 | def self.names 23 | require_relative 'blocks/index' unless defined? ::Unicode::Blocks::INDEX 24 | INDEX.map(&:last) 25 | end 26 | end 27 | end 28 | 29 | -------------------------------------------------------------------------------- /lib/unicode/blocks/constants.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Unicode 4 | module Blocks 5 | VERSION = "1.10.0" 6 | UNICODE_VERSION = "16.0.0" 7 | DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../../data/").freeze 8 | INDEX_FILENAME = (DATA_DIRECTORY + "/blocks.marshal.gz").freeze 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /lib/unicode/blocks/index.rb: -------------------------------------------------------------------------------- 1 | require "zlib" 2 | require_relative "constants" 3 | 4 | module Unicode 5 | module Blocks 6 | File.open(INDEX_FILENAME, "rb") do |file| 7 | serialized_data = Zlib::GzipReader.new(file).read 8 | serialized_data.force_encoding Encoding::BINARY 9 | INDEX = Marshal.load(serialized_data) 10 | end 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /lib/unicode/blocks/string_ext.rb: -------------------------------------------------------------------------------- 1 | require_relative "../blocks" 2 | 3 | class String 4 | # Optional string extension for your convenience 5 | def unicode_blocks 6 | Unicode::Blocks.of(self) 7 | end 8 | end 9 | -------------------------------------------------------------------------------- /spec/unicode_blocks_spec.rb: -------------------------------------------------------------------------------- 1 | require_relative "../lib/unicode/blocks" 2 | require "minitest/autorun" 3 | 4 | describe Unicode::Blocks do 5 | describe ".blocks (alias .of)" do 6 | it "will always return an Array" do 7 | assert_equal [], Unicode::Blocks.of("") 8 | end 9 | 10 | it "will return all blocks that characters in the string belong to" do 11 | assert_equal ["Basic Latin", "Cyrillic"], Unicode::Blocks.of("СC") 12 | end 13 | 14 | it "will return all blocks in sorted order" do 15 | assert_equal ["Basic Latin", "Cyrillic"], Unicode::Blocks.of("СA") 16 | assert_equal ["Basic Latin", "Cyrillic"], Unicode::Blocks.of("AС") 17 | end 18 | 19 | it "will call .block for every character" do 20 | mocked_method = Minitest::Mock.new 21 | mocked_method.expect :call, "first block", ["С"] 22 | mocked_method.expect :call, "second block", ["A"] 23 | Unicode::Blocks.stub :block, mocked_method do 24 | Unicode::Blocks.of("СA") 25 | end 26 | mocked_method.verify 27 | end 28 | end 29 | 30 | describe ".block" do 31 | it "will return block for that character" do 32 | assert_equal "Specials", Unicode::Blocks.block("�") 33 | end 34 | 35 | it "will return No_Block for characters not in any block" do 36 | assert_equal "No_Block", Unicode::Blocks.block("\u{10c50}") 37 | end 38 | end 39 | 40 | describe ".names" do 41 | it "will return a list of all block names" do 42 | assert_kind_of Array, Unicode::Blocks.names 43 | assert_includes Unicode::Blocks.names, "Ancient Symbols" 44 | end 45 | end 46 | end 47 | 48 | -------------------------------------------------------------------------------- /unicode-blocks.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | require File.dirname(__FILE__) + "/lib/unicode/blocks/constants" 4 | 5 | Gem::Specification.new do |gem| 6 | gem.name = "unicode-blocks" 7 | gem.version = Unicode::Blocks::VERSION 8 | gem.summary = "Return Unicode blocks of a string." 9 | gem.description = "[Unicode #{Unicode::Blocks::UNICODE_VERSION}] Answers the question: Which Unicode block does a code point belong to?" 10 | gem.authors = ["Jan Lelis"] 11 | gem.email = ["hi@ruby.consulting"] 12 | gem.homepage = "https://github.com/janlelis/unicode-blocks" 13 | gem.license = "MIT" 14 | 15 | gem.files = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^pkg/ } 16 | gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } 17 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) 18 | gem.require_paths = ["lib"] 19 | gem.metadata = { "rubygems_mfa_required" => "true" } 20 | 21 | gem.required_ruby_version = ">= 2.0" 22 | end 23 | --------------------------------------------------------------------------------