├── .document ├── .github └── workflows │ └── unf_ext.yml ├── .gitignore ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── ext └── unf_ext │ ├── extconf.rb │ ├── unf.cc │ └── unf │ ├── normalizer.hh │ ├── table.hh │ ├── trie │ ├── char_stream.hh │ ├── node.hh │ └── searcher.hh │ └── util.hh ├── lib ├── unf_ext.rb └── unf_ext │ └── version.rb ├── test ├── helper.rb ├── normalization-test.txt └── test_unf_ext.rb └── unf_ext.gemspec /.document: -------------------------------------------------------------------------------- 1 | lib/**/*.rb 2 | bin/* 3 | - 4 | features/**/*.feature 5 | LICENSE.txt 6 | -------------------------------------------------------------------------------- /.github/workflows/unf_ext.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | name: >- 8 | ${{ matrix.os }} ${{ matrix.ruby }} 9 | 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | os: [ ubuntu-22.04, macos-12, windows-2022 ] 15 | ruby: [ 2.7, "3.0", 3.1, 3.2, head ] 16 | include: 17 | - { os: windows-2022, ruby: ucrt } 18 | - { os: windows-2022, ruby: mswin } 19 | exclude: 20 | - { os: windows-2022, ruby: head } 21 | 22 | steps: 23 | - name: repo checkout 24 | uses: actions/checkout@v4 25 | 26 | - uses: ruby/setup-ruby@v1 27 | with: 28 | ruby-version: ${{ matrix.ruby }} 29 | bundler-cache: true 30 | 31 | - name: compile 32 | timeout-minutes: 5 33 | run: bundle exec rake compile 34 | 35 | - name: test 36 | timeout-minutes: 5 37 | run: bundle exec rake test 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | *.so 4 | *.bundle 5 | .config 6 | .yardoc 7 | Gemfile.lock 8 | InstalledFiles 9 | _yardoc 10 | coverage 11 | doc/ 12 | lib/bundler/man 13 | ext/*/Makefile 14 | pkg 15 | 
rdoc 16 | spec/reports 17 | test/tmp 18 | test/version_tmp 19 | tmp 20 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.0.9.1](https://github.com/knu/ruby-unf_ext/tree/v0.0.9.1) (2023-11-16) 4 | 5 | [Full Changelog](https://github.com/knu/ruby-unf_ext/compare/v0.0.9...v0.0.9.1) 6 | 7 | - Fix use of designated initializers. [\#75](https://github.com/knu/ruby-unf_ext/pull/75) ([igorpeshansky](https://github.com/igorpeshansky)) 8 | 9 | ## [0.0.9](https://github.com/knu/ruby-unf_ext/tree/v0.0.9) (2023-11-11) 10 | 11 | [Full Changelog](https://github.com/knu/ruby-unf_ext/compare/v0.0.8.2...v0.0.9) 12 | 13 | - Use the newer TypedData extension API [\#72](https://github.com/knu/ruby-unf_ext/pull/72) ([casperisfine](https://github.com/casperisfine)) 14 | - Update Actions - updates OS's, add Ruby 3,2, etc [\#71](https://github.com/knu/ruby-unf_ext/pull/71) ([MSP-Greg](https://github.com/MSP-Greg)) 15 | - Add cross compilation for Ruby 3.2 [\#69](https://github.com/knu/ruby-unf_ext/pull/69) ([johnnyshields](https://github.com/johnnyshields)) 16 | 17 | ## 0.0.8.2 (2022-05-26) 18 | 19 | - Add x64-mingw-ucrt native gem support for RubyInstaller 3.1. 20 | 21 | ## 0.0.8.1 (2022-03-13) 22 | 23 | - Include Windows binaries for Ruby 3.1. (FAIL) 24 | 25 | ## 0.0.8 (2021-09-14) 26 | 27 | - No functional change in the library code. 28 | - Include Windows binaries for Ruby 3.0. 29 | - Drop support for Ruby 2.1 and earlier. 30 | - Replace Travis CI with Github Actions. 31 | - Fix cross-build after upgrading rake-compiler/rake-compiler-dock to 1.1.1/1.1.0. 32 | 33 | ## 0.0.7.7 (2020-03-30) 34 | 35 | - Include Windows binaries for Ruby 2.7. 36 | 37 | ## 0.0.7.6 (2019-03-19) 38 | 39 | - Include Windows binaries for Ruby 2.6. 40 | 41 | ## 0.0.7.5 (2018-02-06) 42 | 43 | - Include Windows binaries for Ruby 2.5. 
44 | 45 | ## 0.0.7.4 (2017-04-19) 46 | 47 | - Fix build on ARM and GCC 6 again. 48 | 49 | ## 0.0.7.3 (2017-04-11) 50 | 51 | - Update the base Unicode version to 9. 52 | 53 | - Fix compile issues on ARM and GCC 6. 54 | 55 | ## 0.0.7.2 (2016-02-01) 56 | 57 | - Include Windows binaries for Ruby 2.3. 58 | 59 | ## 0.0.7.1 (2015-04-18) 60 | 61 | - Windows fat binary gems no longer require libstd++ to 62 | run, which are statically linked. 63 | 64 | - Add a fat binary gem for x64-mingw32 (64bit Windows). 65 | 66 | - Windows fat binary gems now include binaries for Ruby up to 2.2. 67 | 68 | ## 0.0.6 (2013-02-16) 69 | 70 | - Migrate from Jeweler to Bundler. 71 | 72 | ## 0.0.5 (2012-05-30) 73 | 74 | - Fix a type error for strict compilers. 75 | 76 | ## 0.0.4 (2011-12-08) 77 | 78 | - Release under the current name of `unf_ext`. 79 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in ruby-unf_ext.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2010 Takeru Ohta 4 | Copyright (c) 2011-2018 Akinori MUSHA (extended Ruby support) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or 
substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ruby-unf_ext 2 | ============ 3 | 4 | Synopsis 5 | -------- 6 | 7 | * Unicode Normalization Form support library for CRuby 8 | 9 | Description 10 | ----------- 11 | 12 | * Normalizes UTF-8 strings into and from NFC, NFD, NFKC or NFKD 13 | 14 | # For bulk conversion 15 | normalizer = UNF::Normalizer.new 16 | a_bunch_of_strings.map! { |string| 17 | normalizer.normalize(string, :nfc) #=> string in NFC 18 | } 19 | 20 | * Compliant with Unicode 9.0 21 | 22 | Requirement 23 | ----------- 24 | 25 | * Ruby 2.2+ 26 | 27 | * C++ compiler and libstdc++ 28 | 29 | Installation 30 | ------------ 31 | 32 | gem install unf_ext 33 | 34 | Or: 35 | 36 | ruby extconf.rb && make && make install 37 | 38 | Development Resources 39 | --------------------- 40 | 41 | * https://github.com/sile/unf 42 | 43 | For issues regarding files under the directory `unf`, please 44 | contact this upstream. 45 | 46 | * https://github.com/knu/ruby-unf_ext 47 | 48 | The development site and the repository. 49 | 50 | License 51 | ------- 52 | 53 | Copyright (c) 2010-2017 Takeru Ohta 54 | Copyright (c) 2011-2018 Akinori MUSHA 55 | 56 | Licensed under the MIT license. 57 | See `LICENSE.txt` for details. 
58 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'bundler/gem_tasks' 2 | 3 | gemspec = Bundler::GemHelper.gemspec 4 | 5 | native_platforms = %w[ 6 | x86-mingw32 7 | x64-mingw32 8 | x64-mingw-ucrt 9 | ] 10 | 11 | require 'rake/extensiontask' 12 | Rake::ExtensionTask.new('unf_ext', gemspec) do |ext| 13 | ext.cross_compile = true 14 | ext.cross_platform = native_platforms 15 | ext.cross_config_options << '--with-ldflags="-static-libgcc"' << '--with-static-libstdc++' 16 | end 17 | 18 | namespace :gem do 19 | task :native do 20 | require 'rake_compiler_dock' 21 | sh 'bundle package --all' 22 | native_platforms.each do |plat| 23 | RakeCompilerDock.sh "bundle --local && rake native:#{plat} gem", platform: plat 24 | end 25 | end 26 | end 27 | 28 | task :gems => %i[build gem:native] 29 | 30 | require 'rake/testtask' 31 | Rake::TestTask.new(:test) do |test| 32 | test.libs << 'test' 33 | test.test_files = gemspec.test_files 34 | test.verbose = true 35 | end 36 | 37 | task :default => :test 38 | -------------------------------------------------------------------------------- /ext/unf_ext/extconf.rb: -------------------------------------------------------------------------------- 1 | require 'mkmf' 2 | 3 | if with_config('static-libstdc++') 4 | $LDFLAGS << ' ' << `#{CONFIG['CC']} -print-file-name=libstdc++.a`.chomp 5 | else 6 | have_library('stdc++') 7 | 8 | case RbConfig::CONFIG['host_os'] 9 | when /mswin/ 10 | # Avoid `error C7555: use of designated initializers requires at least '/std:c++20'` 11 | $CFLAGS << ' ' << '-std:c++20' 12 | $CPPFLAGS << ' ' << '-std:c++20' 13 | $CXXFLAGS << ' ' << '-std:c++20' 14 | when /solaris(!?2.11)/ 15 | # Do a little trickery here to enable C++ standard on Solaris 11 if found. 16 | # This also forces 64bit compilation mode. 
17 | $CXX = CONFIG['CXX'] 18 | $CXX << ' ' << '-m64' 19 | $CFLAGS = CONFIG['CFLAGS'].gsub(/-std=c99/, '') 20 | $CFLAGS << ' ' << '-m64 -std=c++11' 21 | $CPPFLAGS = CONFIG['CFLAGS'].gsub(/-std=c99/, '') 22 | $CPPFLAGS << ' ' << '-m64 -std=c++11' 23 | $CXXFLAGS = CONFIG['CFLAGS'].gsub(/-std=c99/, '') 24 | $CXXFLAGS << ' ' << '-m64 -std=c++11' 25 | when /aix/ 26 | # Compiler flags necessary on AIX. 27 | # rubocop:disable Style/GlobalVars 28 | $CFLAGS << ' ' << '-D_ALL_SOURCE=1' 29 | $CPPFLAGS << ' ' << '-D_ALL_SOURCE=1' 30 | $CXXFLAGS << ' ' << '-D_ALL_SOURCE=1' 31 | end 32 | end 33 | 34 | create_makefile 'unf_ext' 35 | 36 | unless CONFIG['CXX'] 37 | case CONFIG['CC'] 38 | when %r{((?:.*[-/])?)gcc([-0-9.]*)$} 39 | cxx = $1 + 'g++' + $2 40 | when %r{((?:.*[-/])?)clang([-0-9.]*)$} 41 | cxx = $1 + 'clang++' + $2 42 | else 43 | cxx = CONFIG['CC'] 44 | end 45 | 46 | warn "CXX is automatically set to #{cxx}" 47 | 48 | new_mf = <<-EOF << File.read('Makefile') 49 | CXX=#{cxx} 50 | EOF 51 | 52 | File.open('Makefile', 'w') { |mf| 53 | mf.print new_mf 54 | } 55 | end 56 | -------------------------------------------------------------------------------- /ext/unf_ext/unf.cc: -------------------------------------------------------------------------------- 1 | #include "unf/normalizer.hh" 2 | 3 | #include <ruby.h> 4 | #if defined(HAVE_RUBY_ENCODING_H) 5 | #include <ruby/encoding.h> 6 | #endif 7 | 8 | extern "C" { 9 | VALUE unf_allocate(VALUE klass); 10 | VALUE unf_initialize(VALUE self); 11 | void unf_delete(void* ptr); 12 | VALUE unf_normalize(VALUE self, VALUE source, VALUE normalization_form); 13 | 14 | ID FORM_NFD; 15 | ID FORM_NFC; 16 | ID FORM_NFKD; 17 | ID FORM_NFKC; 18 | 19 | void Init_unf_ext() { 20 | VALUE mdl = rb_define_module("UNF"); 21 | 22 | VALUE cls = rb_define_class_under(mdl, "Normalizer", rb_cObject); 23 | rb_define_alloc_func(cls, unf_allocate); 24 | rb_define_method(cls, "initialize", (VALUE (*)(...))unf_initialize, 0); 25 | rb_define_method(cls, "normalize", (VALUE 
(*)(...))unf_normalize, 2); 26 | 27 | FORM_NFD = rb_intern("nfd"); 28 | FORM_NFC = rb_intern("nfc"); 29 | FORM_NFKD= rb_intern("nfkd"); 30 | FORM_NFKC= rb_intern("nfkc"); 31 | } 32 | 33 | static const rb_data_type_t unf_normalizer_data_type = { 34 | .wrap_struct_name = "UNF::Normalizer", 35 | .function = { 36 | .dmark = NULL, 37 | .dfree = unf_delete, 38 | .dsize = NULL 39 | }, 40 | .parent = NULL, 41 | .data = NULL, 42 | .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED 43 | }; 44 | 45 | VALUE unf_allocate(VALUE klass) { 46 | UNF::Normalizer* ptr; 47 | VALUE obj = TypedData_Make_Struct(klass, UNF::Normalizer, &unf_normalizer_data_type, ptr); 48 | new ((void*)ptr) UNF::Normalizer; 49 | return obj; 50 | } 51 | 52 | VALUE unf_initialize(VALUE self) { 53 | return self; 54 | } 55 | 56 | void unf_delete(void *data) { 57 | UNF::Normalizer* ptr = (UNF::Normalizer*)data; 58 | ptr->~Normalizer(); 59 | ruby_xfree(ptr); 60 | } 61 | 62 | VALUE unf_normalize(VALUE self, VALUE source, VALUE normalization_form) { 63 | UNF::Normalizer* ptr; 64 | TypedData_Get_Struct(self, UNF::Normalizer, &unf_normalizer_data_type, ptr); 65 | 66 | const char* src = StringValueCStr(source); 67 | const char* rlt; 68 | ID form_id = SYM2ID(normalization_form); 69 | 70 | if(form_id == FORM_NFD) 71 | rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFD); 72 | else if(form_id == FORM_NFC) 73 | rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFC); 74 | else if(form_id == FORM_NFKD) 75 | rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFKD); 76 | else if(form_id == FORM_NFKC) 77 | rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFKC); 78 | else 79 | rb_raise(rb_eArgError, "Specified Normalization-Form is unknown. 
Please select one from among :nfc, :nfd, :nfkc, :nfkd."); 80 | 81 | #if defined(HAVE_RUBY_ENCODING_H) 82 | return rb_enc_str_new(rlt, strlen(rlt), rb_utf8_encoding()); 83 | #else 84 | return rb_str_new2(rlt); 85 | #endif 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /ext/unf_ext/unf/normalizer.hh: -------------------------------------------------------------------------------- 1 | #ifndef UNF_NORMALIZER_HH 2 | #define UNF_NORMALIZER_HH 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "trie/searcher.hh" 9 | #include "trie/char_stream.hh" 10 | #include "table.hh" 11 | #include "util.hh" 12 | 13 | namespace UNF { 14 | class Normalizer { 15 | public: 16 | enum Form { FORM_NFD, FORM_NFC, FORM_NFKD, FORM_NFKC }; 17 | 18 | public: 19 | Normalizer() 20 | : nf_d(TABLE::NODES, TABLE::CANONICAL_DECOM_ROOT, (const char *)TABLE::STRINGS), 21 | nf_kd(TABLE::NODES, TABLE::COMPATIBILITY_DECOM_ROOT, (const char *)TABLE::STRINGS), 22 | nf_c(TABLE::NODES, TABLE::CANONICAL_COM_ROOT, (const char *)TABLE::STRINGS), 23 | nf_c_qc(TABLE::NODES, TABLE::NFC_ILLEGAL_ROOT), 24 | nf_kc_qc(TABLE::NODES, TABLE::NFKC_ILLEGAL_ROOT), 25 | ccc(TABLE::NODES, TABLE::CANONICAL_CLASS_ROOT) 26 | {} 27 | 28 | const char* normalize(const char* src, Form form) { 29 | switch(form) { 30 | case FORM_NFD: return nfd(src); 31 | case FORM_NFC: return nfc(src); 32 | case FORM_NFKD: return nfkd(src); 33 | case FORM_NFKC: return nfkc(src); 34 | default: return src; 35 | } 36 | } 37 | const char* nfd(const char* src) { return decompose(src, nf_d); } 38 | const char* nfkd(const char* src) { return decompose(src, nf_kd); } 39 | const char* nfc(const char* src) { return compose(src, nf_c_qc, nf_d); } 40 | const char* nfkc(const char* src) { return compose(src, nf_kc_qc, nf_kd); } 41 | 42 | private: 43 | const char* decompose(const char* src, const Trie::NormalizationForm& nf) { 44 | const char* beg = next_invalid_char(src, nf); 45 | if(*beg=='\0') 
46 | return src; 47 | 48 | buffer.assign(src, beg); 49 | do { 50 | const char* end = next_valid_starter(beg, nf); 51 | decompose_one(beg, end, nf, buffer); 52 | beg = next_invalid_char(end, nf); 53 | buffer.append(end, beg); 54 | } while(*beg!='\0'); 55 | 56 | return buffer.c_str(); 57 | } 58 | 59 | void decompose_one(const char* beg, const char* end, const Trie::NormalizationForm& nf, std::string& buf) { 60 | unsigned last = buf.size(); 61 | nf.decompose(Trie::RangeCharStream(beg,end), buf); 62 | char* bufbeg = const_cast(buf.data()); 63 | canonical_combining_class_ordering(bufbeg+last, bufbeg+buf.size()); 64 | } 65 | 66 | const char* compose(const char* src, const Trie::NormalizationForm& nf, const Trie::NormalizationForm& nf_decomp) { 67 | const char* beg = next_invalid_char(src, nf); 68 | if(*beg=='\0') 69 | return src; 70 | 71 | buffer.assign(src, beg); 72 | while(*beg!='\0') { 73 | const char* end = next_valid_starter(beg, nf); 74 | buffer2.clear(); 75 | decompose_one(beg, end, nf_decomp, buffer2); 76 | end = compose_one(buffer2.c_str(), end, buffer); 77 | beg = next_invalid_char(end, nf); 78 | buffer.append(end, beg); 79 | } 80 | 81 | return buffer.c_str(); 82 | } 83 | 84 | const char* compose_one(const char* starter, const char* rest_starter, std::string& buf) { 85 | Trie::CharStreamForComposition in(starter, rest_starter, canonical_classes, buffer3); 86 | while(in.within_first()) 87 | nf_c.compose(in, buf); 88 | return in.cur(); 89 | } 90 | 91 | void canonical_combining_class_ordering(char* beg, const char* end) { 92 | canonical_classes.assign(end-beg+1, 0); // +1 is for sentinel value 93 | ccc.sort(beg, canonical_classes); 94 | } 95 | 96 | const char* next_invalid_char(const char* src, const Trie::NormalizationForm& nf) const { 97 | int last_canonical_class = 0; 98 | const char* cur = Util::nearest_utf8_char_start_point(src); 99 | const char* starter = cur; 100 | 101 | for(; *cur != '\0'; cur = Util::nearest_utf8_char_start_point(cur+1)) { 102 | int 
canonical_class = ccc.get_class(cur); 103 | if(last_canonical_class > canonical_class && canonical_class != 0) 104 | return starter; 105 | 106 | if(nf.quick_check(cur)==false) 107 | return starter; 108 | 109 | if(canonical_class==0) 110 | starter=cur; 111 | 112 | last_canonical_class = canonical_class; 113 | } 114 | return cur; 115 | } 116 | 117 | const char* next_valid_starter(const char* src, const Trie::NormalizationForm& nf) const { 118 | const char* cur = Util::nearest_utf8_char_start_point(src+1); 119 | while(ccc.get_class(cur)!=0 || nf.quick_check(cur)==false) 120 | cur = Util::nearest_utf8_char_start_point(cur+1); 121 | return cur; 122 | } 123 | 124 | private: 125 | const Trie::NormalizationForm nf_d; 126 | const Trie::NormalizationForm nf_kd; 127 | const Trie::NormalizationForm nf_c; 128 | const Trie::NormalizationForm nf_c_qc; 129 | const Trie::NormalizationForm nf_kc_qc; 130 | const Trie::CanonicalCombiningClass ccc; 131 | 132 | std::string buffer; 133 | std::string buffer2; 134 | std::string buffer3; 135 | std::vector canonical_classes; 136 | }; 137 | } 138 | 139 | #endif 140 | -------------------------------------------------------------------------------- /ext/unf_ext/unf/trie/char_stream.hh: -------------------------------------------------------------------------------- 1 | #ifndef UNF_TRIE_CHAR_STREAM_HH 2 | #define UNF_TRIE_CHAR_STREAM_HH 3 | 4 | #include 5 | #include 6 | #include "../util.hh" 7 | 8 | namespace UNF { 9 | namespace Trie { 10 | class CharStream { 11 | public: 12 | CharStream(const char* str) : cur_(str) {} 13 | unsigned char read() { return eos() ? 
'\0' : *cur_++; } 14 | unsigned char prev() const { return cur_[-1]; } 15 | unsigned char peek() const { return *cur_; } 16 | const char* cur() const { return cur_; } 17 | bool eos() const { return *cur_ == '\0'; } 18 | void setCur(const char* new_cur) { cur_ = new_cur; } 19 | 20 | private: 21 | const char* cur_; 22 | }; 23 | 24 | class RangeCharStream { 25 | public: 26 | RangeCharStream(const char* beg, const char* end) : cur_(beg), end_(end) {} 27 | unsigned char read() { return eos() ? '\0' : *cur_++; } 28 | unsigned char prev() const { return cur_[-1]; } 29 | unsigned char peek() const { return *cur_; } 30 | const char* cur() const { return cur_; } 31 | const char* end() const { return end_; } 32 | bool eos() const { return cur_ == end_; } 33 | 34 | private: 35 | const char* cur_; 36 | const char* end_; 37 | }; 38 | 39 | class CompoundCharStream { 40 | public: 41 | CompoundCharStream(const char* first, const char* second) 42 | : beg1(first), beg2(second), cur1(beg1), cur2(beg2) {} 43 | 44 | unsigned char read() { return !eos1() ? read1() : read2(); } 45 | unsigned char peek() const { return !eos1() ? *cur1 : *cur2; } 46 | unsigned char prev() const { return !eos1() || beg2==cur2 ? cur1[-1] : cur2[-1]; } 47 | 48 | const char* cur() const { return !eos1() ? cur1 : cur2; } 49 | bool eos() const { return eos1() && eos2(); } 50 | bool within_first() const { return !eos1(); } 51 | 52 | unsigned offset() const { return cur1-beg1 + cur2-beg2; } 53 | void setCur(const char* p) { 54 | if(beg1 <= p && p <= cur1) { 55 | cur1=p; 56 | cur2=beg2; 57 | } else { 58 | cur2=p; 59 | } 60 | } 61 | 62 | protected: 63 | unsigned char read1() { return eos1() ? '\0' : *cur1++; } 64 | unsigned char read2() { return eos2() ? 
'\0' : *cur2++; } 65 | bool eos1() const { return *cur1=='\0'; } 66 | bool eos2() const { return *cur2=='\0'; } 67 | 68 | protected: 69 | const char* beg1; 70 | const char* beg2; 71 | const char* cur1; 72 | const char* cur2; 73 | }; 74 | 75 | class CharStreamForComposition : public CompoundCharStream { 76 | public: 77 | CharStreamForComposition (const char* first, const char* second, 78 | const std::vector<unsigned char>& canonical_classes, 79 | std::string& buf) 80 | : CompoundCharStream(first, second), classes(canonical_classes), skipped(buf) 81 | {} 82 | 83 | void init_skipinfo() { 84 | skipped.clear(); 85 | skipped_tail = 0; 86 | } 87 | 88 | void mark_as_last_valid_point() { 89 | skipped_tail = skipped.size(); 90 | marked_point = cur(); 91 | } 92 | 93 | void reset_at_marked_point() { 94 | setCur(marked_point); 95 | } 96 | 97 | void append_read_char_to_str(std::string& s, const char* beg) const { 98 | if(eos1()==false) { 99 | s.append(beg, cur()); 100 | } else { 101 | s.append(beg, cur1); 102 | s.append(beg2, cur()); 103 | } 104 | } 105 | 106 | void append_skipped_chars_to_str(std::string& s) const { 107 | s.append(skipped.begin(), skipped.begin()+skipped_tail); 108 | } 109 | 110 | unsigned char get_canonical_class() const { 111 | return offset() < classes.size() ? 
classes[offset()] : 0; 112 | } 113 | 114 | bool next_combining_char(unsigned char prev_class, const char* ppp) { 115 | while(Util::is_utf8_char_start_byte(peek()) == false) 116 | read(); 117 | 118 | unsigned char mid_class = get_prev_canonical_class(); 119 | unsigned char cur_class = get_canonical_class(); 120 | 121 | if(prev_class==0 && mid_class==0 && cur_class!=0) 122 | return false; 123 | 124 | if(prev_class < cur_class && mid_class < cur_class) { 125 | skipped.append(ppp, cur()); 126 | return true; 127 | } else { 128 | if(cur_class != 0) { 129 | read(); 130 | return next_combining_char(prev_class,ppp); 131 | } 132 | return false; 133 | } 134 | } 135 | 136 | private: 137 | unsigned char get_prev_canonical_class() const { 138 | return offset()-1 < classes.size() ? classes[offset()-1] : 0; 139 | } 140 | 141 | private: 142 | const std::vector<unsigned char>& classes; 143 | std::string& skipped; 144 | unsigned skipped_tail; 145 | const char* marked_point; 146 | }; 147 | } 148 | } 149 | 150 | #endif 151 | -------------------------------------------------------------------------------- /ext/unf_ext/unf/trie/node.hh: -------------------------------------------------------------------------------- 1 | #ifndef UNF_TRIE_NODE_HH 2 | #define UNF_TRIE_NODE_HH 3 | 4 | namespace UNF { 5 | namespace Trie { 6 | class Node { 7 | public: 8 | unsigned jump(unsigned char ch) const { return base() + ch; } 9 | unsigned value() const { return base(); } 10 | unsigned check_char() const { return data>>24; } 11 | unsigned to_uint() const { return data; } 12 | 13 | static const Node* from_uint_array(const unsigned* node_uints) 14 | { return reinterpret_cast<const Node*>(node_uints); } 15 | 16 | private: 17 | unsigned base() const { return data & 0xFFFFFF; } 18 | 19 | private: 20 | unsigned data; 21 | }; 22 | } 23 | } 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /ext/unf_ext/unf/trie/searcher.hh: 
-------------------------------------------------------------------------------- 1 | #ifndef UNF_TRIE_SEARCHER_HH 2 | #define UNF_TRIE_SEARCHER_HH 3 | 4 | #include "char_stream.hh" 5 | #include "node.hh" 6 | #include "../util.hh" 7 | 8 | namespace UNF { 9 | namespace Trie { 10 | class Searcher { 11 | public: 12 | Searcher(const Node* nodes, unsigned root, const char* value=NULL) 13 | : nodes(nodes), root(root), value(value) {} 14 | 15 | unsigned find_value(const char* key, int default_value) const { 16 | unsigned node_index=root; 17 | for(CharStream in(key);; in.read()) { 18 | node_index = nodes[node_index].jump(in.peek()); 19 | if(nodes[node_index].check_char()==in.peek()) { 20 | unsigned terminal_index = nodes[node_index].jump('\0'); 21 | if(nodes[terminal_index].check_char()=='\0') { 22 | return nodes[terminal_index].value(); 23 | } 24 | } else 25 | return default_value; 26 | } 27 | } 28 | 29 | protected: 30 | const Node* nodes; 31 | const unsigned root; 32 | const char* value; 33 | }; 34 | 35 | class CanonicalCombiningClass : private Searcher { 36 | public: 37 | CanonicalCombiningClass(const unsigned* node_uints, unsigned root) 38 | : Searcher(Node::from_uint_array(node_uints), root) {} 39 | 40 | unsigned get_class(const char* str) const { return find_value(str,0); } 41 | 42 | void sort(char* str, std::vector<unsigned char>& classes) const { 43 | CharStream in(str); 44 | unsigned sort_beg=0; 45 | unsigned sort_end=0; 46 | unsigned unicode_char_count=0; 47 | 48 | loop_head: 49 | unsigned beg = in.cur()-str; 50 | 51 | for(unsigned node_index=root;;){ 52 | node_index = nodes[node_index].jump(in.read()); 53 | 54 | if(nodes[node_index].check_char()==in.prev()) { 55 | unsigned terminal_index = nodes[node_index].jump('\0'); 56 | if(nodes[terminal_index].check_char()=='\0') { 57 | if((unicode_char_count++)==0) 58 | sort_beg = beg; 59 | sort_end = in.cur()-str; 60 | 61 | unsigned char klass = nodes[terminal_index].value(); 62 | for(unsigned i=beg; i < sort_end; i++) 63 | classes[i] = 
klass; 64 | break; 65 | } 66 | } else { 67 | if(unicode_char_count > 1) 68 | bubble_sort(str, classes, sort_beg, sort_end); 69 | unicode_char_count = 0; 70 | break; 71 | } 72 | } 73 | Util::eat_until_utf8_char_start_point(in); 74 | 75 | if(in.eos()==false) 76 | goto loop_head; 77 | 78 | if(unicode_char_count > 1) 79 | bubble_sort(str, classes, sort_beg, sort_end); 80 | } 81 | 82 | private: 83 | void bubble_sort(char* str, std::vector<unsigned char>& canonical_classes, unsigned beg, unsigned end) const { 84 | for(unsigned limit=beg, next=end; limit != next;) { 85 | limit = next; 86 | for(unsigned i=beg+1; i < limit; i++) 87 | if(canonical_classes[i-1] > canonical_classes[i]) { 88 | std::swap(canonical_classes[i-1], canonical_classes[i]); 89 | std::swap(str[i-1], str[i]); 90 | next = i; 91 | } 92 | } 93 | } 94 | }; 95 | 96 | class NormalizationForm : private Searcher { 97 | public: 98 | NormalizationForm(const unsigned* node_uints, unsigned root, const char* value=NULL) 99 | : Searcher(Node::from_uint_array(node_uints), root, value) {} 100 | 101 | bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; } 102 | 103 | void decompose(RangeCharStream in, std::string& buffer) const { 104 | loop_head: 105 | const char* beg = in.cur(); 106 | 107 | for(unsigned node_index=root;;) { 108 | node_index = nodes[node_index].jump(in.read()); 109 | if(nodes[node_index].check_char()==in.prev()) { 110 | unsigned terminal_index = nodes[node_index].jump('\0'); 111 | if(nodes[terminal_index].check_char()=='\0') { 112 | word_append(buffer, value, nodes[terminal_index].value()); 113 | beg = in.cur(); 114 | break; 115 | } 116 | } else { 117 | Util::eat_until_utf8_char_start_point(in); 118 | buffer.append(beg, in.cur()); 119 | break; 120 | } 121 | } 122 | 123 | if(in.eos()==false) 124 | goto loop_head; 125 | } 126 | 127 | void compose(CharStreamForComposition& in, std::string& buf) const { 128 | in.init_skipinfo(); 129 | 130 | const char* const beg = in.cur(); 131 | const 
char* current_char_head = in.cur(); 132 | unsigned composed_char_info = 0; 133 | 134 | unsigned node_index = root; 135 | unsigned retry_root_node = root; 136 | unsigned char retry_root_class = 0; 137 | 138 | for(bool first=true;;) { 139 | if(Util::is_utf8_char_start_byte(in.peek())) { 140 | if(node_index != root) 141 | first=false; 142 | current_char_head = in.cur(); 143 | 144 | retry_root_node = node_index; 145 | retry_root_class = in.get_canonical_class(); 146 | } 147 | 148 | retry: 149 | unsigned next_index = nodes[node_index].jump(in.peek()); 150 | if(nodes[next_index].check_char()==in.read()) { 151 | // succeeded 152 | node_index = next_index; 153 | unsigned terminal_index = nodes[node_index].jump('\0'); 154 | if(nodes[terminal_index].check_char()=='\0') { 155 | composed_char_info = nodes[terminal_index].value(); 156 | 157 | in.mark_as_last_valid_point(); 158 | if(in.eos() || retry_root_class > in.get_canonical_class()) 159 | break; 160 | } 161 | } else if (first==true) { 162 | // no retry if current point is a part of first starter 163 | break; 164 | } else if (in.next_combining_char(retry_root_class, current_char_head)==true) { 165 | // back previous code-point and retry 166 | node_index = retry_root_node; 167 | current_char_head = in.cur(); 168 | goto retry; 169 | } else { 170 | break; 171 | } 172 | } 173 | 174 | if(composed_char_info != 0) { 175 | // append composed unicode-character and skipped combining-characters 176 | word_append(buf, value, composed_char_info); 177 | in.append_skipped_chars_to_str(buf); 178 | in.reset_at_marked_point(); 179 | } else { 180 | // append one unicode-character 181 | in.setCur(Util::nearest_utf8_char_start_point(beg+1)); 182 | in.append_read_char_to_str(buf, beg); 183 | } 184 | } 185 | 186 | private: 187 | static void word_append(std::string& buffer, const char* base, unsigned pos_info) { 188 | buffer.append(base+(pos_info&0x3FFFF), pos_info>>18); 189 | } 190 | }; 191 | } 192 | } 193 | 194 | #endif 195 | 
-------------------------------------------------------------------------------- /ext/unf_ext/unf/util.hh: -------------------------------------------------------------------------------- 1 | #ifndef UNF_UTIL_HH 2 | #define UNF_UTIL_HH 3 | 4 | namespace UNF { 5 | namespace Util { 6 | inline bool is_utf8_char_start_byte(char byte) { 7 | if(!(byte&0x80)) return true; // ascii 8 | else if (byte&0x40) return true; // start of a UTF-8 character byte sequence 9 | return false; 10 | } 11 | 12 | inline const char* nearest_utf8_char_start_point(const char* s) { 13 | for(; is_utf8_char_start_byte(*s)==false; s++); 14 | return s; 15 | } 16 | 17 | template<class CharStream> // any stream type providing peek() and read() 18 | inline void eat_until_utf8_char_start_point(CharStream& in) { 19 | for(; is_utf8_char_start_byte(in.peek())==false; in.read()); 20 | } 21 | } 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /lib/unf_ext.rb: -------------------------------------------------------------------------------- 1 | begin 2 | require "#{RUBY_VERSION[/\A[0-9]+\.[0-9]+/]}/unf_ext.so" 3 | rescue LoadError 4 | require "unf_ext.so" 5 | end 6 | -------------------------------------------------------------------------------- /lib/unf_ext/version.rb: -------------------------------------------------------------------------------- 1 | module UNF 2 | class Normalizer 3 | VERSION = "0.0.9.1" 4 | end 5 | end 6 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'bundler' 3 | begin 4 | Bundler.setup(:default, :development) 5 | rescue Bundler::BundlerError => e 6 | $stderr.puts e.message 7 | $stderr.puts "Run `bundle install` to install missing gems" 8 | exit e.status_code 9 | end 10 | require 'test/unit' 11 | 12 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 13 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), 
'..')) 14 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 15 | require 'unf_ext' 16 | 17 | class Test::Unit::TestCase 18 | end 19 | -------------------------------------------------------------------------------- /test/test_unf_ext.rb: -------------------------------------------------------------------------------- 1 | require 'helper' 2 | require 'pathname' 3 | 4 | class TestUnf < Test::Unit::TestCase 5 | test "raise ArgumentError if an unknown normalization form is given" do 6 | normalizer = UNF::Normalizer.new 7 | assert_raises(ArgumentError) { normalizer.normalize("が", :nfck) } 8 | end 9 | 10 | test "pass all tests bundled with the original unf" do 11 | normalizer = UNF::Normalizer.new 12 | open(Pathname(__FILE__).dirname + 'normalization-test.txt', 'r:utf-8').each_slice(6) { |lines| 13 | flunk "broken test file" if lines.size != 6 || lines.pop !~ /^$/ 14 | str, nfc, nfd, nfkc, nfkd = lines 15 | assert_equal nfd, normalizer.normalize(str, :nfd) 16 | assert_equal nfd, normalizer.normalize(nfd, :nfd) 17 | assert_equal nfd, normalizer.normalize(nfc, :nfd) 18 | assert_equal nfkd, normalizer.normalize(nfkc, :nfd) 19 | assert_equal nfkd, normalizer.normalize(nfkc, :nfd) 20 | 21 | assert_equal nfc, normalizer.normalize(str, :nfc) 22 | assert_equal nfc, normalizer.normalize(nfd, :nfc) 23 | assert_equal nfc, normalizer.normalize(nfc, :nfc) 24 | assert_equal nfkc, normalizer.normalize(nfkc, :nfc) 25 | assert_equal nfkc, normalizer.normalize(nfkd, :nfc) 26 | 27 | assert_equal nfkd, normalizer.normalize(str, :nfkd) 28 | assert_equal nfkd, normalizer.normalize(nfd, :nfkd) 29 | assert_equal nfkd, normalizer.normalize(nfc, :nfkd) 30 | assert_equal nfkd, normalizer.normalize(nfkc, :nfkd) 31 | assert_equal nfkd, normalizer.normalize(nfkd, :nfkd) 32 | 33 | assert_equal nfkc, normalizer.normalize(str, :nfkc) 34 | assert_equal nfkc, normalizer.normalize(nfd, :nfkc) 35 | assert_equal nfkc, normalizer.normalize(nfc, :nfkc) 36 | assert_equal nfkc, normalizer.normalize(nfkc, :nfkc) 37 | 
assert_equal nfkc, normalizer.normalize(nfkd, :nfkc) 38 | } 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /unf_ext.gemspec: -------------------------------------------------------------------------------- 1 | lib = File.expand_path('../lib', __FILE__) 2 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 3 | require 'unf_ext/version' 4 | 5 | Gem::Specification.new do |gem| 6 | gem.name = "unf_ext" 7 | gem.version = UNF::Normalizer::VERSION 8 | gem.authors = ["Takeru Ohta", "Akinori MUSHA"] 9 | gem.email = ["knu@idaemons.org"] 10 | gem.description = %q{Unicode Normalization Form support library for CRuby} 11 | gem.summary = %q{Unicode Normalization Form support library for CRuby} 12 | gem.homepage = "https://github.com/knu/ruby-unf_ext" 13 | gem.licenses = ["MIT"] 14 | 15 | gem.files = `git ls-files`.split($/) 16 | gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } 17 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}).grep(%r{/test_[^/]+\.rb$}) 18 | gem.require_paths = ["lib"] 19 | gem.extensions = ["ext/unf_ext/extconf.rb"] 20 | 21 | gem.extra_rdoc_files = [ 22 | "LICENSE.txt", 23 | "README.md" 24 | ] 25 | 26 | gem.required_ruby_version = '>= 2.2' 27 | 28 | gem.add_development_dependency("rake", [">= 0.9.2.2"]) 29 | gem.add_development_dependency("test-unit") 30 | gem.add_development_dependency("rdoc", ["> 2.4.2"]) 31 | gem.add_development_dependency("bundler", [">= 1.2"]) 32 | gem.add_development_dependency("rake-compiler", [">= 1.2.1"]) 33 | gem.add_development_dependency("rake-compiler-dock", [">= 1.3.0"]) 34 | end 35 | --------------------------------------------------------------------------------