├── .document
├── .github
    └── workflows
    │   └── unf_ext.yml
├── .gitignore
├── CHANGELOG.md
├── Gemfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── ext
    └── unf_ext
    │   ├── extconf.rb
    │   ├── unf.cc
    │   └── unf
    │       ├── normalizer.hh
    │       ├── table.hh
    │       ├── trie
    │           ├── char_stream.hh
    │           ├── node.hh
    │           └── searcher.hh
    │       └── util.hh
├── lib
    ├── unf_ext.rb
    └── unf_ext
    │   └── version.rb
├── test
    ├── helper.rb
    ├── normalization-test.txt
    └── test_unf_ext.rb
└── unf_ext.gemspec


/.document:
--------------------------------------------------------------------------------
1 | lib/**/*.rb
2 | bin/*
3 | - 
4 | features/**/*.feature
5 | LICENSE.txt
6 | 


--------------------------------------------------------------------------------
/.github/workflows/unf_ext.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   build:
 7 |     name: >-
 8 |       ${{ matrix.os }} ${{ matrix.ruby }}
 9 | 
10 |     runs-on: ${{ matrix.os }}
11 |     strategy:
12 |       fail-fast: false
13 |       matrix:
14 |         os: [ ubuntu-22.04, macos-12, windows-2022 ]
15 |         ruby: [ 2.7, "3.0", 3.1, 3.2, head ]
16 |         include:
17 |           - { os: windows-2022, ruby: ucrt  }
18 |           - { os: windows-2022, ruby: mswin }
19 |         exclude:
20 |           - { os: windows-2022, ruby: head }
21 | 
22 |     steps:
23 |       - name: repo checkout
24 |         uses: actions/checkout@v4
25 | 
26 |       - uses: ruby/setup-ruby@v1
27 |         with:
28 |           ruby-version: ${{ matrix.ruby }}
29 |           bundler-cache: true
30 | 
31 |       - name: compile
32 |         timeout-minutes: 5
33 |         run: bundle exec rake compile
34 | 
35 |       - name: test
36 |         timeout-minutes: 5
37 |         run: bundle exec rake test
38 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.gem
 2 | *.rbc
 3 | *.so
 4 | *.bundle
 5 | .config
 6 | .yardoc
 7 | Gemfile.lock
 8 | InstalledFiles
 9 | _yardoc
10 | coverage
11 | doc/
12 | lib/bundler/man
13 | ext/*/Makefile
14 | pkg
15 | rdoc
16 | spec/reports
17 | test/tmp
18 | test/version_tmp
19 | tmp
20 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## [0.0.9.1](https://github.com/knu/ruby-unf_ext/tree/v0.0.9.1) (2023-11-16)
 4 | 
 5 | [Full Changelog](https://github.com/knu/ruby-unf_ext/compare/v0.0.9...v0.0.9.1)
 6 | 
 7 | - Fix use of designated initializers. [\#75](https://github.com/knu/ruby-unf_ext/pull/75) ([igorpeshansky](https://github.com/igorpeshansky))
 8 | 
 9 | ## [0.0.9](https://github.com/knu/ruby-unf_ext/tree/v0.0.9) (2023-11-11)
10 | 
11 | [Full Changelog](https://github.com/knu/ruby-unf_ext/compare/v0.0.8.2...v0.0.9)
12 | 
13 | - Use the newer TypedData extension API [\#72](https://github.com/knu/ruby-unf_ext/pull/72) ([casperisfine](https://github.com/casperisfine))
14 | - Update Actions - updates OS's, add Ruby 3.2, etc [\#71](https://github.com/knu/ruby-unf_ext/pull/71) ([MSP-Greg](https://github.com/MSP-Greg))
15 | - Add cross compilation for Ruby 3.2 [\#69](https://github.com/knu/ruby-unf_ext/pull/69) ([johnnyshields](https://github.com/johnnyshields))
16 | 
17 | ## 0.0.8.2 (2022-05-26)
18 | 
19 | - Add x64-mingw-ucrt native gem support for RubyInstaller 3.1.
20 | 
21 | ## 0.0.8.1 (2022-03-13)
22 | 
23 | - Include Windows binaries for Ruby 3.1. (FAIL)
24 | 
25 | ## 0.0.8 (2021-09-14)
26 | 
27 | - No functional change in the library code.
28 | - Include Windows binaries for Ruby 3.0.
29 | - Drop support for Ruby 2.1 and earlier.
30 | - Replace Travis CI with GitHub Actions.
31 | - Fix cross-build after upgrading rake-compiler/rake-compiler-dock to 1.1.1/1.1.0.
32 | 
33 | ## 0.0.7.7 (2020-03-30)
34 | 
35 | - Include Windows binaries for Ruby 2.7.
36 | 
37 | ## 0.0.7.6 (2019-03-19)
38 | 
39 | - Include Windows binaries for Ruby 2.6.
40 | 
41 | ## 0.0.7.5 (2018-02-06)
42 | 
43 | - Include Windows binaries for Ruby 2.5.
44 | 
45 | ## 0.0.7.4 (2017-04-19)
46 | 
47 | - Fix build on ARM and GCC 6 again.
48 | 
49 | ## 0.0.7.3 (2017-04-11)
50 | 
51 | - Update the base Unicode version to 9.
52 | 
53 | - Fix compile issues on ARM and GCC 6.
54 | 
55 | ## 0.0.7.2 (2016-02-01)
56 | 
57 | - Include Windows binaries for Ruby 2.3.
58 | 
59 | ## 0.0.7.1 (2015-04-18)
60 | 
61 | - Windows fat binary gems no longer require libstdc++ to
62 |   run; it is now statically linked.
63 | 
64 | - Add a fat binary gem for x64-mingw32 (64bit Windows).
65 | 
66 | - Windows fat binary gems now include binaries for Ruby up to 2.2.
67 | 
68 | ## 0.0.6 (2013-02-16)
69 | 
70 | - Migrate from Jeweler to Bundler.
71 | 
72 | ## 0.0.5 (2012-05-30)
73 | 
74 | - Fix a type error for strict compilers.
75 | 
76 | ## 0.0.4 (2011-12-08)
77 | 
78 | - Release under the current name of `unf_ext`.
79 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | 
3 | # Specify your gem's dependencies in ruby-unf_ext.gemspec
4 | gemspec
5 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | Copyright (c) 2010 Takeru Ohta <phjgt308@gmail.com>
 4 | Copyright (c) 2011-2018 Akinori MUSHA <knu@idaemons.org> (extended Ruby support)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ruby-unf_ext
 2 | ============
 3 | 
 4 | Synopsis
 5 | --------
 6 | 
 7 | * Unicode Normalization Form support library for CRuby
 8 | 
 9 | Description
10 | -----------
11 | 
12 | * Normalizes UTF-8 strings into and from NFC, NFD, NFKC or NFKD
13 | 
14 |         # For bulk conversion
15 |         normalizer = UNF::Normalizer.new
16 |         a_bunch_of_strings.map! { |string|
17 |           normalizer.normalize(string, :nfc) #=> string in NFC
18 |         }
19 | 
20 | * Compliant with Unicode 9.0
21 | 
22 | Requirement
23 | -----------
24 | 
25 | * Ruby 2.2+
26 | 
27 | * C++ compiler and libstdc++
28 | 
29 | Installation
30 | ------------
31 | 
32 | 	gem install unf_ext
33 | 
34 | Or:
35 | 
36 |     ruby extconf.rb && make && make install
37 | 
38 | Development Resources
39 | ---------------------
40 | 
41 | * https://github.com/sile/unf
42 | 
43 |     For issues regarding files under the directory `unf`, please
44 |     contact this upstream.
45 | 
46 | * https://github.com/knu/ruby-unf_ext
47 | 
48 |     The development site and the repository.
49 | 
50 | License
51 | -------
52 | 
53 | Copyright (c) 2010-2017 Takeru Ohta
54 | Copyright (c) 2011-2018 Akinori MUSHA
55 | 
56 | Licensed under the MIT license.
57 | See `LICENSE.txt` for details.
58 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require 'bundler/gem_tasks'
 2 | 
 3 | gemspec = Bundler::GemHelper.gemspec
 4 | 
 5 | native_platforms = %w[
 6 |   x86-mingw32
 7 |   x64-mingw32
 8 |   x64-mingw-ucrt
 9 | ]
10 | 
11 | require 'rake/extensiontask'
12 | Rake::ExtensionTask.new('unf_ext', gemspec) do |ext|
13 |   ext.cross_compile = true
14 |   ext.cross_platform = native_platforms
15 |   ext.cross_config_options << '--with-ldflags="-static-libgcc"' << '--with-static-libstdc++'
16 | end
17 | 
18 | namespace :gem do
19 |   task :native do
20 |     require 'rake_compiler_dock'
21 |     sh 'bundle package --all'
22 |     native_platforms.each do |plat|
23 |       RakeCompilerDock.sh "bundle --local && rake native:#{plat} gem", platform: plat
24 |     end
25 |   end
26 | end
27 | 
28 | task :gems => %i[build gem:native]
29 | 
30 | require 'rake/testtask'
31 | Rake::TestTask.new(:test) do |test|
32 |   test.libs << 'test'
33 |   test.test_files = gemspec.test_files
34 |   test.verbose = true
35 | end
36 | 
37 | task :default => :test
38 | 


--------------------------------------------------------------------------------
/ext/unf_ext/extconf.rb:
--------------------------------------------------------------------------------
 1 | require 'mkmf'
 2 | 
 3 | if with_config('static-libstdc++') # set by --with-static-libstdc++ (the Rakefile passes it for cross builds)
 4 |   $LDFLAGS << ' ' << `#{CONFIG['CC']} -print-file-name=libstdc++.a`.chomp # link the libstdc++.a path reported by the C compiler
 5 | else
 6 |   have_library('stdc++')
 7 | 
 8 |   case RbConfig::CONFIG['host_os'] # platform-specific compiler flags
 9 |   when /mswin/
10 |     # Avoid `error C7555: use of designated initializers requires at least '/std:c++20'`
11 |     $CFLAGS << ' ' << '-std:c++20'
12 |     $CPPFLAGS << ' ' << '-std:c++20'
13 |     $CXXFLAGS << ' ' << '-std:c++20'
14 |   when /solaris(!?2.11)/ # NOTE(review): "(!?2.11)" is a plain group (optional "!", "2", any char, "11"), not a lookahead - confirm intent
15 |     # Do a little trickery here to enable C++ standard on Solaris 11 if found.
16 |     # This also forces 64bit compilation mode.
17 |     $CXX = CONFIG['CXX']
18 |     $CXX << ' ' << '-m64'
19 |     $CFLAGS = CONFIG['CFLAGS'].gsub(/-std=c99/, '') # drop the C-only -std=c99 before adding -std=c++11
20 |     $CFLAGS << ' ' << '-m64 -std=c++11'
21 |     $CPPFLAGS = CONFIG['CFLAGS'].gsub(/-std=c99/, '') # NOTE(review): built from CONFIG['CFLAGS'], not CPPFLAGS - confirm
22 |     $CPPFLAGS << ' ' << '-m64 -std=c++11'
23 |     $CXXFLAGS = CONFIG['CFLAGS'].gsub(/-std=c99/, '') # NOTE(review): built from CONFIG['CFLAGS'], not CXXFLAGS - confirm
24 |     $CXXFLAGS << ' ' << '-m64 -std=c++11'
25 |   when /aix/
26 |     # Compiler flags necessary on AIX.
27 |     # rubocop:disable Style/GlobalVars
28 |     $CFLAGS << ' ' << '-D_ALL_SOURCE=1'
29 |     $CPPFLAGS << ' ' << '-D_ALL_SOURCE=1'
30 |     $CXXFLAGS << ' ' << '-D_ALL_SOURCE=1'
31 |   end
32 | end
33 | 
34 | create_makefile 'unf_ext'
35 | 
36 | unless CONFIG['CXX']
37 |   case CONFIG['CC']
38 |   when %r{((?:.*[-/])?)gcc([-0-9.]*)$}
39 |     cxx = $1 + 'g++' + $2
40 |   when %r{((?:.*[-/])?)clang([-0-9.]*)$}
41 |     cxx = $1 + 'clang++' + $2
42 |   else
43 |     cxx = CONFIG['CC']
44 |   end
45 | 
46 |   warn "CXX is automatically set to #{cxx}"
47 | 
48 |   new_mf = <<-EOF << File.read('Makefile')
49 | CXX=#{cxx}
50 |   EOF
51 | 
52 |   File.open('Makefile', 'w') { |mf|
53 |     mf.print new_mf
54 |   }
55 | end
56 | 


--------------------------------------------------------------------------------
/ext/unf_ext/unf.cc:
--------------------------------------------------------------------------------
 1 | #include "unf/normalizer.hh"
 2 | 
 3 | #include <ruby.h>
 4 | #if defined(HAVE_RUBY_ENCODING_H)
 5 | #include <ruby/encoding.h>
 6 | #endif
 7 | 
 8 | extern "C" {
 9 |   VALUE unf_allocate(VALUE klass);
10 |   VALUE unf_initialize(VALUE self);
11 |   void unf_delete(void* ptr);
12 |   VALUE unf_normalize(VALUE self, VALUE source, VALUE normalization_form);
13 | 
14 |   ID FORM_NFD;
15 |   ID FORM_NFC;
16 |   ID FORM_NFKD;
17 |   ID FORM_NFKC;
18 | 
19 |   void Init_unf_ext() {
20 |     VALUE mdl = rb_define_module("UNF");
21 | 
22 |     VALUE cls = rb_define_class_under(mdl, "Normalizer", rb_cObject);
23 |     rb_define_alloc_func(cls, unf_allocate);
24 |     rb_define_method(cls, "initialize", (VALUE (*)(...))unf_initialize, 0);
25 |     rb_define_method(cls, "normalize", (VALUE (*)(...))unf_normalize, 2);
26 | 
27 |     FORM_NFD = rb_intern("nfd");
28 |     FORM_NFC = rb_intern("nfc");
29 |     FORM_NFKD= rb_intern("nfkd");
30 |     FORM_NFKC= rb_intern("nfkc");
31 |   }
32 | 
33 |   static const rb_data_type_t unf_normalizer_data_type = {
34 |     .wrap_struct_name = "UNF::Normalizer",
35 |     .function = {
36 |       .dmark = NULL,
37 |       .dfree = unf_delete,
38 |       .dsize = NULL
39 |     },
40 |     .parent = NULL,
41 |     .data = NULL,
42 |     .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
43 |   };
44 | 
45 |   VALUE unf_allocate(VALUE klass) { // allocator registered through rb_define_alloc_func in Init_unf_ext
46 |     UNF::Normalizer* ptr;
47 |     VALUE obj = TypedData_Make_Struct(klass, UNF::Normalizer, &unf_normalizer_data_type, ptr); // wrap storage for one Normalizer; ptr receives that storage
48 |     new ((void*)ptr) UNF::Normalizer; // placement new: run the constructor inside the storage obtained above
49 |     return obj;
50 |   }
51 | 
52 |   VALUE unf_initialize(VALUE self) {
53 |     return self;
54 |   }
55 | 
56 |   void unf_delete(void *data) { // dfree callback of unf_normalizer_data_type
57 |     UNF::Normalizer* ptr = (UNF::Normalizer*)data;
58 |     ptr->~Normalizer(); // explicit destructor call, the counterpart of the placement new in unf_allocate
59 |     ruby_xfree(ptr); // then release the storage itself
60 |   }
61 | 
62 |   VALUE unf_normalize(VALUE self, VALUE source, VALUE normalization_form) { // implements UNF::Normalizer#normalize (registered with 2 arguments in Init_unf_ext)
63 |     UNF::Normalizer* ptr;
64 |     TypedData_Get_Struct(self, UNF::Normalizer, &unf_normalizer_data_type, ptr); // fetch the C++ object wrapped by self
65 | 
66 |     const char* src = StringValueCStr(source); // NUL-terminated C string for source
67 |     const char* rlt; // result; Normalizer hands back either src itself or its own internal buffer
68 |     ID form_id = SYM2ID(normalization_form); // compared below with the IDs interned in Init_unf_ext
69 | 
70 |     if(form_id == FORM_NFD)
71 |       rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFD);
72 |     else if(form_id == FORM_NFC)
73 |       rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFC);
74 |     else if(form_id == FORM_NFKD)
75 |       rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFKD);
76 |     else if(form_id == FORM_NFKC)
77 |       rlt = ptr->normalize(src, UNF::Normalizer::FORM_NFKC);
78 |     else
79 |       rb_raise(rb_eArgError, "Specified Normalization-Form is unknown. Please select one from among :nfc, :nfd, :nfkc, :nfkd."); // any other form is rejected with ArgumentError
80 | 
81 | #if defined(HAVE_RUBY_ENCODING_H)
82 |     return rb_enc_str_new(rlt, strlen(rlt), rb_utf8_encoding()); // new Ruby string tagged as UTF-8
83 | #else
84 |     return rb_str_new2(rlt);
85 | #endif
86 |   }
87 | }
88 | 


--------------------------------------------------------------------------------
/ext/unf_ext/unf/normalizer.hh:
--------------------------------------------------------------------------------
  1 | #ifndef UNF_NORMALIZER_HH
  2 | #define UNF_NORMALIZER_HH
  3 | 
  4 | #include <vector>
  5 | #include <string>
  6 | #include <algorithm>
  7 | #include <cstring>
  8 | #include "trie/searcher.hh"
  9 | #include "trie/char_stream.hh"
 10 | #include "table.hh"
 11 | #include "util.hh"
 12 | 
 13 | namespace UNF {
 14 |   class Normalizer {
 15 |   public:
 16 |     enum Form { FORM_NFD, FORM_NFC, FORM_NFKD, FORM_NFKC };
 17 | 
 18 |   public:
 19 |     Normalizer()
 20 |       : nf_d(TABLE::NODES, TABLE::CANONICAL_DECOM_ROOT, (const char *)TABLE::STRINGS),
 21 | 	nf_kd(TABLE::NODES, TABLE::COMPATIBILITY_DECOM_ROOT, (const char *)TABLE::STRINGS),
 22 | 	nf_c(TABLE::NODES, TABLE::CANONICAL_COM_ROOT, (const char *)TABLE::STRINGS),
 23 | 	nf_c_qc(TABLE::NODES, TABLE::NFC_ILLEGAL_ROOT),
 24 | 	nf_kc_qc(TABLE::NODES, TABLE::NFKC_ILLEGAL_ROOT),
 25 | 	ccc(TABLE::NODES, TABLE::CANONICAL_CLASS_ROOT)
 26 |     {}
 27 | 
 28 |     const char* normalize(const char* src, Form form) {
 29 |       switch(form) {
 30 |       case FORM_NFD:  return nfd(src);
 31 |       case FORM_NFC:  return nfc(src);
 32 |       case FORM_NFKD: return nfkd(src);
 33 |       case FORM_NFKC: return nfkc(src);
 34 |       default:        return src;
 35 |       }
 36 |     }
 37 |     const char* nfd(const char* src)  { return decompose(src, nf_d); }
 38 |     const char* nfkd(const char* src) { return decompose(src, nf_kd); }
 39 |     const char* nfc(const char* src)  { return compose(src, nf_c_qc, nf_d); }
 40 |     const char* nfkc(const char* src) { return compose(src, nf_kc_qc, nf_kd); }
 41 | 
 42 |   private:
 43 |     const char* decompose(const char* src, const Trie::NormalizationForm& nf) {
 44 |       const char* beg = next_invalid_char(src, nf);
 45 |       if(*beg=='\0')
 46 | 	return src;
 47 |       
 48 |       buffer.assign(src, beg);
 49 |       do {
 50 | 	const char* end = next_valid_starter(beg, nf);
 51 | 	decompose_one(beg, end, nf, buffer);
 52 | 	beg = next_invalid_char(end, nf);
 53 | 	buffer.append(end, beg);
 54 |       } while(*beg!='\0');
 55 |       
 56 |       return buffer.c_str();      
 57 |     }
 58 | 
 59 |     void decompose_one(const char* beg, const char* end, const Trie::NormalizationForm& nf, std::string& buf) {
 60 |       unsigned last = buf.size();
 61 |       nf.decompose(Trie::RangeCharStream(beg,end), buf);
 62 |       char* bufbeg = const_cast<char*>(buf.data());
 63 |       canonical_combining_class_ordering(bufbeg+last, bufbeg+buf.size());
 64 |     }
 65 | 
 66 |     const char* compose(const char* src, const Trie::NormalizationForm& nf, const Trie::NormalizationForm& nf_decomp) {
 67 |       const char* beg = next_invalid_char(src, nf);
 68 |       if(*beg=='\0')
 69 | 	return src;
 70 |       
 71 |       buffer.assign(src, beg);
 72 |       while(*beg!='\0') {
 73 | 	const char* end = next_valid_starter(beg, nf);
 74 | 	buffer2.clear();
 75 | 	decompose_one(beg, end, nf_decomp, buffer2);
 76 | 	end = compose_one(buffer2.c_str(), end, buffer);
 77 | 	beg = next_invalid_char(end, nf);
 78 | 	buffer.append(end, beg);
 79 |       }
 80 | 
 81 |       return buffer.c_str();      
 82 |     }
 83 | 
 84 |     const char* compose_one(const char* starter, const char* rest_starter, std::string& buf) {
 85 |       Trie::CharStreamForComposition in(starter, rest_starter, canonical_classes, buffer3);
 86 |       while(in.within_first())
 87 | 	nf_c.compose(in, buf);
 88 |       return in.cur();
 89 |     }
 90 | 
 91 |     void canonical_combining_class_ordering(char* beg, const char* end) {
 92 |       canonical_classes.assign(end-beg+1, 0); // +1 is for sentinel value
 93 |       ccc.sort(beg, canonical_classes);
 94 |     }
 95 | 
 96 |     const char* next_invalid_char(const char* src, const Trie::NormalizationForm& nf) const { // returns the last starter seen before the first offending character, or the terminating NUL if none offends
 97 |       int last_canonical_class = 0;
 98 |       const char* cur = Util::nearest_utf8_char_start_point(src);
 99 |       const char* starter = cur; // most recent character whose combining class is 0 (initially the first character)
100 |       
101 |       for(; *cur != '\0'; cur = Util::nearest_utf8_char_start_point(cur+1)) { // step character by character
102 | 	int canonical_class = ccc.get_class(cur);
103 | 	if(last_canonical_class > canonical_class && canonical_class != 0) // non-zero class lower than the previous one: marks are out of order
104 | 	  return starter;
105 | 
106 | 	if(nf.quick_check(cur)==false) // quick_check is false when nf's trie holds an entry starting here
107 | 	  return starter;
108 | 
109 | 	if(canonical_class==0) // class 0: remember this character as the latest starter
110 | 	  starter=cur;
111 | 
112 | 	last_canonical_class = canonical_class;
113 |       }
114 |       return cur; // reached the NUL without finding anything to fix
115 |     }
116 | 
117 |     const char* next_valid_starter(const char* src, const Trie::NormalizationForm& nf) const {
118 |       const char* cur = Util::nearest_utf8_char_start_point(src+1);
119 |       while(ccc.get_class(cur)!=0 || nf.quick_check(cur)==false)
120 | 	cur = Util::nearest_utf8_char_start_point(cur+1);
121 |       return cur;
122 |     }
123 | 
124 |   private:
125 |     const Trie::NormalizationForm nf_d;
126 |     const Trie::NormalizationForm nf_kd;
127 |     const Trie::NormalizationForm nf_c;
128 |     const Trie::NormalizationForm nf_c_qc;
129 |     const Trie::NormalizationForm nf_kc_qc;
130 |     const Trie::CanonicalCombiningClass ccc;
131 |     
132 |     std::string buffer;
133 |     std::string buffer2;
134 |     std::string buffer3;
135 |     std::vector<unsigned char> canonical_classes;
136 |   };
137 | }
138 | 
139 | #endif
140 | 


--------------------------------------------------------------------------------
/ext/unf_ext/unf/trie/char_stream.hh:
--------------------------------------------------------------------------------
  1 | #ifndef UNF_TRIE_CHAR_STREAM_HH
  2 | #define UNF_TRIE_CHAR_STREAM_HH
  3 | 
  4 | #include <vector>
  5 | #include <string>
  6 | #include "../util.hh"
  7 | 
  8 | namespace UNF {
  9 |   namespace Trie {
 10 |     class CharStream { // forward cursor over a NUL-terminated byte string
 11 |     public:
 12 |       CharStream(const char* str) : cur_(str) {}
 13 |       unsigned char read() { return eos() ? '\0' : *cur_++; } // current byte, then advance; at the NUL it returns '\0' without advancing
 14 |       unsigned char prev() const { return cur_[-1]; } // byte just before the cursor; no bounds check
 15 |       unsigned char peek() const { return *cur_; } // current byte, cursor unchanged
 16 |       const char*   cur() const { return cur_; }
 17 |       bool          eos() const { return *cur_ == '\0'; } // end of stream is the terminating NUL
 18 |       void          setCur(const char* new_cur) { cur_ = new_cur; } // reposition the cursor; the pointer is not validated
 19 | 
 20 |     private:
 21 |       const char* cur_;
 22 |     };
 23 | 
 24 |     class RangeCharStream {
 25 |     public:
 26 |       RangeCharStream(const char* beg, const char* end) : cur_(beg), end_(end) {}
 27 |       unsigned char read() { return eos() ? '\0' : *cur_++; }
 28 |       unsigned char prev() const { return cur_[-1]; }
 29 |       unsigned char peek() const { return *cur_; } 
 30 |       const char*   cur() const { return cur_; }
 31 |       const char*   end() const { return end_; }
 32 |       bool          eos() const { return cur_ == end_; }
 33 | 
 34 |     private:
 35 |       const char* cur_;
 36 |       const char* end_;
 37 |     };
 38 | 
 39 |     class CompoundCharStream {
 40 |     public:
 41 |       CompoundCharStream(const char* first, const char* second) 
 42 | 	: beg1(first), beg2(second), cur1(beg1), cur2(beg2) {}
 43 | 
 44 |       unsigned char read() { return !eos1() ? read1() : read2(); }
 45 |       unsigned char peek() const { return !eos1() ? *cur1 : *cur2; }
 46 |       unsigned char prev() const { return !eos1() || beg2==cur2 ? cur1[-1] : cur2[-1]; }
 47 | 
 48 |       const char* cur() const { return !eos1() ? cur1 : cur2; }
 49 |       bool eos() const { return eos1() && eos2(); }
 50 |       bool within_first() const { return !eos1(); }
 51 | 
 52 |       unsigned offset() const { return cur1-beg1 + cur2-beg2; }
 53 |       void setCur(const char* p) { 
 54 | 	if(beg1 <= p && p <= cur1) {
 55 | 	  cur1=p;
 56 | 	  cur2=beg2;
 57 | 	} else {
 58 | 	  cur2=p;
 59 | 	}
 60 |       }
 61 | 
 62 |     protected:
 63 |       unsigned char read1() { return eos1() ? '\0' : *cur1++; }
 64 |       unsigned char read2() { return eos2() ? '\0' : *cur2++; }
 65 |       bool eos1() const { return *cur1=='\0'; }
 66 |       bool eos2() const { return *cur2=='\0'; }
 67 |       
 68 |     protected:
 69 |       const char* beg1;
 70 |       const char* beg2;
 71 |       const char* cur1;
 72 |       const char* cur2;
 73 |     };
 74 | 
 75 |     class CharStreamForComposition : public CompoundCharStream {
 76 |     public:
 77 |       CharStreamForComposition (const char* first, const char* second, // two NUL-terminated segments, read one after the other
 78 | 				const std::vector<unsigned char>& canonical_classes, // indexed by offset() in get_canonical_class()
 79 | 				std::string& buf) // caller-owned scratch string that collects skipped bytes
 80 | 	: CompoundCharStream(first, second), classes(canonical_classes), skipped(buf),
 81 | 	  skipped_tail(0), marked_point(first) {} // defined start values until init_skipinfo() / mark_as_last_valid_point() run
 82 |       
 83 |       void init_skipinfo() { 
 84 | 	skipped.clear();
 85 | 	skipped_tail = 0;
 86 |       }
 87 | 
 88 |       void mark_as_last_valid_point() {
 89 | 	skipped_tail = skipped.size();
 90 | 	marked_point = cur();
 91 |       }
 92 | 
 93 |       void reset_at_marked_point() {
 94 | 	setCur(marked_point);
 95 |       }
 96 | 
 97 |       void append_read_char_to_str(std::string& s, const char* beg) const {
 98 | 	if(eos1()==false) {
 99 | 	  s.append(beg, cur());
100 | 	} else {
101 | 	  s.append(beg,  cur1);
102 | 	  s.append(beg2, cur());
103 | 	}
104 |       }
105 | 
106 |       void append_skipped_chars_to_str(std::string& s) const {
107 | 	s.append(skipped.begin(), skipped.begin()+skipped_tail);
108 |       }
109 | 
110 |       unsigned char get_canonical_class() const { 
111 | 	return offset() < classes.size() ? classes[offset()] : 0;
112 |       }
113 |       
114 |       bool next_combining_char(unsigned char prev_class, const char* ppp) {
115 | 	while(Util::is_utf8_char_start_byte(peek()) == false)
116 | 	  read();
117 | 	
118 | 	unsigned char mid_class = get_prev_canonical_class();
119 | 	unsigned char cur_class = get_canonical_class();
120 | 	
121 | 	if(prev_class==0 && mid_class==0 && cur_class!=0)
122 | 	  return false;
123 | 
124 | 	if(prev_class < cur_class && mid_class < cur_class) {
125 | 	  skipped.append(ppp, cur());
126 | 	  return true;
127 | 	} else {
128 | 	  if(cur_class != 0) {
129 | 	    read();
130 | 	    return next_combining_char(prev_class,ppp);
131 | 	  }
132 | 	  return false;
133 | 	}
134 |       }
135 | 
136 |     private:
137 |       unsigned char get_prev_canonical_class() const { 
138 | 	return offset()-1 < classes.size() ? classes[offset()-1] : 0;
139 |       }
140 | 
141 |     private:
142 |       const std::vector<unsigned char>& classes;
143 |       std::string& skipped;
144 |       unsigned skipped_tail;
145 |       const char* marked_point;
146 |     };
147 |   }
148 | }
149 | 
150 | #endif
151 | 


--------------------------------------------------------------------------------
/ext/unf_ext/unf/trie/node.hh:
--------------------------------------------------------------------------------
 1 | #ifndef UNF_TRIE_NODE_HH
 2 | #define UNF_TRIE_NODE_HH
 3 | 
 4 | namespace UNF {
 5 |   namespace Trie {
 6 |     class Node { // one trie cell packed into a single unsigned: low 24 bits = base, bits from 24 upward = check character
 7 |     public:
 8 |       unsigned jump(unsigned char ch) const { return base() + ch; } // index of the cell reached by following byte ch
 9 |       unsigned value() const { return base(); } // same low 24 bits, read as a payload by the searchers on terminal cells
10 |       unsigned check_char() const { return data>>24; } // byte this cell was stored for; callers compare it with the byte they followed
11 |       unsigned to_uint() const { return data; } // raw packed word
12 | 
13 |       static const Node* from_uint_array(const unsigned* node_uints) // view an array of unsigned as an array of Node (Node's only member is one unsigned)
14 |       { return reinterpret_cast<const Node*>(node_uints); }
15 | 
16 |     private:
17 |       unsigned base() const { return data & 0xFFFFFF; } // mask off everything above bit 23
18 | 
19 |     private:
20 |       unsigned data;
21 |     };
22 |   }
23 | }
24 | 
25 | #endif
26 | 


--------------------------------------------------------------------------------
/ext/unf_ext/unf/trie/searcher.hh:
--------------------------------------------------------------------------------
  1 | #ifndef UNF_TRIE_SEARCHER_HH
  2 | #define UNF_TRIE_SEARCHER_HH
  3 | 
  4 | #include "char_stream.hh"
  5 | #include "node.hh"
  6 | #include "../util.hh"
  7 | 
  8 | namespace UNF {
  9 |   namespace Trie {
 10 |     class Searcher { // lookup over a packed array of trie Nodes, starting from a fixed root index
 11 |     public:
 12 |       Searcher(const Node* nodes, unsigned root, const char* value=NULL) // value: optional string table that NormalizationForm reads in word_append
 13 | 	: nodes(nodes), root(root), value(value) {}
 14 | 
 15 |       unsigned find_value(const char* key, unsigned default_value) const { // unsigned, so the callers' 0xFFFFFFFF sentinel arrives without a signed conversion
 16 | 	unsigned node_index=root;
 17 | 	for(CharStream in(key);; in.read()) { // walk key one byte at a time
 18 | 	  node_index = nodes[node_index].jump(in.peek());
 19 | 	  if(nodes[node_index].check_char()==in.peek()) { // the cell really belongs to this byte
 20 | 	    unsigned terminal_index = nodes[node_index].jump('\0');
 21 | 	    if(nodes[terminal_index].check_char()=='\0') { // a terminal cell follows: the bytes consumed so far form a stored key
 22 | 	      return nodes[terminal_index].value();
 23 |             }
 24 | 	  } else
 25 | 	    return default_value; // no transition for this byte: nothing stored for key
 26 | 	}
 27 |       }
 28 | 
 29 |     protected:
 30 |       const Node* nodes;
 31 |       const unsigned root;
 32 |       const char* value;
 33 |     };
 34 |     
 35 |     class CanonicalCombiningClass : private Searcher {
 36 |     public:
 37 |       CanonicalCombiningClass(const unsigned* node_uints, unsigned root)
 38 | 	: Searcher(Node::from_uint_array(node_uints), root) {}
 39 |       
 40 |       unsigned get_class(const char* str) const { return find_value(str,0); }
 41 | 
 42 |       void sort(char* str, std::vector<unsigned char>& classes) const {
 43 | 	CharStream in(str);
 44 | 	unsigned sort_beg=0;
 45 | 	unsigned sort_end=0;
 46 | 	unsigned unicode_char_count=0;
 47 | 
 48 |       loop_head:
 49 | 	unsigned beg = in.cur()-str;
 50 | 	
 51 | 	for(unsigned node_index=root;;){
 52 | 	  node_index = nodes[node_index].jump(in.read());
 53 | 	  
 54 | 	  if(nodes[node_index].check_char()==in.prev()) {
 55 | 	    unsigned terminal_index = nodes[node_index].jump('\0');
 56 | 	    if(nodes[terminal_index].check_char()=='\0') {
 57 | 	      if((unicode_char_count++)==0)
 58 | 		sort_beg = beg;
 59 | 	      sort_end = in.cur()-str;
 60 | 	      
 61 | 	      unsigned char klass = nodes[terminal_index].value();
 62 | 	      for(unsigned i=beg; i < sort_end; i++) 
 63 | 		classes[i] = klass;
 64 |  	      break;
 65 | 	    }
 66 | 	  } else {
 67 | 	    if(unicode_char_count > 1)
 68 | 	      bubble_sort(str, classes, sort_beg, sort_end);
 69 | 	    unicode_char_count = 0;
 70 | 	    break;
 71 | 	  }
 72 | 	} 
 73 | 	Util::eat_until_utf8_char_start_point(in);
 74 | 
 75 | 	if(in.eos()==false)
 76 | 	  goto loop_head;
 77 | 
 78 | 	if(unicode_char_count > 1)
 79 | 	  bubble_sort(str, classes, sort_beg, sort_end);
 80 |       }      
 81 | 
 82 |     private:
 83 |       void bubble_sort(char* str, std::vector<unsigned char>& canonical_classes, unsigned beg, unsigned end) const {
 84 | 	for(unsigned limit=beg, next=end; limit != next;) {
 85 | 	  limit = next;
 86 | 	  for(unsigned i=beg+1; i < limit; i++)
 87 | 	    if(canonical_classes[i-1] > canonical_classes[i]) {
 88 | 	      std::swap(canonical_classes[i-1], canonical_classes[i]);
 89 | 	      std::swap(str[i-1], str[i]);
 90 | 	      next = i;
 91 | 	    }
 92 | 	}
 93 |       }
 94 |     };
 95 | 
 96 |     class NormalizationForm : private Searcher {
 97 |     public:
 98 |       NormalizationForm(const unsigned* node_uints, unsigned root, const char* value=NULL)
 99 | 	: Searcher(Node::from_uint_array(node_uints), root, value) {} 
100 | 
101 |       bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; }
102 | 
103 |       void decompose(RangeCharStream in, std::string& buffer) const {
104 |       loop_head:
105 | 	const char* beg = in.cur();
106 | 
107 | 	for(unsigned node_index=root;;) {
108 | 	  node_index = nodes[node_index].jump(in.read());
109 | 	  if(nodes[node_index].check_char()==in.prev()) {
110 | 	    unsigned terminal_index = nodes[node_index].jump('\0');
111 | 	    if(nodes[terminal_index].check_char()=='\0') {
112 |               word_append(buffer, value, nodes[terminal_index].value());
113 | 	      beg = in.cur();
114 | 	      break;
115 | 	    }
116 | 	  } else {
117 | 	    Util::eat_until_utf8_char_start_point(in);
118 | 	    buffer.append(beg, in.cur());
119 | 	    break;
120 | 	  }
121 | 	}  
122 | 
123 | 	if(in.eos()==false)
124 | 	  goto loop_head;
125 |       }
126 | 
127 |       void compose(CharStreamForComposition& in, std::string& buf) const { // consumes one unit from `in`; appends its composed form, or one unchanged character, to buf
128 | 	in.init_skipinfo(); // NOTE(review): presumably resets the stream's skipped-character bookkeeping — confirm in char_stream.hh
129 | 	// beg: stream position on entry; current_char_head: first byte of the code point being matched
130 | 	const char* const beg = in.cur();
131 | 	const char* current_char_head = in.cur();
132 | 	unsigned composed_char_info = 0; // value() of the last terminal node reached; stays 0 if none was
133 | 	// trie cursor, plus the trie state / canonical class captured at the latest code-point start
134 | 	unsigned node_index = root;
135 | 	unsigned retry_root_node = root;
136 | 	unsigned char retry_root_class = 0;
137 | 	// `first` stays true until a code point starts while the cursor is already away from root
138 | 	for(bool first=true;;) {
139 | 	  if(Util::is_utf8_char_start_byte(in.peek())) { // the next byte starts a new code point
140 | 	    if(node_index != root)
141 | 	      first=false;
142 | 	    current_char_head = in.cur();
143 | 	    // snapshot used by the retry branch below
144 | 	    retry_root_node = node_index;
145 | 	    retry_root_class = in.get_canonical_class();
146 | 	  }
147 | 	  // match one byte: peek() selects the transition, read() then consumes the byte for the check
148 | 	retry: // re-entered by `goto retry`, bypassing the code-point bookkeeping above
149 | 	  unsigned next_index = nodes[node_index].jump(in.peek());
150 | 	  if(nodes[next_index].check_char()==in.read()) {
151 | 	    // succeeded: the transition's check character equals the byte just read
152 | 	    node_index = next_index;
153 | 	    unsigned terminal_index = nodes[node_index].jump('\0'); // a '\0' transition leads to the node holding the value for a key ending here
154 | 	    if(nodes[terminal_index].check_char()=='\0') {
155 | 	      composed_char_info = nodes[terminal_index].value();
156 | 	      // NOTE(review): presumably records this position for reset_at_marked_point() below — confirm
157 | 	      in.mark_as_last_valid_point();
158 | 	      if(in.eos() || retry_root_class > in.get_canonical_class()) // stop at end of input, or when the class saved at the boundary exceeds the stream's current one
159 | 		break;
160 | 	    }
161 | 	  } else if (first==true) {
162 | 	    // no retry while `first` is still set, i.e. while still inside the first starter
163 | 	    break;
164 | 	  } else if (in.next_combining_char(retry_root_class, current_char_head)==true) { 
165 | 	    // next_combining_char() returned true: restore the trie state saved at the boundary and retry from the stream's current position
166 | 	    node_index = retry_root_node;
167 | 	    current_char_head = in.cur();
168 | 	    goto retry;
169 | 	  } else {
170 | 	    break;
171 | 	  }
172 | 	}  
173 | 	// emit the result of the match attempt
174 | 	if(composed_char_info != 0) {
175 | 	  // append the composed character, then the combining characters that were skipped
176 | 	  word_append(buf, value, composed_char_info);
177 | 	  in.append_skipped_chars_to_str(buf);
178 | 	  in.reset_at_marked_point();
179 | 	} else {
180 | 	  // nothing was composed: copy exactly one character, starting at beg
181 | 	  in.setCur(Util::nearest_utf8_char_start_point(beg+1)); // move to the first UTF-8 start byte after beg
182 | 	  in.append_read_char_to_str(buf, beg);
183 | 	}
184 |       }
185 |       
186 |     private:
187 |       static void word_append(std::string& buffer, const char* base, unsigned pos_info) { // pos_info packs a span of base: low 18 bits = start offset, bits above = byte count
188 |         buffer.append(base + (pos_info & 0x3FFFF), static_cast<std::string::size_type>(pos_info >> 18));
189 |       }
190 |     };
191 |   }
192 | }
193 | 
194 | #endif
195 | 


--------------------------------------------------------------------------------
/ext/unf_ext/unf/util.hh:
--------------------------------------------------------------------------------
 1 | #ifndef UNF_UTIL_HH
 2 | #define UNF_UTIL_HH
 3 | 
 4 | namespace UNF {
 5 |   namespace Util {
 6 |     // A byte starts a character unless it has the form 10xxxxxx (top two bits "10").
 7 |     inline bool is_utf8_char_start_byte(char byte) {
 8 |       return (byte & 0xC0) != 0x80;
 9 |     }
10 | 
11 |     // Advances s until it points at a start byte; returns s unchanged if it already does.
12 |     inline const char* nearest_utf8_char_start_point(const char* s) {
13 |       while(!is_utf8_char_start_byte(*s)) ++s;
14 |       return s;
15 |     }
16 | 
17 |     // Consumes bytes from the stream with read() until peek() reports a start byte.
18 |     template <class CharStream> inline void eat_until_utf8_char_start_point(CharStream& in) {
19 |       while(!is_utf8_char_start_byte(in.peek())) in.read();
20 |     }
21 |   }
22 | }
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/lib/unf_ext.rb:
--------------------------------------------------------------------------------
1 | begin # first try an extension stored under a directory named after this Ruby's MAJOR.MINOR version
2 |   require "#{RUBY_VERSION[/\A[0-9]+\.[0-9]+/]}/unf_ext.so" # the regexp keeps only the leading "MAJOR.MINOR" of RUBY_VERSION
3 | rescue LoadError
4 |   require "unf_ext.so" # fallback when the versioned path cannot be loaded
5 | end
6 | 


--------------------------------------------------------------------------------
/lib/unf_ext/version.rb:
--------------------------------------------------------------------------------
1 | module UNF
2 |   class Normalizer
3 |     VERSION = "0.0.9.1"
4 |   end
5 | end
6 | 


--------------------------------------------------------------------------------
/test/helper.rb:
--------------------------------------------------------------------------------
 1 | require 'rubygems'
 2 | require 'bundler'
 3 | begin
 4 |   Bundler.setup(:default, :development)
 5 | rescue Bundler::BundlerError => e
 6 |   $stderr.puts e.message
 7 |   $stderr.puts "Run `bundle install` to install missing gems"
 8 |   exit e.status_code
 9 | end
10 | require 'test/unit'
11 | 
12 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
13 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..'))
14 | $LOAD_PATH.unshift(File.dirname(__FILE__))
15 | require 'unf_ext'
16 | 
17 | class Test::Unit::TestCase
18 | end
19 | 


--------------------------------------------------------------------------------
/test/test_unf_ext.rb:
--------------------------------------------------------------------------------
 1 | require 'helper'
 2 | require 'pathname'
 3 | 
 4 | class TestUnf < Test::Unit::TestCase
 5 |   test "raise ArgumentError if an unknown normalization form is given" do
 6 |     normalizer = UNF::Normalizer.new
 7 |     assert_raises(ArgumentError) { normalizer.normalize("が", :nfck) }
 8 |   end
 9 | 
10 |   test "pass all tests bundled with the original unf" do
11 |     normalizer = UNF::Normalizer.new
12 |     open(Pathname(__FILE__).dirname + 'normalization-test.txt', 'r:utf-8').each_slice(6) { |lines|
13 |       flunk "broken test file" if lines.size != 6 || lines.pop !~ /^$/
14 |       str, nfc, nfd, nfkc, nfkd = lines
15 |       assert_equal nfd,  normalizer.normalize(str,  :nfd)
16 |       assert_equal nfd,  normalizer.normalize(nfd,  :nfd)
17 |       assert_equal nfd,  normalizer.normalize(nfc,  :nfd)
18 |       assert_equal nfkd, normalizer.normalize(nfkc, :nfd)
19 |       assert_equal nfkd, normalizer.normalize(nfkc, :nfd)
20 | 
21 |       assert_equal nfc,  normalizer.normalize(str,  :nfc)
22 |       assert_equal nfc,  normalizer.normalize(nfd,  :nfc)
23 |       assert_equal nfc,  normalizer.normalize(nfc,  :nfc)
24 |       assert_equal nfkc, normalizer.normalize(nfkc, :nfc)
25 |       assert_equal nfkc, normalizer.normalize(nfkd, :nfc)
26 | 
27 |       assert_equal nfkd, normalizer.normalize(str,  :nfkd)
28 |       assert_equal nfkd, normalizer.normalize(nfd,  :nfkd)
29 |       assert_equal nfkd, normalizer.normalize(nfc,  :nfkd)
30 |       assert_equal nfkd, normalizer.normalize(nfkc, :nfkd)
31 |       assert_equal nfkd, normalizer.normalize(nfkd, :nfkd)
32 | 
33 |       assert_equal nfkc, normalizer.normalize(str,  :nfkc)
34 |       assert_equal nfkc, normalizer.normalize(nfd,  :nfkc)
35 |       assert_equal nfkc, normalizer.normalize(nfc,  :nfkc)
36 |       assert_equal nfkc, normalizer.normalize(nfkc, :nfkc)
37 |       assert_equal nfkc, normalizer.normalize(nfkd, :nfkc)
38 |     }
39 |   end
40 | end
41 | 


--------------------------------------------------------------------------------
/unf_ext.gemspec:
--------------------------------------------------------------------------------
 1 | lib = File.expand_path('../lib', __FILE__)
 2 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 3 | require 'unf_ext/version'
 4 | 
 5 | Gem::Specification.new do |gem|
 6 |   gem.name          = "unf_ext"
 7 |   gem.version       = UNF::Normalizer::VERSION
 8 |   gem.authors       = ["Takeru Ohta", "Akinori MUSHA"]
 9 |   gem.email         = ["knu@idaemons.org"]
10 |   gem.description   = %q{Unicode Normalization Form support library for CRuby}
11 |   gem.summary       = %q{Unicode Normalization Form support library for CRuby}
12 |   gem.homepage      = "https://github.com/knu/ruby-unf_ext"
13 |   gem.licenses      = ["MIT"]
14 | 
15 |   gem.files         = `git ls-files`.split($/)
16 |   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17 |   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/}).grep(%r{/test_[^/]+\.rb$})
18 |   gem.require_paths = ["lib"]
19 |   gem.extensions    = ["ext/unf_ext/extconf.rb"]
20 | 
21 |   gem.extra_rdoc_files = [
22 |     "LICENSE.txt",
23 |     "README.md"
24 |   ]
25 | 
26 |   gem.required_ruby_version = '>= 2.2'
27 | 
28 |   gem.add_development_dependency("rake", [">= 0.9.2.2"])
29 |   gem.add_development_dependency("test-unit")
30 |   gem.add_development_dependency("rdoc", ["> 2.4.2"])
31 |   gem.add_development_dependency("bundler", [">= 1.2"])
32 |   gem.add_development_dependency("rake-compiler", [">= 1.2.1"])
33 |   gem.add_development_dependency("rake-compiler-dock", [">= 1.3.0"])
34 | end
35 | 


--------------------------------------------------------------------------------