├── .document
├── .gitignore
├── .rspec
├── Gemfile
├── Gemfile.lock
├── LICENSE.txt
├── README.rdoc
├── Rakefile
├── VERSION
├── igo-ruby.gemspec
├── lib
    ├── igo-ruby.rb
    └── igo
    │   ├── dictionary.rb
    │   ├── tagger.rb
    │   ├── trie.rb
    │   ├── util.rb
    │   └── version.rb
├── spec
    ├── igo-ruby_spec.rb
    └── spec_helper.rb
└── test
    └── test.rb


/.document:
--------------------------------------------------------------------------------
1 | lib/**/*.rb
2 | bin/*
3 | - 
4 | features/**/*.feature
5 | LICENSE.txt
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # rcov generated
 2 | coverage
 3 | 
 4 | # rdoc generated
 5 | rdoc
 6 | 
 7 | # yard generated
 8 | doc
 9 | .yardoc
10 | 
11 | # bundler
12 | .bundle
13 | 
14 | # jeweler generated
15 | pkg
16 | 
17 | # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore: 
18 | #
19 | # * Create a file at ~/.gitignore
20 | # * Include files you want ignored
21 | # * Run: git config --global core.excludesfile ~/.gitignore
22 | #
23 | # After doing this, these files will be ignored in all your git projects,
24 | # saving you from having to 'pollute' every project you touch with them
25 | #
26 | # Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line)
27 | #
28 | # For MacOS:
29 | 
30 | .DS_Store
31 | 
32 | # For TextMate
33 | #*.tmproj
34 | #tmtags
35 | #
36 | # For emacs:
37 | #*~
38 | #\#*
39 | #.\#*
40 | #
41 | # For vim:
42 | #*.swp
43 | 


--------------------------------------------------------------------------------
/.rspec:
--------------------------------------------------------------------------------
1 | --color
2 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
 1 | source "http://rubygems.org"
 2 | # Add dependencies required to use your gem here.
 3 | # Example:
 4 | #   gem "activesupport", ">= 2.3.5"
 5 | 
 6 | # Add dependencies to develop your gem here.
 7 | # Include everything needed to run rake, tests, features, etc.
 8 | group :development do
 9 |   gem "rspec", "~> 2.1.0"
10 |   gem "bundler", "~> 1.0.0"
11 |   gem "jeweler", "~> 1.5.1"
12 |   gem "rcov", ">= 0"
13 | end
14 | 


--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
 1 | GEM
 2 |   remote: http://rubygems.org/
 3 |   specs:
 4 |     diff-lcs (1.1.2)
 5 |     git (1.2.5)
 6 |     jeweler (1.5.1)
 7 |       bundler (~> 1.0.0)
 8 |       git (>= 1.2.5)
 9 |       rake
10 |     rake (0.8.7)
11 |     rcov (0.9.9)
12 |     rspec (2.1.0)
13 |       rspec-core (~> 2.1.0)
14 |       rspec-expectations (~> 2.1.0)
15 |       rspec-mocks (~> 2.1.0)
16 |     rspec-core (2.1.0)
17 |     rspec-expectations (2.1.0)
18 |       diff-lcs (~> 1.1.2)
19 |     rspec-mocks (2.1.0)
20 | 
21 | PLATFORMS
22 |   ruby
23 | 
24 | DEPENDENCIES
25 |   bundler (~> 1.0.0)
26 |   jeweler (~> 1.5.1)
27 |   rcov
28 |   rspec (~> 2.1.0)
29 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2010 kyow
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.rdoc:
--------------------------------------------------------------------------------
 1 | = igo-ruby
 2 | igo-rubyはJavaおよびCommon Lispで実装された形態素解析器 Igo[http://igo.sourceforge.jp] のRuby実装です。
 3 | 
 4 | igo-rubyでは、 Igo[http://igo.sourceforge.jp] と同一の解析用辞書ファイルを使用します。
 5 | 従って Igo[http://igo.sourceforge.jp] の機能を使用して解析用辞書ファイルを生成する必要があります。
 6 | 
 7 | == インストール方法
 8 | コマンドプロンプトより以下を実行してください。
 9 |   $ gem install igo-ruby
10 | 
11 | == 解析用辞書ファイルの生成
12 | {Igoのインストール/使い方}[http://igo.sourceforge.jp/index.html#usage] を参照してください。
13 | 
14 | == サンプル
15 | === 形態素解析
16 |   require 'rubygems'
17 |   require 'igo-ruby'
18 |   tagger = Igo::Tagger.new('../../ipadic')  # 解析用辞書のディレクトリを指定
19 |   
20 |   t = tagger.parse('吾輩は猫である。名前はまだ無い。')
21 |   t.each{|m|
22 |     puts "#{m.surface} #{m.feature} #{m.start}"
23 |   }
24 |   
25 |   # 実行結果
26 |   吾輩 名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ 0
27 |   は 助詞,係助詞,*,*,*,*,は,ハ,ワ 2
28 |   猫 名詞,一般,*,*,*,*,猫,ネコ,ネコ 3
29 |   で 助動詞,*,*,*,特殊・ダ,連用形,だ,デ,デ 4
30 |   ある 助動詞,*,*,*,五段・ラ行アル,基本形,ある,アル,アル 5
31 |   。 記号,句点,*,*,*,*,。,。,。 7
32 |   名前 名詞,一般,*,*,*,*,名前,ナマエ,ナマエ 8
33 |   は 助詞,係助詞,*,*,*,*,は,ハ,ワ 10
34 |   まだ 副詞,助詞類接続,*,*,*,*,まだ,マダ,マダ 11
35 |   無い 形容詞,自立,*,*,形容詞・アウオ段,基本形,無い,ナイ,ナイ 13
36 |   。 記号,句点,*,*,*,*,。,。,。 15
37 | 
38 | === 分かち書き
39 |   require 'rubygems'
40 |   require 'igo-ruby'
41 |   
42 |   tagger = Igo::Tagger.new('../../ipadic')  # 解析用辞書のディレクトリを指定
43 |   t = tagger.wakati('どこで生れたかとんと見当がつかぬ。')
44 |   puts t.join(' ')
45 |   
46 |   # 実行結果
47 |   どこ で 生れ た か とんと 見当 が つか ぬ 。
48 | 
49 | === ウェブアプリ例
50 | * {igo-ruby.heroku.com}[http://igo-ruby.heroku.com/]
51 | 
52 | == 付録
53 | === 公開場所
54 | * RubyGems
55 |   * igo-ruby[https://rubygems.org/gems/igo-ruby]
56 | * ソース(github)
57 |   * {kyow/igo-ruby}[https://github.com/kyow/igo-ruby]
58 | 
59 | === 参照
60 | * Igo
61 |   1. {Igo - Java形態素解析器}[http://igo.sourceforge.jp/index.html]
62 |   2. {Igo}[http://sourceforge.jp/projects/igo/releases/]
63 | * Igo-python
64 |   1. {igo-python 0.3a}[http://pypi.python.org/pypi/igo-python/0.3a]
65 |   2. {Igo Japanese morphological analyzer for python}[https://launchpad.net/igo-python/]
66 | 
67 | == Copyright
68 | Copyright (c) kyow, 2010. See LICENSE.txt for further details.


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | # Rake tasks for the igo-ruby gem: packaging (jeweler), specs (rspec) and API docs (rdoc).
 2 | require 'rubygems'
 3 | require 'bundler'
 4 | # Activate the gems of the :default and :development groups; on failure print Bundler's
 5 | # message and exit with the status code Bundler supplies.
 6 | begin
 7 |   Bundler.setup(:default, :development)
 8 | rescue Bundler::BundlerError => e
 9 |   $stderr.puts e.message
10 |   $stderr.puts "Run `bundle install` to install missing gems"
11 |   exit e.status_code
12 | end
13 | require 'rake'
14 | 
15 | require 'jeweler'
16 | # Gem metadata; igo-ruby.gemspec is generated from this block.
17 | Jeweler::Tasks.new do |gem|
18 |   # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
19 |   gem.name = "igo-ruby"
20 |   gem.homepage = "http://github.com/kyow/igo-ruby"
21 |   gem.license = "MIT"
22 |   gem.summary = %Q{Ruby port of Igo Japanese morphological analyzer.}
23 |   gem.description = %Q{
24 |     Ruby port of Igo Japanese morphological analyzer. Igo-ruby needs Igo's binary dictionary files.
25 |     These files created by Java programs.
26 |     See: http://igo.sourceforge.jp/
27 |   }
28 |   gem.email = "24signals@gmail.com"
29 |   gem.authors = ["K.Nishi"]
30 |   # Include your dependencies below. Runtime dependencies are required when using your gem,
31 |   # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
32 |   #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
33 |   #  gem.add_development_dependency 'rspec', '> 1.2.3'
34 |   
35 |   # Package the library sources plus every top-level file whose name starts with a capital letter.
36 |   gem.files = Rake::FileList.new('lib/**/*.rb', '[A-Z]*')
37 |   gem.required_rubygems_version = ">1.3.6"
38 |   gem.rdoc_options << '-c UTF-8' << '-S' << '-U'
39 | end
40 | Jeweler::RubygemsDotOrgTasks.new
41 | 
42 | require 'rspec/core'
43 | require 'rspec/core/rake_task'
44 | # `rake spec` runs every file matching spec/**/*_spec.rb.
45 | RSpec::Core::RakeTask.new(:spec) do |spec|
46 |   spec.pattern = FileList['spec/**/*_spec.rb']
47 | end
48 | 
49 | # `rake rcov` runs the same files with the rcov flag enabled.
50 | RSpec::Core::RakeTask.new(:rcov) do |spec|
51 |   spec.pattern = 'spec/**/*_spec.rb'
52 |   spec.rcov = true
53 | end
54 | 
55 | task :default => :spec
56 | 
57 | require 'rake/rdoctask'
58 | # RDoc output goes to ./rdoc; the title carries the content of the VERSION file when it exists.
59 | Rake::RDocTask.new do |rdoc|
60 |   version = File.exist?('VERSION') ? File.read('VERSION') : ""
61 | 
62 |   rdoc.rdoc_dir = 'rdoc'
63 |   rdoc.title = "igo-ruby #{version}"
64 |   rdoc.rdoc_files.include('README*')
65 |   rdoc.rdoc_files.include('lib/**/*.rb')
66 |   
67 |   rdoc.options << '-c UTF-8' << '-S' << '-U'
68 | end
61 | 


--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.1.5


--------------------------------------------------------------------------------
/igo-ruby.gemspec:
--------------------------------------------------------------------------------
 1 | # Generated by jeweler
 2 | # DO NOT EDIT THIS FILE DIRECTLY
 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
 4 | # -*- encoding: utf-8 -*-
 5 | 
 6 | Gem::Specification.new do |s|
 7 |   s.name = %q{igo-ruby}
 8 |   s.version = "0.1.5"
 9 | 
10 |   s.required_rubygems_version = Gem::Requirement.new("> 1.3.6") if s.respond_to? :required_rubygems_version=
11 |   s.authors = ["K.Nishi"]
12 |   s.date = %q{2011-06-15}
13 |   s.description = %q{
14 |     Ruby port of Igo Japanese morphological analyzer. Igo-ruby needs Igo's binary dictionary files.
15 |     These files created by Java programs.
16 |     See: http://igo.sourceforge.jp/
17 |   }
18 |   s.email = %q{24signals@gmail.com}
19 |   s.extra_rdoc_files = [
20 |     "LICENSE.txt",
21 |     "README.rdoc"
22 |   ]
23 |   s.files = [
24 |     "Gemfile",
25 |     "Gemfile.lock",
26 |     "LICENSE.txt",
27 |     "README.rdoc",
28 |     "Rakefile",
29 |     "VERSION",
30 |     "lib/igo-ruby.rb",
31 |     "lib/igo/dictionary.rb",
32 |     "lib/igo/tagger.rb",
33 |     "lib/igo/trie.rb",
34 |     "lib/igo/util.rb",
35 |     "lib/igo/version.rb"
36 |   ]
37 |   s.homepage = %q{http://github.com/kyow/igo-ruby}
38 |   s.licenses = ["MIT"]
39 |   s.rdoc_options = ["-c UTF-8", "-S", "-U"]
40 |   s.require_paths = ["lib"]
41 |   s.rubygems_version = %q{1.5.0}
42 |   s.summary = %q{Ruby port of Igo Japanese morphological analyzer.}
43 |   s.test_files = [
44 |     "spec/igo-ruby_spec.rb",
45 |     "spec/spec_helper.rb",
46 |     "test/test.rb"
47 |   ]
48 | 
49 |   if s.respond_to? :specification_version then
50 |     s.specification_version = 3
51 | 
52 |     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
53 |       s.add_development_dependency(%q<rspec>, ["~> 2.1.0"])
54 |       s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
55 |       s.add_development_dependency(%q<jeweler>, ["~> 1.5.1"])
56 |       s.add_development_dependency(%q<rcov>, [">= 0"])
57 |     else
58 |       s.add_dependency(%q<rspec>, ["~> 2.1.0"])
59 |       s.add_dependency(%q<bundler>, ["~> 1.0.0"])
60 |       s.add_dependency(%q<jeweler>, ["~> 1.5.1"])
61 |       s.add_dependency(%q<rcov>, [">= 0"])
62 |     end
63 |   else
64 |     s.add_dependency(%q<rspec>, ["~> 2.1.0"])
65 |     s.add_dependency(%q<bundler>, ["~> 1.0.0"])
66 |     s.add_dependency(%q<jeweler>, ["~> 1.5.1"])
67 |     s.add_dependency(%q<rcov>, [">= 0"])
68 |   end
69 | end
70 | 
71 | 


--------------------------------------------------------------------------------
/lib/igo-ruby.rb:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | #
 3 | #= 形態素解析エンジンIgoのRuby実装
 4 | #解析結果がほぼMeCab互換の形態素解析エンジン"Igo"のRuby実装
 5 | #
 6 | #Copyright:: Copyright (c) kyow, 2010
 7 | #Authors:: K.Nishi
 8 | #License:: MIT License ただし、使用する辞書のライセンスに関しては、辞書配布元のそれに準ずる
 9 | #
10 | #== 注意
11 | #igo-rubyには辞書ファイルを生成する機能はありません。
12 | #Igoで生成した辞書ファイルを使用してください。
13 | #
14 | #== 公開
15 | #* RubyGems
16 | #  * igo-ruby[https://rubygems.org/gems/igo-ruby]
17 | #* ソース(github)
18 | #  * {kyow/igo-ruby}[https://github.com/kyow/igo-ruby]
19 | #
20 | #== 参照
21 | #* Igo
22 | #  1. {Igo - Java形態素解析器}[http://igo.sourceforge.jp/index.html]
23 | #  2. {Igo}[http://sourceforge.jp/projects/igo/releases/]
24 | #* Igo-python
25 | #  1. {igo-python 0.3a}[http://pypi.python.org/pypi/igo-python/0.3a]
26 | #  2. {Igo Japanese morphological analyzer for python}[https://launchpad.net/igo-python/]
27 | #
28 | 
29 | $:.unshift(File.dirname(__FILE__))
30 | 
31 | require 'nkf'
32 | require 'kconv'
33 | 
34 | #
35 | #== Igo module (Tagger and Version are registered for autoload)
36 | #
37 | module Igo
38 |   autoload :Tagger, 'igo/tagger'
39 |   autoload :Version, 'igo/version'
40 | end
41 | 


--------------------------------------------------------------------------------
/lib/igo/dictionary.rb:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | #= 辞書クラス群
  3 | 
  4 | module Igo
  5 |   #
  6 |   # Viterbiアルゴリズムで使用されるノードクラス
  7 |   #
  8 |   class ViterbiNode
  9 |     attr_accessor :cost, :prev, :word_id, :start, :length, :left_id, :right_id, :is_space
 10 |     def initialize(word_id, start, length, left_id, right_id, is_space)
 11 |       @cost = 0            # 始点からノードまでの総コスト
 12 |       @prev = nil          # コスト最小の前方のノードへのリンク
 13 |       @word_id = word_id   # 単語ID
 14 |       @start = start       # 入力テキスト内での形態素の開始位置
 15 |       @length = length     # 形態素の表層形の長さ(文字数)
 16 |       @left_id = left_id   # 左文脈ID
 17 |       @right_id = right_id # 右文脈ID
 18 |       @is_space = is_space # 形態素の文字種(文字カテゴリ)が空白かどうか
 19 |     end
 20 |   
 21 |     def self.make_BOSEOS
 22 |       return ViterbiNode.new(0, 0, 0, 0, 0, false)
 23 |     end
 24 |   end
 25 | 
 26 |   class CharCategory
 27 |     def initialize(data_dir)
 28 |       @categories = CharCategory.read_categories(data_dir)
 29 |       fmis = FileMappedInputStream.new(data_dir + "/code2category")
 30 |       @char2id = fmis.get_int_array(fmis.size / 4 / 2)
 31 |       @eql_masks = fmis.get_int_array(fmis.size / 4 /2)
 32 |       fmis.close
 33 |     end
 34 |   
 35 |     def category(code)
 36 |       return @categories[@char2id[code]]
 37 |     end
 38 |   
 39 |     def compatible?(code1, code2)
 40 |       return (@eql_masks[code1] & @eql_masks[code2]) != 0
 41 |     end
 42 |   
 43 |     def self.read_categories(data_dir)
 44 |       data = FileMappedInputStream::get_int_array(data_dir + "/char.category")
 45 |       size = data.size / 4
 46 |       ary = []
 47 |       for i in 0 .. (size - 1)
 48 |         ary.push(Category.new(data[i * 4], data[i * 4 + 1], data[i * 4 + 2] == 1, data[i * 4 + 3] == 1))
 49 |       end
 50 |       return ary
 51 |     end
 52 |   end
 53 | 
 54 |   class Category
 55 |     attr_reader :id, :length, :invoke, :group
 56 |     def initialize(i, l, iv, g)
 57 |       @id = i
 58 |       @length = l
 59 |       @invoke = iv
 60 |       @group = g
 61 |     end
 62 |   end
 63 | 
 64 |   #
 65 |   # 形態素の連接コスト表クラス
 66 |   #
 67 |   class Matrix
 68 |     # コンストラクタ
 69 |     # data_dir:: 辞書ファイルのディレクトリパス
 70 |     def initialize(data_dir)
 71 |       fmis = FileMappedInputStream.new(data_dir + "/matrix.bin")
 72 |       @left_size = fmis.get_int
 73 |       @right_size = fmis.get_int
 74 |       @matrix = fmis.get_short_array(@left_size * @right_size)
 75 |       fmis.close
 76 |     end
 77 |   
 78 |     # 形態素同士の連接コストを求める
 79 |     # left_id:: 左文脈ID
 80 |     # right_id:: 右文脈ID
 81 |     def link_cost(left_id, right_id)
 82 |       return @matrix[right_id * @right_size + left_id]
 83 |     end
 84 |   end
 85 | 
 86 |   #
 87 |   # 未知語の検索を行うクラス
 88 |   #
 89 |   class Unknown
 90 |   
 91 |     # コンストラクタ
 92 |     #data_dir:: 辞書ファイルのディレクトリパス
 93 |     def initialize(data_dir)
 94 |       # 文字カテゴリ管理クラス
 95 |       @category = CharCategory.new(data_dir)
 96 |     
 97 |       # 文字カテゴリが空白の文字のID
 98 |       @space_id = @category.category(' '.unpack("U*")[0]).id
 99 |     end
100 |   
101 |     # 検索
102 |     #text::
103 |     #start::
104 |     #wdic::
105 |     #result::
106 |     def search(text, start, wdic, result)
107 |       if RUBY_VERSION >= '1.9.0'
108 |         txt = text.bytes.to_a
109 |       else
110 |         txt = text.unpack('C*')
111 |       end
112 |       txtu = text.unpack("U*")
113 |       length = txtu.size
114 |       ch = txtu[start]
115 |       ct = @category.category(ch)
116 |     
117 |       if !result.empty? and !ct.invoke
118 |         return
119 |       end
120 |     
121 |       is_space = (ct.id == @space_id)
122 |       limit = [length, ct.length + start].min
123 |     
124 |       for i in start..(limit - 1)
125 |         wdic.search_from_trie_id(ct.id, start, (i - start) + 1, is_space, result)
126 |         
127 |         if((i + 1) != limit and !(@category.compatible?(ch, txt[i + 1])))
128 |           return
129 |         end
130 |       end
131 |     
132 |       if ct.group and limit < length
133 |         for i in limit..(length - 1)
134 |           if not @category.compatible?(ch, txtu[i])
135 |             wdic.search_from_trie_id(ct.id, start, i - start, is_space, result)
136 |             return
137 |           end
138 |         end
139 |         wdic.search_from_trie_id(ct.id, start, length - start, is_space, result)
140 |       end
141 |     end
142 |   end
143 | 
144 |   class WordDic
145 |     # コンストラクタ
146 |     #data_dir:: 辞書ファイルのディレクトリパス
147 |     def initialize(data_dir)
148 |       @trie = Searcher.new(data_dir + "/word2id")
149 |       @data = FileMappedInputStream.get_string(data_dir + "/word.dat")
150 |       @indices = FileMappedInputStream.get_int_array(data_dir + "/word.ary.idx")
151 |     
152 |       fmis = FileMappedInputStream.new(data_dir + "/word.inf")
153 |       word_count = fmis.size / (4 + 2 + 2 + 2)
154 |       @data_offsets = fmis.get_int_array(word_count)   # 単語の素性データの開始位置
155 |       @left_ids     = fmis.get_short_array(word_count) # 単語の左文脈ID
156 |       @right_ids    = fmis.get_short_array(word_count) # 単語の右文脈ID
157 |       @costs        = fmis.get_short_array(word_count) # 単語のコスト
158 |       fmis.close
159 |     end
160 |   
161 |     def cost(word_id)
162 |       return @costs[word_id]
163 |     end
164 |   
165 |     def search(text, start, result)
166 |       indices = @indices
167 |       left_ids = @left_ids
168 |       right_ids = @right_ids
169 |     
170 |       @trie.each_common_prefix(text, start, Proc.new { |start, offset, trie_id|
171 |         ed = @indices[trie_id + 1]
172 |       
173 |         for i in indices[trie_id]..(ed - 1)
174 |           result.push(ViterbiNode.new(i, start, offset, @left_ids[i], right_ids[i], false))
175 |         end
176 |       })
177 |     end
178 |   
179 |     def search_from_trie_id(trie_id, start, word_length, is_space, result)
180 |       ed = @indices[trie_id + 1]
181 |       for i in @indices[trie_id]..(ed - 1)
182 |         result.push(ViterbiNode.new(i, start, word_length, @left_ids[i], @right_ids[i], is_space))
183 |       end
184 |     end
185 |   
186 |     def word_data(word_id)
187 |       return @data.slice(@data_offsets[word_id]*2..@data_offsets[word_id + 1]*2 - 1)
188 |     end
189 |   end
190 | end
191 | 


--------------------------------------------------------------------------------
/lib/igo/tagger.rb:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | #形態素解析と分かち書きを行う機能の実装
  3 | 
  4 | require 'igo/dictionary'
  5 | require 'igo/trie'
  6 | 
  7 | module Igo
  8 |   #
  9 |   #形態素クラス
 10 |   #
 11 |   class Morpheme
 12 |     attr_accessor :surface, :feature, :start
 13 |     
 14 |     #surface:: 形態素の表層形
 15 |     #feature:: 形態素の素性
 16 |     #start:: テキスト内でも形態素の出現開始位置
 17 |     def initialize(surface, feature, start)
 18 |       @surface = surface
 19 |       @feature = feature
 20 |       @start = start
 21 |     end
 22 | end
 23 | 
 24 |   #
 25 |   #形態素解析を行うクラス
 26 |   #
 27 |   class Tagger
 28 |     def self.__BOS_NODES
 29 |       return [ViterbiNode.make_BOSEOS]
 30 |     end
 31 |     
 32 |     #dir:: 辞書ファイルのディレクトリパス
 33 |     def initialize(dir)
 34 |       @wdc = WordDic.new(dir)
 35 |       @unk = Unknown.new(dir)
 36 |       @mtx = Matrix.new(dir)
 37 |     end
 38 |     
 39 |     #形態素解析を行う
 40 |     #text:: 解析対象テキスト
 41 |     #result:: 解析結果の形態素が追加される配列
 42 |     #return:: 解析結果の形態素配列
 43 |     def parse(text, result=[])
 44 |       vn = impl(text, result)
 45 |       txt = text.unpack("U*")
 46 |       while vn
 47 |         surface = txt.slice(vn.start, vn.length).pack("U*")
 48 |       
 49 |         s = @wdc.word_data(vn.word_id)
 50 |       
 51 |         feature = NKF.nkf('-W16L0 --utf8', s)
 52 |         result.push(Morpheme.new(surface, feature, vn.start))
 53 |         vn = vn.prev
 54 |       end
 55 |       return result
 56 |     end
 57 |   
 58 |     #分かち書きを行う
 59 |     #text:: 分かち書きされるテキスト
 60 |     #result:: 分かち書き結果の文字列が追加される配列
 61 |     #return:: 分かち書き結果の文字列の配列
 62 |     def wakati(text, result=[])
 63 |       vn = impl(text, result)
 64 |       txt = text.unpack("U*")
 65 |     
 66 |       while vn
 67 |         a = txt.slice(vn.start, vn.length).pack("U*")
 68 |         result.push(a)
 69 |         vn = vn.prev
 70 |       end
 71 |       return result
 72 |     end
 73 |   
 74 |     private
 75 |   
 76 |     def impl(text, result=[])
 77 |       txs = text.unpack("U*")
 78 |       len = txs.size
 79 |     
 80 |       node_ary = [Tagger.__BOS_NODES]
 81 |       for i in 0..(len-1)
 82 |         node_ary.push([])
 83 |       end
 84 |     
 85 |       for i in 0..(len-1)
 86 |         per_result = []
 87 |       
 88 |         unless node_ary[i].empty?
 89 |           @wdc.search(text, i, per_result)
 90 |           @unk.search(text, i, @wdc, per_result)
 91 |           prevs = node_ary[i]
 92 |         
 93 |           for j in 0..(per_result.size - 1)
 94 |             vn = per_result[j]
 95 |             if(vn.is_space)
 96 |               node_ary[i + vn.length] = prevs
 97 |             else
 98 |               node_ary[i + vn.length].push(set_min_cost_node(vn, prevs))
 99 |             end
100 |           end
101 |         end
102 |       end
103 |     
104 |       cur = set_min_cost_node(ViterbiNode.make_BOSEOS, node_ary[len]).prev
105 |     
106 |       # reverse
107 |       head = nil
108 |       while cur.prev
109 |         tmp = cur.prev
110 |         cur.prev = head
111 |         head = cur
112 |         cur = tmp
113 |       end
114 |       return head
115 |     end
116 |   
117 |     def set_min_cost_node(vn, prevs)
118 |       f = vn.prev = prevs[0]
119 |       vn.cost = f.cost + @mtx.link_cost(f.right_id, vn.left_id)
120 |     
121 |       for i in 1..(prevs.size - 1)
122 |         p = prevs[i]
123 |         cost = p.cost + @mtx.link_cost(p.right_id, vn.left_id)
124 |         if(cost < vn.cost)
125 |           vn.cost = cost
126 |           vn.prev = p
127 |         end
128 |       end
129 |       vn.cost += @wdc.cost(vn.word_id)
130 |       return vn
131 |     end
132 |   end
133 | 
134 | end


--------------------------------------------------------------------------------
/lib/igo/trie.rb:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | require 'igo/util'
  3 | 
  4 | #
  5 | #Stringクラスの拡張
  6 | #
  7 | class String
  8 |   # 文字列がパラメタの接頭辞で開始するかどうかを返却する
  9 |   #prefix:: 接頭辞
 10 |   #return:: true - 接頭辞で開始する
 11 |   def starts_with?(prefix)
 12 |     prefix = prefix.to_s
 13 |     self[0, prefix.length] == prefix
 14 |   end
 15 | end
 16 | 
 17 | module Igo
 18 | 
 19 |   #
 20 |   #DoubleArrayのノード用の定数などが定義されているクラス
 21 |   #
 22 |   class Node
 23 |     #
 24 |     #BASEノード用のメソッドが定義されているクラス
 25 |     #
 26 |     class Base
 27 |       #BASEノードに格納するID値をエンコードする
 28 |       def self.ids(nid)
 29 |         return (-1 * nid) - 1
 30 |       end
 31 |     end
 32 |   
 33 |     #
 34 |     #CHECKノード用の定数が定義されているクラス
 35 |     #
 36 |       class Chck
 37 |       #文字列の終端文字コード
 38 |       #この文字はシステムにより予約されており、辞書内の形態素の表層形および解析対象テキストに含まれていた場合の動作は未定義
 39 |       TERMINATE_CODE = 0
 40 |       #文字列の終端を表す文字定数
 41 |       TERMINATE_CHAR = TERMINATE_CODE.chr
 42 |       #CHECKノードが未使用であることを示す文字コード
 43 |       #この文字はシステムにより予約されており、辞書内の形態素の表層形および解析対象テキストに含まれていた場合の動作は未定義
 44 |       VACANT_CODE = 1
 45 |       #使用可能な文字の最大値
 46 |       CODE_LIMIT = 0xffff
 47 |     end
 48 |   end
 49 | 
 50 |   #
 51 |   #文字列を文字のストリームとして扱うためのクラス
 52 |   #* readメソッドで個々の文字を順に読み込み、文字列の終端に達した場合にはNode::Chck::TERMINATE_CODEが返される。
 53 |   #
 54 |   class KeyStream
 55 |   
 56 |     def initialize(key, start = 0)
 57 |       @s = key
 58 |       @cur = start
 59 |       @len = key.unpack("U*").size
 60 |     end
 61 |     
 62 |     def compare_to(ks)
 63 |       return rest.compare_to(ks.rest)
 64 |     end
 65 |   
 66 |     #このメソッドは動作的には、rest().starts_with?(prefix.substring(beg, len))と等価。
 67 |     #ほんの若干だが、パフォーマンスを改善するために導入。
 68 |     #簡潔性のためになくしても良いかもしれない。
 69 |     def start_with(prefix, beg, len)
 70 |       s = @s
 71 |       c = @cur
 72 |       if @len - c < len
 73 |         return false
 74 |       end
 75 |       word = s.unpack("U*")[c]
 76 |       if word.nil?
 77 |         return (prefix.slice(beg, len-beg) == nil)
 78 |       else
 79 |         [word].pack("U*").starts_with?(prefix.slice(beg, len-beg))
 80 |       end
 81 |     end
 82 |   
 83 |     def rest
 84 |       return @s.slice(@cur, @s.length)
 85 |     end
 86 |   
 87 |     def read
 88 |   
 89 |       if eos?
 90 |         return Node::Chck::TERMINATE_CODE
 91 |       else
 92 |         r = @s.unpack("U*")[@cur]
 93 |         result = [r].pack("U*")
 94 |         @cur += 1
 95 |         return r
 96 |       end
 97 |     end
 98 |   
 99 |     def eos?
100 |       return (@cur == @len) ? true : false
101 |     end
102 |   end
103 | 
104 |   #
105 |   # DoubleArray検索用のクラス
106 |   #
107 |   class Searcher
108 |     #保存されているDoubleArrayを読み込んで、このクラスのインスタンスを作成する
109 |     #path:: DoubleArrayが保存されているファイルのパス
110 |     def initialize(path)
111 |       fmis = FileMappedInputStream.new(path)
112 |       node_size = fmis.get_int()
113 |       tind_size = fmis.get_int()
114 |       tail_size = fmis.get_int()
115 |       @key_set_size = tind_size
116 |       @begs = fmis.get_int_array(tind_size)
117 |       @base = fmis.get_int_array(node_size)
118 |       @lens = fmis.get_short_array(tind_size)
119 |       @chck = fmis.get_char_array(node_size)
120 |       @tail = fmis.get_string(tail_size)
121 |       fmis.close
122 |     end
123 |   
124 |     #DoubleArrayに格納されているキーの数を返却
125 |     #return:: DoubleArrayに格納されているキーの数
126 |     def size
127 |       return @key_set_size
128 |     end
129 |   
130 |     #キーを検索する
131 |     #key:: 検索対象のキー文字列
132 |     #return:: キーが見つかった場合はそのIDを、見つからなかった場合は-1を返す
133 |     def search(key)
134 |       base = @base
135 |       chck = @chck
136 |       node = @base[0]
137 |       kin = KeyStream.new(key)
138 |     
139 |       while true
140 |         code = kin.read
141 |         idx = node + code
142 |         node = base[idx]
143 |       
144 |         if(chck[idx] == code)
145 |           if(node >= 0)
146 |             next
147 |           elsif(kin.eos? or key_exists?(kin, node))
148 |             return Node::Base.ids(node)
149 |           end
150 |           return -1
151 |         end
152 |       end
153 |     end
154 |   
155 |     #common-prefix検索を行う
156 |     #* 条件に一致するキーが見つかる度に、callback.callメソッドが呼び出される
157 |     #key:: 検索対象のキー文字列
158 |     #start:: 検索対象となるキー文字列の最初の添字
159 |     #callback:: 一致を検出した場合に呼び出されるコールバックメソッド
160 |     def each_common_prefix(key, start, callback)
161 |       base = @base
162 |       chck = @chck
163 |       node = @base[0]
164 |       offset = -1
165 |       kin = KeyStream.new(key, start)
166 |     
167 |       while true
168 |         code = kin.read
169 |         offset += 1
170 |         terminal_index = node
171 |       
172 |         if(chck[terminal_index] == Node::Chck::TERMINATE_CODE)
173 |           callback.call(start, offset, Node::Base.ids(base[terminal_index]))
174 |         
175 |           if(code == Node::Chck::TERMINATE_CODE)
176 |             return
177 |           end
178 |         end
179 |       
180 |         idx = node + code
181 |         node = base[idx]
182 |       
183 |         if(chck[idx] == code)
184 |           if(node >= 0)
185 |             next
186 |           else
187 |             call_if_key_including(kin, node, start, offset, callback)
188 |           end
189 |         end
190 |         return
191 |       end
192 |     end
193 |   
194 |     private
195 |   
196 |     def call_if_key_including(kin, node, start, offset, callback)
197 |       node_id = Node::Base.ids(node)
198 |       if(kin.start_with(@tail, @begs[node_id], @lens[node_id]))
199 |         callback.call(start, offset + @lens[node_id] + 1, node_id)
200 |       end
201 |     end
202 |   
203 |     def key_exists?(kin, node)
204 |       nid = Node.Base.ids(node)
205 |       beg = @begs[nid]
206 |       s = @tail.slice(beg, beg + @lens[nid])
207 |       return kin.rest == s ? true : false
208 |     end
209 |   end
210 | end
211 | 


--------------------------------------------------------------------------------
/lib/igo/util.rb:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | #= ファイルユーティリティ
 3 | 
 4 | module Igo
 5 |   #
 6 |   #=== ファイルにマッピングされた入力ストリーム
 7 |   # ファイルからバイナリデータを取得する場合、必ずこのクラスが使用される。
 8 |   #
 9 |   class FileMappedInputStream
10 |     # 入力ストリームの初期化
11 |     # path:: 入力ファイルのパス
12 |     def initialize(path)
13 |       @path = path
14 |       @cur = 0
15 |       @file = open(path, "rb")
16 |     end
17 |   
18 |     # int値で読み取り
19 |     def get_int()
20 |       return @file.read(4).unpack("i*")[0]
21 |     end
22 |     
23 |     # int配列で読み取り
24 |     # count:: 読み取りカウント
25 |     def get_int_array(count)
26 |       return @file.read(count * 4).unpack("i*")
27 |     end
28 |   
29 |     # int配列で読み取り
30 |     # path:: 入力ファイルのパス
31 |     def self.get_int_array(path)
32 |       fmis = FileMappedInputStream.new(path)
33 |       array = fmis.get_int_array((File::stat(path).size)/4)
34 |       fmis.close
35 |       return array
36 |     end
37 |   
38 |     # short配列で読み取り
39 |     # count:: 読み取りカウント
40 |     def get_short_array(count)
41 |       return @file.read(count * 2).unpack("s*")
42 |     end
43 |   
44 |     # char配列で読み取り
45 |     # count:: 読み取りカウント
46 |     def get_char_array(count)
47 |       return @file.read(count * 2).unpack("S!*")
48 |     end
49 |   
50 |     # stringで読み取り
51 |     # count:: 読み取りカウント
52 |     def get_string(count)
53 |       return @file.read(count * 2)
54 |     end
55 |   
56 |     # stringで読み取り
57 |     # path:: 入力ファイル
58 |     def self.get_string(path)
59 |       fmis = FileMappedInputStream.new(path)
60 |       str = fmis.get_string((File::stat(path).size)/2)
61 |       fmis.close
62 |     
63 |       return str
64 |     end
65 |   
66 |     # 入力ファイルのサイズを返却する
67 |     def size
68 |       return File::stat(@path).size
69 |     end
70 |   
71 |     # 入力ストリームを閉じる
72 |     #* newした場合、必ずcloseを呼ぶこと
73 |     def close
74 |       @file.close
75 |     end
76 |   
77 |     # char配列で読み取り
78 |     # path:: 入力ファイル
79 |     def self.get_char_array(path)
80 |       fmis = FileMappedInputStream.new(path)
81 |       array = fmis.get_char_array(fmis.size / 2)
82 |       fmis.close
83 |       return array
84 |     end
85 |   
86 |     private
87 |   
88 |     # ファイルマップ
89 |     #* 現在、不使用
90 |     def map(size)
91 |       @file.pos = @cur
92 |       @cur += size
93 |       return @file.read(size)
94 |     end
95 |   end
96 | end


--------------------------------------------------------------------------------
/lib/igo/version.rb:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | module Igo
 3 |   #
 4 |   #バージョンクラス
 5 |   #
 6 |   class Version
 7 |     #igo-rubyのRubyGemsバージョンを出力する
 8 |     def self.igo_ruby
 9 |       version_file = File.dirname(__FILE__) + '/../../VERSION'
10 |       version = ""
11 |       open(version_file) { |igo_ruby_version|
12 |         version = igo_ruby_version.gets
13 |       }
14 |       return version
15 |     end
16 |   end
17 | end


--------------------------------------------------------------------------------
/spec/igo-ruby_spec.rb:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2 | 
3 | describe "IgoRuby" do
4 |   it "fails" do
5 |     fail "hey buddy, you should probably rename this file and start specing for real"
6 |   end
7 | end
8 | 


--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
 1 | # Put ../lib and this directory at the front of the load path, so the specs load the
 2 | # sources next to them before any other copy on the load path.
 3 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 4 | $LOAD_PATH.unshift(File.dirname(__FILE__))
 5 | require 'rspec'
 6 | require 'igo-ruby'
 7 | 
 8 | # Requires supporting files with custom matchers and macros, etc,
 9 | # in ./support/ and its subdirectories.
10 | Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
11 | 
12 | # No RSpec options are set.
13 | RSpec.configure do |config|
14 |   
15 | end
13 | 


--------------------------------------------------------------------------------
/test/test.rb:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | require 'rubygems'
 3 | require 'igo-ruby'
 4 | #require File.dirname(__FILE__) + '/../lib/igo-ruby'
 5 | 
 6 | puts "version -> #{Igo::Version.igo_ruby}"
 7 | 
 8 | tagger = Igo::Tagger.new(File.dirname(__FILE__) + '/../../ipadic')
 9 | t = tagger.parse('吾輩は猫である。名前はまだ無い。')
10 | puts "parse ->"
11 | t.each{|m|
12 |   puts "#{m.surface} #{m.feature} #{m.start}"
13 | }
14 | puts "wakati ->"
15 | t = tagger.wakati('どこで生れたかとんと見当がつかぬ。')
16 | puts t.join(' ')
17 | 
18 | t = tagger.parse('取り敢えずビール')
19 | puts "1.9 character code bug fix ->"
20 | t.each{|m|
21 |   puts "#{m.surface} #{m.feature} #{m.start}"
22 | }
23 | 
24 | t = tagger.parse('Let’s Dance')
25 | puts "Fix error raised when fullwidth symbol mixed ->"
26 | t.each{|m|
27 |   puts "#{m.surface} #{m.feature} #{m.start}"
28 | }
29 | 


--------------------------------------------------------------------------------