├── Rakefile ├── lib ├── cseg │ └── version.rb ├── cseg.rb └── cseg.rb.new ├── Gemfile ├── .gitignore ├── cseg.gemspec ├── cseg.rb ├── LICENSE.txt └── README.md /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | -------------------------------------------------------------------------------- /lib/cseg/version.rb: -------------------------------------------------------------------------------- 1 | module Cseg 2 | VERSION = "0.1.1" 3 | end 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in cseg.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | Gemfile.lock 7 | InstalledFiles 8 | _yardoc 9 | coverage 10 | doc/ 11 | lib/bundler/man 12 | pkg 13 | rdoc 14 | spec/reports 15 | test/tmp 16 | test/version_tmp 17 | tmp 18 | data/ 19 | -------------------------------------------------------------------------------- /cseg.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'cseg/version' 5 | 6 | Gem::Specification.new do |gem| 7 | gem.name = "cseg" 8 | gem.version = Cseg::VERSION 9 | gem.authors = ["gyorou"] 10 | gem.email = ["gyorou@tjjtds.com"] 11 | gem.description = %q{"a chinese segmentation tool using CRF"} 12 | gem.summary = %q{""} 13 | gem.homepage = "" 14 | 15 | gem.files = [".gitignore", 16 | "LICENSE.txt", 17 | "README.md", 18 | "Gemfile", 19 | "data/pku_training.data", 20 | "data/as_training_less.data", 21 | "lib/cseg/version.rb", 22 | "lib/cseg.rb", 23 | "cseg.gemspec", 24 | "Rakefile", 25 | ] 26 | gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } 27 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) 28 | gem.require_paths = ["lib","data"] 29 | end 30 | -------------------------------------------------------------------------------- /cseg.rb: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | require "cseg/version" 3 | class Kurumi 4 | # since crf++ can only read from file 5 | @modle_sp=File.expand_path("../../data/pku_training.data", __FILE__) 6 | @modle_tr=File.expand_path("../../data/as_training_less.data", __FILE__) 7 | def self.segment(str, mode="sp") 8 | @result=Array.new 9 | case mode 10 | when "sp" 11 | @modle=@modle_sp 12 | when "tr" 13 | @modle=@modle_tr 14 | else 15 | raise "no such parameter, please use sp or tr" 16 | end 17 | result_data = IO.popen "crf_test -m #{@modle}", 'r+' do |io| 18 | io.puts *str.chars 19 | io.close_write 20 | io.read 21 | end 22 | word="" 23 | result_data.each_line do |line| 24 | token=line.chomp.split(" ") 25 | if token[1]=="B"or token[1]=="O" 26 | if word!="" 27 | @result.push(word) 28 | end 29 | word=token[0] 30 | elsif token[1]=="I" 31 | word+=token[0] 32 | else 33 | if word!="" 34 | @result.push(word) 35 | word="" 36 | end 37 | end 38 | end 39 | return @result 40 | end 41 | end 42 | 43 | #print Kurumi.segment("屌丝是一种自我讽刺。") 44 | -------------------------------------------------------------------------------- /lib/cseg.rb: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | require "cseg/version" 3 | class Kurumi 4 | # since crf++ can only read from file 5 | @modle_sp=File.expand_path("../../data/pku_training.data", __FILE__) 6 | @modle_tr=File.expand_path("../../data/as_training_less.data", __FILE__) 7 | def self.segment(str, mode="sp") 8 | @result=Array.new 9 | case mode 10 | when "sp" 11 | @modle=@modle_sp 12 | when "tr" 13 | @modle=@modle_tr 14 | else 15 | raise "no such parameter, please use sp or tr" 16 | end 17 | result_data = IO.popen "crf_test -m #{@modle}", 'r+' do |io| 18 | io.puts *str.chars 19 | io.close_write 20 | io.read 21 | end 22 | word="" 23 | result_data.each_line do |line| 24 | token=line.chomp.split(" ") 25 | if token[1]=="B"or token[1]=="O" 26 | if word!="" 27 | @result.push(word) 28 | end 29 | word=token[0] 30 | elsif token[1]=="I" 31 | word+=token[0] 32 | else 33 | if word!="" 34 | @result.push(word) 35 | word="" 36 | end 37 | end 38 | end 39 | return @result 40 | end 41 | end 42 | 43 | #print Kurumi.segment("屌丝是一种自我讽刺。") 44 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 gyorou 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kurumi 2 | 3 | Use MIRA to train a large amount of features. 4 | 5 | Segment chinese(both traditional and simplified) sentences into words in high speed and correctly. 6 | 7 | take care the name of the gem is different from the repo name! 8 | 9 | ## Installation 10 | 11 | Add this line to your application's Gemfile: 12 | 13 | gem 'cseg' 14 | 15 | And then execute: 16 | 17 | $ bundle 18 | 19 | Or install it yourself as: 20 | 21 | $ gem install cseg 22 | 23 | you need to install [CRF++](http://crfpp.googlecode.com/svn/trunk/doc/index.html?source=navbar) first and set the environment variables. 24 | 25 | On github the dictionary file was deleted since it is quite large, though you can get all from rubygems. 26 | 27 | ## Recall and Precision 28 | 29 | Tested on seghanbakeoff pku test set 30 | 31 | Precision: 94.43% 32 | 33 | Recall: 92.86% 34 | 35 | ## Usage 36 | 37 | ```ruby 38 | #The default is Simplified Chinese 39 | require "cseg" 40 | Kurumi.segment("屌丝是一种自我讽刺。") 41 | #=>["屌丝", "是", "一", "种", "自我", "讽刺", "。"] 42 | #Use parameter "tr" to specify Traditional Chinese 43 | Kurumi.segment("台妹真是正點。","tr") 44 | #=>["台妹", "真", "是", "正點", "。"] 45 | 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /lib/cseg.rb.new: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | require "cseg/version" 3 | require "tempfile" 4 | class Kurumi 5 | # since crf++ can only read from file 6 | @modle_sp=File.expand_path("../../data/pkumodle.data", __FILE__) 7 | @modle_tr=File.expand_path("../../data/as_training_less.data", __FILE__) 8 | def self.segment(str,mode="sp") 9 | tmpstr="" 10 | for i in 0..str.length-1 11 | tmpstr+=str[i]+"\n" 12 | end 13 | @tmp=Tempfile::new("tmp") 14 | @resultfile=Tempfile::new("result") 15 | @tmp.write(tmpstr) 16 | @tmp.rewind 17 | @result=Array.new 18 | case mode 19 | when "sp" 20 | @mode=@mode_sp 21 | when "tr" 22 | @mode=@mode_tr 23 | else 24 | raise "no such parameter, please use sp or tr" 25 | end 26 | system("crf_test -m #{@modle} #{@tmp.path}>#{@resultfile.path}") 27 | @resultfile.rewind 28 | word="" 29 | @resultfile.read.each_line{|line| 30 | token=line.chomp.split(" ") 31 | if token[1]=="B"or token[1]=="O" 32 | if word!="" 33 | @result.push(word) 34 | end 35 | word=token[0] 36 | elsif token[1]=="I" 37 | word+=token[0] 38 | else 39 | if word!="" 40 | @result.push(word) 41 | word="" 42 | end 43 | end 44 | 45 | } 46 | 47 | @resultfile.close(true) 48 | @tmp.close(true) 49 | return @result 50 | end 51 | end 52 | 53 | # result=Kurumi.segment("屌丝是一种生活态度") 54 | # print result 55 | --------------------------------------------------------------------------------