├── Rakefile
├── lib
    ├── cseg
    │   └── version.rb
    ├── cseg.rb
    └── cseg.rb.new
├── Gemfile
├── .gitignore
├── cseg.gemspec
├── cseg.rb
├── LICENSE.txt
└── README.md


/Rakefile:
--------------------------------------------------------------------------------
1 | require "bundler/gem_tasks"
2 | 


--------------------------------------------------------------------------------
/lib/cseg/version.rb:
--------------------------------------------------------------------------------
# Namespace holding the gem's version metadata.
module Cseg
  # Version string read by cseg.gemspec; frozen so it cannot be mutated
  # in place at runtime.
  VERSION = "0.1.1".freeze
end
4 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | 
3 | # Specify your gem's dependencies in cseg.gemspec
4 | gemspec
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.gem
 2 | *.rbc
 3 | .bundle
 4 | .config
 5 | .yardoc
 6 | Gemfile.lock
 7 | InstalledFiles
 8 | _yardoc
 9 | coverage
10 | doc/
11 | lib/bundler/man
12 | pkg
13 | rdoc
14 | spec/reports
15 | test/tmp
16 | test/version_tmp
17 | tmp
18 | data/
19 | 


--------------------------------------------------------------------------------
/cseg.gemspec:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | lib = File.expand_path('../lib', __FILE__)
 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 4 | require 'cseg/version'
 5 | 
 # Gem specification for cseg. The version comes from Cseg::VERSION, which is
 # loaded by the `require 'cseg/version'` earlier in this file.
 Gem::Specification.new do |gem|
   gem.name          = "cseg"
   gem.version       = Cseg::VERSION
   gem.authors       = ["gyorou"]
   gem.email         = ["gyorou@tjjtds.com"]
   # The previous %q{"..."} literals embedded the double quotes themselves in
   # the published description, and the summary was the two-character string "".
   gem.description   = "A Chinese segmentation tool using CRF"
   gem.summary       = "Chinese word segmentation backed by CRF++"
   gem.homepage      = ""
   # LICENSE.txt in this repository is the MIT license.
   gem.license       = "MIT"

   # Explicit file list: the model files under data/ are git-ignored, so they
   # are named here directly rather than discovered from version control.
   gem.files         = %w[
     .gitignore
     LICENSE.txt
     README.md
     Gemfile
     data/pku_training.data
     data/as_training_less.data
     lib/cseg/version.rb
     lib/cseg.rb
     cseg.gemspec
     Rakefile
   ]
   # Neither pattern matches anything in the list above today; both stay so
   # that files added under bin/ or test/ are picked up automatically.
   gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
   # NOTE(review): "data" holds model files, not Ruby code, and lib/cseg.rb
   # locates them relative to its own path; confirm whether "data" is needed
   # on the load path.
   gem.require_paths = ["lib", "data"]
 end
30 | 


--------------------------------------------------------------------------------
/cseg.rb:
--------------------------------------------------------------------------------
 1 | # encoding:utf-8
 2 | require "cseg/version"
 # Chinese word segmenter that delegates tagging to the external CRF++
 # `crf_test` command and folds its per-character tags back into words.
 class Kurumi
   # Model file for each supported mode ("sp": Simplified, "tr": Traditional).
   # NOTE(review): this copy sits at the repository root, so "../../data"
   # resolves outside the repository, unlike the copy under lib/ -- confirm
   # whether this file is still meant to be loaded.
   MODELS = {
     "sp" => File.expand_path("../../data/pku_training.data", __FILE__),
     "tr" => File.expand_path("../../data/as_training_less.data", __FILE__)
   }.freeze

   # Tags that begin a new word.
   WORD_START_TAGS = %w[B O].freeze

   # Segments +str+ into words.
   #
   # Every character of +str+ is written on its own line to `crf_test`, and
   # the tab-separated "character, tag" lines it prints are merged into words.
   #
   # @param str [String] text to segment
   # @param mode [String, Symbol] "sp" (default) or "tr", selecting the model
   # @return [Array<String>] the words in order of appearance
   # @raise [RuntimeError] if +mode+ is neither "sp" nor "tr"
   def self.segment(str, mode="sp")
     model = MODELS.fetch(mode.to_s) { raise "no such parameter, please use sp or tr" }
     # Array form runs crf_test directly, so a model path containing spaces
     # or shell metacharacters is passed through as one argument.
     tagged = IO.popen(["crf_test", "-m", model], "r+") do |io|
       io.puts(*str.chars)
       io.close_write
       io.read
     end
     collect_words(tagged)
   end

   # Folds tagged lines into words: "B"/"O" start a word, "I" extends the
   # current one, and any other line (such as a blank one) ends it.
   def self.collect_words(tagged)
     words = []
     current = ""
     tagged.each_line do |line|
       char, tag = line.chomp.split("\t")
       if WORD_START_TAGS.include?(tag)
         words << current unless current.empty?
         current = char
       elsif tag == "I"
         current += char
       else
         words << current unless current.empty?
         current = ""
       end
     end
     # Flush the word still being built when the output has no trailing
     # separator line; it used to be dropped.
     words << current unless current.empty?
     words
   end
   private_class_method :collect_words
 end
42 | 
43 | #print Kurumi.segment("屌丝是一种自我讽刺。")
44 | 


--------------------------------------------------------------------------------
/lib/cseg.rb:
--------------------------------------------------------------------------------
 1 | # encoding:utf-8
 2 | require "cseg/version"
 # Chinese word segmenter that delegates tagging to the external CRF++
 # `crf_test` command and folds its per-character tags back into words.
 class Kurumi
   # Model file for each supported mode ("sp": Simplified, "tr": Traditional),
   # resolved relative to this file.
   MODELS = {
     "sp" => File.expand_path("../../data/pku_training.data", __FILE__),
     "tr" => File.expand_path("../../data/as_training_less.data", __FILE__)
   }.freeze

   # Tags that begin a new word.
   WORD_START_TAGS = %w[B O].freeze

   # Segments +str+ into words.
   #
   # Every character of +str+ is written on its own line to `crf_test`, and
   # the tab-separated "character, tag" lines it prints are merged into words.
   #
   # @param str [String] text to segment
   # @param mode [String, Symbol] "sp" (default) or "tr", selecting the model
   # @return [Array<String>] the words in order of appearance
   # @raise [RuntimeError] if +mode+ is neither "sp" nor "tr"
   def self.segment(str, mode="sp")
     model = MODELS.fetch(mode.to_s) { raise "no such parameter, please use sp or tr" }
     # Array form runs crf_test directly, so a model path containing spaces
     # or shell metacharacters is passed through as one argument.
     tagged = IO.popen(["crf_test", "-m", model], "r+") do |io|
       io.puts(*str.chars)
       io.close_write
       io.read
     end
     collect_words(tagged)
   end

   # Folds tagged lines into words: "B"/"O" start a word, "I" extends the
   # current one, and any other line (such as a blank one) ends it.
   def self.collect_words(tagged)
     words = []
     current = ""
     tagged.each_line do |line|
       char, tag = line.chomp.split("\t")
       if WORD_START_TAGS.include?(tag)
         words << current unless current.empty?
         current = char
       elsif tag == "I"
         current += char
       else
         words << current unless current.empty?
         current = ""
       end
     end
     # Flush the word still being built when the output has no trailing
     # separator line; it used to be dropped.
     words << current unless current.empty?
     words
   end
   private_class_method :collect_words
 end
42 | 
43 | #print Kurumi.segment("屌丝是一种自我讽刺。")
44 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014 gyorou
 2 | 
 3 | MIT License
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining
 6 | a copy of this software and associated documentation files (the
 7 | "Software"), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Kurumi
 2 | 
 3 | Use MIRA to train a large amount of features.
 4 | 
 5 | Segments Chinese sentences (both Traditional and Simplified) into words quickly and accurately.
 6 | 
 7 | Note: the name of the gem (`cseg`) is different from the name of the repository!
 8 | 
 9 | ## Installation
10 | 
11 | Add this line to your application's Gemfile:
12 | 
13 |     gem 'cseg'
14 | 
15 | And then execute:
16 | 
17 |     $ bundle
18 | 
19 | Or install it yourself as:
20 | 
21 |     $ gem install cseg
22 | 
23 | You need to install [CRF++](http://crfpp.googlecode.com/svn/trunk/doc/index.html?source=navbar) first and make sure its `crf_test` command can be found on your `PATH`.
24 | 
25 | The dictionary (model) files were removed from the GitHub repository because they are quite large; the gem published on RubyGems includes all of them.
26 | 
27 | ## Recall and Precision
28 | 
29 | Tested on the SIGHAN Bakeoff PKU test set.
30 | 
31 | Precision: 94.43%
32 | 
33 | Recall: 92.86%
34 | 	
35 | ## Usage
36 | 
37 | ```ruby
38 | #The default is Simplified Chinese
39 | require "cseg"
40 | Kurumi.segment("屌丝是一种自我讽刺。")
41 | #=>["屌丝", "是", "一", "种", "自我", "讽刺", "。"]
42 | #Use parameter "tr" to specify Traditional Chinese
43 | Kurumi.segment("台妹真是正點。","tr")  
44 | #=>["台妹", "真", "是", "正點", "。"]	
45 | 
46 | ```	
47 | 
48 | 


--------------------------------------------------------------------------------
/lib/cseg.rb.new:
--------------------------------------------------------------------------------
 1 | #encoding:utf-8
 2 | require "cseg/version"
 3 | require "tempfile"
 # Chinese word segmenter that runs the external CRF++ `crf_test` command on
 # a temporary input file and folds its per-character tags back into words.
 class Kurumi
   # Model file for each supported mode ("sp": Simplified, "tr": Traditional).
   # NOTE(review): "pkumodle.data" is not among the files listed in
   # cseg.gemspec (which ships data/pku_training.data) -- confirm the name.
   MODELS = {
     "sp" => File.expand_path("../../data/pkumodle.data", __FILE__),
     "tr" => File.expand_path("../../data/as_training_less.data", __FILE__)
   }.freeze

   # Tags that begin a new word.
   WORD_START_TAGS = %w[B O].freeze

   # Segments +str+ into words.
   #
   # @param str [String] text to segment
   # @param mode [String, Symbol] "sp" (default) or "tr", selecting the model
   # @return [Array<String>] the words in order of appearance
   # @raise [RuntimeError] if +mode+ is neither "sp" nor "tr"
   def self.segment(str,mode="sp")
     # Validate the mode before creating any temp files. The old code stored
     # the choice in @mode but ran crf_test with the never-assigned @modle,
     # so the model path on the command line was always empty.
     model = MODELS.fetch(mode.to_s) { raise "no such parameter, please use sp or tr" }
     input = output = nil
     input = Tempfile.new("tmp")
     output = Tempfile.new("result")
     # One character per line, as input for crf_test.
     input.write(str.each_char.map { |char| "#{char}\n" }.join)
     input.flush # make the content visible to the child process
     # Argument-list form with an :out redirect avoids building a shell
     # command string from file paths.
     system("crf_test", "-m", model, input.path, out: output.path)
     collect_words(File.read(output.path))
   ensure
     # Remove both temp files even when something above raised.
     [input, output].each { |file| file.close(true) if file }
   end

   # Folds tagged lines into words: "B"/"O" start a word, "I" extends the
   # current one, and any other line (such as a blank one) ends it.
   def self.collect_words(tagged)
     words = []
     current = ""
     tagged.each_line do |line|
       char, tag = line.chomp.split("\t")
       if WORD_START_TAGS.include?(tag)
         words << current unless current.empty?
         current = char
       elsif tag == "I"
         current += char
       else
         words << current unless current.empty?
         current = ""
       end
     end
     # Flush the word still being built when the output has no trailing
     # separator line; it used to be dropped.
     words << current unless current.empty?
     words
   end
   private_class_method :collect_words
 end
52 | 
53 |  # result=Kurumi.segment("屌丝是一种生活态度")
54 |  # print result
55 | 


--------------------------------------------------------------------------------