├── .gem-version
├── .gitignore
├── .travis.yml
├── .yardopts
├── CHANGELOG.rdoc
├── Gemfile
├── LICENSE.rdoc
├── README.rdoc
├── Rakefile
├── bin
└── tokenize
├── lib
├── tokenizer.rb
└── tokenizer
│ ├── tokenizer.rb
│ └── version.rb
├── test
├── development_tests
│ ├── test_by_tokenizer_dev.rb
│ ├── test_de_tokenizer_dev.rb
│ ├── test_en_tokenizer_dev.rb
│ ├── test_fr_tokenizer_dev.rb
│ ├── test_it_tokenizer_dev.rb
│ ├── test_parameters.rb
│ └── test_ru_tokenizer_dev.rb
└── regression_tests
│ └── test_de_tokenizer.rb
└── tokenizer.gemspec
/.gem-version:
--------------------------------------------------------------------------------
1 | 2016-03-28 13:30:07 +0200 0.3.0
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ydoc
2 | rdoc
3 | .yardoc
4 | *.gem
5 | .ruby-version
6 | Gemfile.lock
7 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 |
3 | before_install:
4 | - gem install bundler
5 |
6 | rvm:
7 | - 1.9.3
8 | - jruby-19mode # JRuby in 1.9 mode
9 | - 2.0.0
10 | - 2.1.2
11 | - 2.2.0
12 | - 2.3.0
13 | - rbx-19mode
14 | - ruby-head
15 |
16 |
17 | script: "bundle exec rake test"
18 |
19 | matrix:
20 | allow_failures:
21 | - rvm: rbx-19mode
22 | - rvm: ruby-head
23 |
24 | notifications:
25 | email:
26 | on_success: always
27 |
--------------------------------------------------------------------------------
/.yardopts:
--------------------------------------------------------------------------------
1 | --private
2 | --protected
3 | --title 'A simple tokenizer for NLP tasks.'
4 | --main README.rdoc
5 | -
6 | CHANGELOG.rdoc
7 | README.rdoc
8 | LICENSE.rdoc
9 | bin/*
10 | lib/**/*
11 |
--------------------------------------------------------------------------------
/CHANGELOG.rdoc:
--------------------------------------------------------------------------------
1 | == COMPLETED
2 | === 0.1.1
3 | * Documentation and the whole project tree updated. No functional improvements.
4 | * Corrected typos.
5 | === 0.1.0
6 | * Notion of binary tokenizer and a library for embedded tokenization.
7 | * Separation of punctuation marks.
8 | === 0.0.1
9 | * Simple tokenization is desired.
10 |
11 | == PLANNED
12 | === 0.2.0
13 | === 0.3.0
14 | === 0.4.0
15 | === 0.5.0
16 | === 0.6.0
17 | === 0.7.0
18 | === 0.8.0
19 | === 0.9.0
20 | === 1.0.0
21 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | gemspec
4 |
5 | group :development do
6 | gem 'rdoc', '>= 3.9.1'
7 | gem 'rake', '~> 11.1'
8 | gem 'yard', '~> 0.8'
9 | gem 'bundler', '~> 1.7'
10 | gem 'minitest', '~> 5.8'
11 | gem 'travis', '~> 1.8'
12 | gem 'rubocop', '~> 0.38'
13 | end
14 |
--------------------------------------------------------------------------------
/LICENSE.rdoc:
--------------------------------------------------------------------------------
1 | Copyright (c) 2011- Andrei Beliankou, Sven Naumann
2 | University of Trier, Germany
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy
5 | of this software and associated documentation files (the "Software"), to deal
6 | in the Software without restriction, including without limitation the rights
7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 | copies of the Software, and to permit persons to whom the Software is
9 | furnished to do so, subject to the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/README.rdoc:
--------------------------------------------------------------------------------
1 | = Tokenizer
2 |
3 | {RubyGems}[http://rubygems.org/gems/tokenizer] |
4 | {Homepage}[http://bu.chsta.be/projects/tokenizer] |
5 | {Source Code}[https://github.com/arbox/tokenizer] |
6 | {Bug Tracker}[https://github.com/arbox/tokenizer/issues]
7 |
8 | {Gem Version}[https://rubygems.org/gems/tokenizer]
9 | {Build Status}[https://travis-ci.org/arbox/tokenizer]
10 | {Code Climate}[https://codeclimate.com/github/arbox/tokenizer]
11 | {Dependency Status}[https://gemnasium.com/arbox/tokenizer]
12 |
13 | == DESCRIPTION
14 | A simple multilingual tokenizer -- a linguistic tool intended to split a written text
15 | into tokens for NLP tasks. This tool provides a CLI and a library for
16 | linguistic tokenization, which is an unavoidable step for many HLT (Human
17 | Language Technology) tasks in the preprocessing phase for further syntactic,
18 | semantic and other higher level processing goals.
19 |
20 | The tokenization task involves Sentence Segmentation, Word Segmentation and Boundary
21 | Disambiguation for both tasks.
22 |
23 | Use it for tokenization of German, English and Dutch texts.
24 |
25 | === Implemented Algorithms
26 | to be ...
27 |
28 | == INSTALLATION
29 | +Tokenizer+ is provided as a .gem package. Simply install it via
30 | {RubyGems}[http://rubygems.org/gems/tokenizer].
31 |
32 | To install +tokenizer+ issue the following command:
33 | $ gem install tokenizer
34 |
35 | If you want to do a system-wide installation, do this as root
36 | (possibly using +sudo+).
37 |
38 | Alternatively use your Gemfile for dependency management.
39 |
40 | == SYNOPSIS
41 |
42 | You can use +Tokenizer+ in the following ways.
43 | * As a command line tool:
44 | $ echo 'Hi, ich gehe in die Schule!' | tokenize
45 |
46 | * As a library for embedded tokenization:
47 | > require 'tokenizer'
48 | > de_tokenizer = Tokenizer::WhitespaceTokenizer.new
49 | > de_tokenizer.tokenize('Ich gehe in die Schule!')
50 | > => ["Ich", "gehe", "in", "die", "Schule", "!"]
51 |
52 | * Customizable PRE and POST lists:
53 | > require 'tokenizer'
54 | > de_tokenizer = Tokenizer::WhitespaceTokenizer.new(:de, post: Tokenizer::WhitespaceTokenizer::SIMPLE_POST + Tokenizer::WhitespaceTokenizer::PAIR_POST + ['|'])
55 | > de_tokenizer.tokenize('Ich gehe|in die Schule!')
56 | > => ["Ich", "gehe", "|in", "die", "Schule", "!"]
57 |
58 | See documentation in the Tokenizer::WhitespaceTokenizer class for details
59 | on particular methods.
60 |
61 | == SUPPORT
62 |
63 | If you have questions, bug reports or any suggestions, please drop me an email :)
64 | Any help is deeply appreciated!
65 |
66 | == CHANGELOG
67 | For details on future plans and work in progress see CHANGELOG.rdoc.
68 |
69 | == CAUTION
70 | This library is a work in progress! Though the interface is mostly complete,
71 | you might run into features that are not implemented yet.
72 |
73 | Please contact me with your suggestions, bug reports and feature requests.
74 |
75 | == LICENSE
76 |
77 | +Tokenizer+ is copyrighted software by Andrei Beliankou, 2011-
78 |
79 | You may use, redistribute and change it under the terms provided
80 | in the LICENSE.rdoc file.
81 |
--------------------------------------------------------------------------------
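A minimal, self-contained sketch of the library usage described in the SYNOPSIS above, based on the public API defined in lib/tokenizer/tokenizer.rb; the extended suffix list at the end is purely illustrative:

    require 'tokenizer'

    # Default whitespace tokenizer for German text.
    de_tokenizer = Tokenizer::WhitespaceTokenizer.new(:de)
    de_tokenizer.tokenize('Ich gehe in die Schule!')
    # => ["Ich", "gehe", "in", "die", "Schule", "!"]

    # process is an alias for tokenize.
    de_tokenizer.process('Warum?!!')
    # => ["Warum", "?", "!", "!"]

    # Empty input yields an Array containing a single empty string.
    de_tokenizer.tokenize('')
    # => [""]

    # Building a custom suffix list from the constants the class defines.
    post = Tokenizer::WhitespaceTokenizer::SIMPLE_POST +
           Tokenizer::WhitespaceTokenizer::PAIR_POST + ['|']
    custom_tokenizer = Tokenizer::WhitespaceTokenizer.new(:de, post: post)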
/Rakefile:
--------------------------------------------------------------------------------
1 | lib_path = File.expand_path('../lib', __FILE__)
2 | $LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path)
3 |
4 | # Rake provides FileUtils and file lists.
5 | require 'rake'
6 | require 'rdoc'
7 |
8 | # clean and clobber tasks.
9 | require 'rake/clean'
10 | CLOBBER.include('rdoc',
11 | 'ydoc',
12 | '.yardoc',
13 | '**/*.gem')
14 |
15 | # Running tests.
16 | require 'rake/testtask'
17 | Rake::TestTask.new do |t|
18 | t.test_files = FileList.new('test/regression_tests/*.rb').to_a
19 | end
20 |
21 | # Generate documentation
22 | require 'rdoc/task'
23 | RDoc::Task.new do |rdoc|
24 | rdoc.rdoc_files.include('README.rdoc',
25 | 'LICENSE.rdoc',
26 | 'CHANGELOG.rdoc',
27 | 'lib/**/*',
28 | 'bin/*'
29 | )
30 | rdoc.rdoc_dir = 'rdoc'
31 | end
32 |
33 | require 'yard'
34 | YARD::Rake::YardocTask.new do |ydoc|
35 | ydoc.options += ['-o', 'ydoc', '--no-cache']
36 | end
37 |
38 | desc 'Document the code using Yard and RDoc.'
39 | task doc: [:clobber, :rdoc, :yard]
40 |
41 | # Custom gem building and releasing tasks.
42 | require 'tokenizer/version'
43 | desc 'Commit pending changes.'
44 | task :commit do
45 | end
46 |
47 | desc 'Create a tag in the repository for the current release.'
48 | task :tag do
49 | end
50 |
51 | desc "Build the gem package tokenizer-#{Tokenizer::VERSION}.gem"
52 | task :build => :clobber do
53 | system 'bundle exec gem build tokenizer.gemspec'
54 | end
55 |
56 | desc 'Deploy the gem package to RubyGems.'
57 | task release: [:commit, :tag, :build] do
58 | system "gem push tokenizer-#{Tokenizer::VERSION}.gem"
59 | end
60 |
61 | desc 'Open an irb session preloaded with this library.'
62 | task :console do
63 | sh 'irb -I lib -r tokenizer.rb'
64 | end
65 |
66 | task :travis do
67 | sh 'git pull'
68 | message = "#{Time.now}\t#{Tokenizer::VERSION}\n"
69 | File.open('.gem-version', 'w') do |file|
70 | file.write(message)
71 | end
72 | sh 'git add .gem-version'
73 | sh "git commit -m '#{message.chomp}'"
74 | sh 'git push origin master'
75 | end
76 |
77 | task :default => :test
78 |
--------------------------------------------------------------------------------
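The Rakefile above wires together the usual development tasks; a typical session from a fresh checkout might look like this (all commands correspond to tasks defined above):

    $ bundle install           # install development dependencies from the Gemfile
    $ bundle exec rake test    # run the regression tests (also the :default task)
    $ bundle exec rake doc     # clobber old output, then build RDoc and YARD documentation
    $ bundle exec rake build   # build tokenizer-<VERSION>.gem from the gemspec
    $ bundle exec rake console # open an irb session with lib/ on the load path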
/bin/tokenize:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'tokenizer'
4 |
5 | tokenizer = Tokenizer::WhitespaceTokenizer.new
6 |
7 | while (line = gets)
8 | puts tokenizer.tokenize(line).join("\n")
9 | end
10 |
--------------------------------------------------------------------------------
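A short usage sketch for the executable above when run from a source checkout; the -Ilib flag is only needed as long as the gem is not installed:

    $ echo 'Ich gehe in die Schule!' | ruby -Ilib bin/tokenize
    Ich
    gehe
    in
    die
    Schule
    !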
/lib/tokenizer.rb:
--------------------------------------------------------------------------------
1 | require 'tokenizer/tokenizer'
2 | require 'tokenizer/version'
3 |
--------------------------------------------------------------------------------
/lib/tokenizer/tokenizer.rb:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # :title: A simple Tokenizer for NLP Tasks.
3 | # :main: README.rdoc
4 |
5 | # A namespace for all project related stuff.
6 | module Tokenizer
7 | # Simple whitespace based tokenizer with configurable punctuation detection.
8 | class WhitespaceTokenizer
9 | # Default whitespace separator.
10 | FS = Regexp.new('[[:blank:]]+')
11 |
12 | # Characters only in the role of splittable prefixes.
13 | SIMPLE_PRE = ['¿', '¡']
14 |
15 | # Characters only in the role of splittable suffixes.
16 | SIMPLE_POST = ['!', '?', ',', ':', ';', '.']
17 |
18 | # Characters as splittable prefixes with an optional matching suffix.
19 | PAIR_PRE = ['(', '{', '[', '<', '«', '„']
20 |
21 | # Characters as splittable suffixes with an optional matching prefix.
22 | PAIR_POST = [')', '}', ']', '>', '»', '“']
23 |
24 | # Characters which can be both prefixes AND suffixes.
25 | PRE_N_POST = ['"', "'"]
26 |
27 | private_constant :FS
28 |
29 | # @param [Symbol] lang Language identifier.
30 | # @param [Hash] options Additional options.
31 | # @option options [Array] :pre Array of splittable prefix characters.
32 | # @option options [Array] :post Array of splittable suffix characters.
33 | # @option options [Array] :pre_n_post Array of characters with
34 | # suffix AND prefix functions.
35 | def initialize(lang = :de, options = {})
36 | @lang = lang
37 | @options = {
38 | pre: SIMPLE_PRE + PAIR_PRE,
39 | post: SIMPLE_POST + PAIR_POST,
40 | pre_n_post: PRE_N_POST
41 | }.merge(options)
42 | end
43 |
44 | # @param [String] str String to be tokenized.
45 | # @return [Array] Array of tokens.
46 | def tokenize(str)
47 | tokens = sanitize_input(str).split(FS)
48 | return [''] if tokens.empty?
49 |
50 | splittables = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
51 | pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")
52 | output = []
53 | tokens.each do |token|
54 | prefix, stem, suffix = token.partition(pattern)
55 | output << prefix.split('') unless prefix.empty?
56 | output << stem unless stem.empty?
57 | output << suffix.split('') unless suffix.empty?
58 | end
59 |
60 | output.flatten
61 | end
62 |
63 | alias process tokenize
64 |
65 | private
66 |
67 | # @param [String] str User defined string to be tokenized.
68 | # @return [String] A new modified string.
69 | def sanitize_input(str)
70 | str.chomp.strip
71 | end
72 | end # class
73 |
74 | # @deprecated Use {WhitespaceTokenizer} instead.
75 | class Tokenizer < WhitespaceTokenizer
76 | def initialize(*args)
77 | warn '[Deprecated!] Use WhitespaceTokenizer instead.'
78 | super(*args)
79 | end
80 | end
81 | end # module
82 |
--------------------------------------------------------------------------------
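A minimal sketch of the partition step at the heart of WhitespaceTokenizer#tokenize above: every whitespace-separated token is divided into leading splittable characters, a stem and trailing splittable characters. The splittables are written out inline here; in the class they come from the SIMPLE_*, PAIR_* and PRE_N_POST constants:

    # Stand-alone illustration of the splitting logic.
    splittables = ['¿', '¡', '!', '?', ',', ':', ';', '.',
                   '(', '{', '[', '<', '«', '„',
                   ')', '}', ']', '>', '»', '“',
                   '"', "'"]
    pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")

    token = '(Schule!)'
    prefix, stem, suffix = token.partition(pattern)
    # prefix => "(", stem => "Schule", suffix => "!)"

    tokens = []
    tokens.concat(prefix.chars) unless prefix.empty?
    tokens << stem unless stem.empty?
    tokens.concat(suffix.chars) unless suffix.empty?
    tokens
    # => ["(", "Schule", "!", ")"]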
/lib/tokenizer/version.rb:
--------------------------------------------------------------------------------
1 | module Tokenizer
2 | VERSION = '0.3.0'
3 | end
4 |
--------------------------------------------------------------------------------
/test/development_tests/test_by_tokenizer_dev.rb:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | require 'test/unit'
3 | require 'tokenizer'
4 |
5 | class TestTokenizerDev < Test::Unit::TestCase
6 |
7 | def setup
8 | @by_tokenizer = Tokenizer::Tokenizer.new(:by)
9 | end
10 |
11 | def test_tokenization_001
12 | end
13 |
14 | private
15 | def compare(exp_result, input)
16 | act_result = @by_tokenizer.tokenize(input)
17 | assert_equal(exp_result, act_result)
18 | end
19 | end
20 |
--------------------------------------------------------------------------------
/test/development_tests/test_de_tokenizer_dev.rb:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | require 'test/unit'
3 | require 'tokenizer'
4 |
5 | class TestTokenizerDev < Test::Unit::TestCase
6 |
7 | def setup
8 | @de_tokenizer = Tokenizer::Tokenizer.new(:de)
9 | end
10 |
11 | def test_tokenization_001
12 | input = 'ich ging? du, und ich nicht (konnte nicht)? Warum?!!'
13 | etalon = %w{ ich ging ? du , und ich nicht ( konnte nicht ) ? Warum ? ! !}
14 | compare(etalon, input)
15 | end
16 |
17 | def test_tokenization_002
18 | input = "Die deutschen Umlaute und Sonderzeichen, wie in Mäuse, Scheiß und Tütchen, sind blöd!"
19 | etalon = %w{Die deutschen Umlaute und Sonderzeichen , wie in Mäuse , Scheiß und Tütchen , sind blöd !}
20 | compare(etalon, input)
21 | end
22 |
23 | def test_tokenization_003
24 | input = "Abkürzungen, wie z.B. usw. und d.h. können zu Problemem führen."
25 | etalon = %w{Abkürzungen , wie z.B. usw. und d.h. können zu Problemem führen .}
26 | compare(etalon, input)
27 | end
28 |
29 | def test_tokenization_004
30 | input = "Es gibt mehr als 1.023.345 Menschen in Deutschland, die keine Tausenderpunkte verstehen."
31 | etalon = %w{Es gibt mehr als 1.023.345 Menschen in Deutschland , die keine Tausenderpunkte verstehen .}
32 | compare(etalon, input)
33 | end
34 |
35 | def test_tokenization_005
36 | input = "Cocktails, wie Apfel-Martini, Rum-Kirsche-Cola und andere, bereiten nicht nur Menschen Probleme."
37 | etalon = %w{ Cocktails , wie Apfel-Martini , Rum-Kirsche-Cola und andere , bereiten nicht nur Menschen Probleme . }
38 | compare(etalon, input)
39 | end
40 |
41 | def test_tokenization_006
42 | input = 'Es gibt viele verschiedene Zeichen, die noch in Texten vorkommen können wie - zum Beispiel - diese hier "text" oder (text).'
43 | etalon = %w{Es gibt viele verschiedene Zeichen , die noch in Texten vorkommen können wie - zum Beispiel - diese hier " text " oder ( text ) .}
44 | compare(etalon, input)
45 | end
46 |
47 | def test_tokenization_007
48 | input = "Abkürzungen sind immer ein Problem, da auch Leerzeichen dazwischen stehen können, wie z. B. hier."
49 | etalon = ["Abkürzungen", "sind", "immer", "ein", "Problem", ",", "da", "auch", "Leerzeichen", "dazwischen", "stehen", "können", ",", "wie", "z. B.", "hier", "."]
50 | compare(etalon, input)
51 | end
52 |
53 | def test_tokenization_008
54 | input = "Außerdem kann es nach Abkürzungen und Satzenden auch mit Großschreibung weiter gehen, bei z.B. Aufzählungen."
55 | etalon = %w{Außerdem kann es nach Abkürzungen und Satzenden auch mit Großschreibung weiter gehen , bei z.B. Aufzählungen .}
56 | compare(etalon, input)
57 | end
58 |
59 | def test_tokenization_009
60 | input = "Ein weiteres Problem sind solche Getrennt- und Zusammenschreibungen."
61 | etalon = %w{Ein weiteres Problem sind solche Getrenntschreibungen und Zusammenschreibungen .}
62 | compare(etalon, input)
63 | end
64 |
65 | def test_tokenization_010
66 | input = "In manchen Texten gibt es auch Worttrennung am Zeilen- ende."
67 | etalon = %w{In manchen Texten gibt es auch Worttrennung am Zeilenende .}
68 | compare(etalon, input)
69 | end
70 |
71 | def test_tokenization_011 #Ellipsis
72 | input = "Der Satz endet in einer Ellips..."
73 | etalon = %w{ Der Satz endet in einer Ellips... } #the ellipsis dots should not be separated from the word
74 | compare(etalon, input)
75 | end
76 |
77 | def test_tokenization_012 #Missing space
78 | input = "Der Satz endet.Das Leerzeichen fehlt."
79 | etalon = %w{ Der Satz endet . Das Leerzeichen fehlt . } #/\.\s(?=[A-Z])/ would not split these sentences
80 | compare(etalon, input)
81 | end
82 |
83 | def test_tokenization_013 #Hyphens
84 | input = "Das Bindeglied - manisch-depressives Verhalten, binden-verbinden"
85 | etalon = %w{ Das Bindeglied - manisch-depressives Verhalten , binden - verbinden}
86 | compare(etalon, input)
87 | end
88 |
89 | def test_tokenization_014 #Abbreviations
90 | input = "Der Satz enthielt z.B. Fehler"
91 | etalon = %w{ Der Satz enthielt z.B. Fehler } #/\.\s(?=[A-Z])/ would end the sentence right after the period
92 | compare(etalon, input)
93 | end
94 |
95 | def test_tokenization_015 #Missing capitalization
96 | input = "Der Satz endet. der Satz beginnt"
97 | etalon = %w{ Der Satz endet . der Satz beginnt } #/\.\s(?=[A-Z])/ would not split these sentences
98 | compare(etalon, input)
99 | end
100 |
101 | def test_tokenization_016 #French
102 | input = "L'art de l'univers, c'est un art"
103 | etalon = %w{ L' art de l' univers , c'est un art } #Controversial!
104 | compare(etalon, input)
105 | end
106 |
107 | def test_tokenization_017 #James Bond
108 | input = "Bond,... James Bond."
109 | etalon = %w{ Bond , ... James Bond . } #Controversial!
110 | compare(etalon, input)
111 | end
112 |
113 | def test_tokenization_018 #Inches
114 | input = "The square had four 9\" sides"
115 | etalon = %w{ The square had four 9" sides }
116 | compare(etalon, input)
117 | end
118 |
119 | def test_tokenization_019 #Abbreviation that is also a dictionary word
120 | input = "In fig. 3, a fig can be seen. Fig. no. 4 shows no fig."
121 | etalon = %w{ In fig. 3 , a fig can be seen . Fig. no. 4 shows no fig . } #fig both as an abbreviation and as a word
122 | compare(etalon, input)
123 | end
124 |
125 | def test_tokenization_020 #Multi-word units separated by spaces
126 | input = "They booked the flight New York-Los Angeles"
127 | etalon = ["They", "booked", "the", "flight", "New York", "-", "Los Angeles"] #oder mit Bindestrich verbunden
128 | compare(etalon, input)
129 | end
130 |
131 | def test_tokenization_021 #Ordinals
132 | input = "Der 1. Platz ging an den Sieger"
133 | etalon = %w{ Der 1. Platz ging an den Sieger }
134 | compare(etalon, input)
135 | end
136 |
137 | def test_tokenization_022 #Clitics
138 | input = "Er war's, stimmt's?"
139 | etalon = %w{ Er war es , stimmt es ? } #Controversial! Requires a more complex analysis
140 | compare(etalon, input)
141 | end
142 |
143 | def test_tokenization_023 #Dates and times
144 | input = "Es passierte am 13. Januar 2011 um 12:13 Uhr"
145 | etalon = [ "Es", "passierte", "am", "13. Januar 2011", "um", "12:13 Uhr"]
146 | compare(etalon, input)
147 | end
148 |
149 | def test_tokenization_024 #Embedded sentences
150 | input = "\"This is all?\" George asked."
151 | etalon = %w{ This is all ? George asked . } #can lead to ungrammatical sentences
152 | compare(etalon, input)
153 | end
154 |
155 | def test_tokenization_025 #Embedded sentences 2
156 | input = "\"Das ist alles?\" fragte sie."
157 | etalon = %w{ Das ist alles ? fragte sie . } #ungrammatical sentence "fragte sie."
158 | compare(etalon, input)
159 | end
160 |
161 |
162 | def test_tokenization_026
163 | input = "Die deutschen Umlaute und Sonderzeichen, wie in Mäuse, Scheiß und Tütchen, sind blöd!"
164 | etalon = %w{ Die deutschen Umlaute und Sonderzeichen , wie in Mäuse , Scheiß und Tütchen , sind blöd ! }
165 | compare(etalon, input)
166 | end
167 |
168 | def test_tokenization_027
169 | input = "Abkürzungen, wie z.B. usw. und d.h. können zu Problemem führen."
170 | etalon = %w{ Abkürzungen , wie z.B. usw. und d.h. können zu Problemem führen . }
171 | compare(etalon, input)
172 | end
173 |
174 | def test_tokenization_028
175 | input = "Es gibt mehr als 1.023.345 Menschen in Deutschland, die keine Tausenderpunkte verstehen."
176 | etalon = %w{ Es gibt mehr als 1.023.345 Menschen in Deutschland , die keine Tausenderpunkte verstehen . }
177 | compare(etalon, input)
178 | end
179 |
180 | def test_tokenization_029
181 | input = "Cocktails, wie Apfel-Martini, Rum-Kirsche-Cola und andere, bereiten nicht nur Menschen Probleme."
182 | etalon = %w{ Cocktails , wie Apfel-Martini , Rum-Kirsche-Cola und andere , bereiten nicht nur Menschen Probleme . }
183 | compare(etalon, input)
184 | end
185 |
186 | def test_tokenization_030 #Ellipsis
187 | input = "Der Satz endet in einer Ellips..."
188 | etalon = %w{ Der Satz endet in einer Ellips... } #the ellipsis dots should not be separated from the word
189 | compare(etalon, input)
190 | end
191 |
192 | def test_tokenization_031 #Missing space
193 | input = "Der Satz endet.Das Leerzeichen fehlt."
194 | etalon = %w{ Der Satz endet . Das Leerzeichen fehlt . } #/\.\s(?=[A-Z])/ would not split these sentences
195 | compare(etalon, input)
196 | end
197 |
198 | def test_tokenization_032 #Hyphens
199 | input = "Das Bindeglied - manisch-depressives Verhalten, binden-verbinden"
200 | etalon = %w{ Das Bindeglied - manisch-depressives Verhalten , binden - verbinden}
201 | compare(etalon, input)
202 | end
203 |
204 | def test_tokenization_033 #Abbreviations
205 | input = "Der Satz enthielt z.B. Fehler"
206 | etalon = %w{ Der Satz enthielt z.B. Fehler } #/\.\s(?=[A-Z])/ would end the sentence right after the period
207 | compare(etalon, input)
208 | end
209 |
210 | def test_tokenization_034 #Missing capitalization
211 | input = "Der Satz endet. der Satz beginnt"
212 | etalon = %w{ Der Satz endet . der Satz beginnt } #/\.\s(?=[A-Z])/ would not split these sentences
213 | compare(etalon, input)
214 | end
215 |
216 | def test_tokenization_035 #French
217 | input = "L'art de l'univers, c'est un art"
218 | etalon = %w{ L' art de l' univers , c'est un art } #Controversial!
219 | compare(etalon, input)
220 | end
221 |
222 | def test_tokenization_036 #James Bond
223 | input = "Bond,... James Bond."
224 | etalon = %w{ Bond , ... James Bond . } #Controversial!
225 | compare(etalon, input)
226 | end
227 |
228 | def test_tokenization_037 #Inches
229 | input = "The square had four 9\" sides"
230 | etalon = %w{ The square had four 9" sides }
231 | compare(etalon, input)
232 | end
233 |
234 | def test_tokenization_039 #Abbreviation that is also a dictionary word
235 | input = "In fig. 3, a fig can be seen. Fig. no. 4 shows no fig."
236 | etalon = %w{ In fig. 3 , a fig can be seen . Fig. no. 4 shows no fig . } #fig both as an abbreviation and as a word
237 | compare(etalon, input)
238 | end
239 |
240 | def test_tokenization_040 #Multi-word units separated by spaces
241 | input = "They booked the flight New York-Los Angeles"
242 | etalon = ["They", "booked", "the", "flight", "New York", "-", "Los Angeles"] #oder mit Bindestrich verbunden
243 | compare(etalon, input)
244 | end
245 |
246 | def test_tokenization_041 #Ordinals
247 | input = "Der 1. Platz ging an den Sieger"
248 | etalon = %w{ Der 1. Platz ging an den Sieger }
249 | compare(etalon, input)
250 | end
251 |
252 | def test_tokenization_042 #Clitics
253 | input = "Er war's, stimmt's?"
254 | etalon = %w{ Er war es , stimmt es ? } #Controversial! Requires a more complex analysis
255 | compare(etalon, input)
256 | end
257 |
258 | #Dates and times
259 | def test_tokenization_043
260 | input = "Es passierte am 13. Januar 2011 um 12:13 Uhr"
261 | etalon = ["Es", "passierte", "am", "13. Januar 2011", "um", "12:13 Uhr"]
262 | compare(etalon, input)
263 | end
264 |
265 | #Embedded sentences
266 | def test_tokenization_044
267 | input = '"This is all?" George asked.'
268 | etalon = %w{ This is all ? George asked . } #can lead to ungrammatical sentences
269 | compare(etalon, input)
270 | end
271 |
272 | def test_tokenization_046 #Embedded sentences 2
273 | input = '"Das ist alles?" fragte sie.'
274 | etalon = %w{Das ist alles ? fragte sie .} #ungrammatical sentence "fragte sie."
275 | compare(etalon, input)
276 | end
277 |
278 | private
279 | def compare(exp_result, input)
280 | act_result = @de_tokenizer.tokenize(input)
281 | assert_equal(exp_result, act_result)
282 | end
283 | end
284 |
--------------------------------------------------------------------------------
/test/development_tests/test_en_tokenizer_dev.rb:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | require 'test/unit'
3 | require 'tokenizer'
4 |
5 | class TestTokenizerDev < Test::Unit::TestCase
6 |
7 | def setup
8 | @en_tokenizer = Tokenizer::Tokenizer.new(:en)
9 | end
10 |
11 | def test_tokenization_001
12 | result = @en_tokenizer.tokenize('testing normal, english sentence')
13 | assert_equal(['testing', 'normal', ',', 'english', 'sentence', ''], result)
14 | end
15 |
16 | private
17 | def compare(exp_result, input)
18 | act_result = @en_tokenizer.tokenize(input)
19 | assert_equal(exp_result, act_result)
20 | end
21 | end
22 |
--------------------------------------------------------------------------------
/test/development_tests/test_fr_tokenizer_dev.rb:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | require 'test/unit'
3 | require 'tokenizer'
4 |
5 | class TestTokenizerDev < Test::Unit::TestCase
6 |
7 | def setup
8 | @fr_tokenizer = Tokenizer::Tokenizer.new(:fr)
9 | end
10 |
11 | def test_tokenization_001
12 | end
13 |
14 | private
15 | def compare(exp_result, input)
16 | act_result = @fr_tokenizer.tokenize(input)
17 | assert_equal(exp_result, act_result)
18 | end
19 | end
20 |
--------------------------------------------------------------------------------
/test/development_tests/test_it_tokenizer_dev.rb:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | require 'test/unit'
3 | require 'tokenizer'
4 |
5 | class TestTokenizerDev < Test::Unit::TestCase
6 |
7 | def setup
8 | @it_tokenizer = Tokenizer::Tokenizer.new(:it)
9 | end
10 |
11 | def test_tokenization_001
12 | end
13 |
14 | private
15 | def compare(exp_result, input)
16 | act_result = @it_tokenizer.tokenize(input)
17 | assert_equal(exp_result, act_result)
18 | end
19 | end
20 |
--------------------------------------------------------------------------------
/test/development_tests/test_parameters.rb:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | require 'test/unit'
3 | require 'tokenizer'
4 |
5 | class TestTokenizerDev < Test::Unit::TestCase
6 |
7 | def setup
8 | @en_tokenizer = Tokenizer::Tokenizer.new(:en, { pre: [], post: ['|'] })
9 | end
10 |
11 | def test_tokenization_001
12 | result = @en_tokenizer.tokenize('testing| new')
13 | assert_equal(['testing', '|', 'new', ''], result)
14 | end
15 |
16 | def test_tokenization_002
17 | result = @en_tokenizer.tokenize('testing, new')
18 | assert_equal(['testing,', 'new', ''], result)
19 | end
20 |
21 | private
22 | def compare(exp_result, input)
23 | act_result = @en_tokenizer.tokenize(input)
24 | assert_equal(exp_result, act_result)
25 | end
26 | end
27 |
--------------------------------------------------------------------------------
/test/development_tests/test_ru_tokenizer_dev.rb:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | require 'test/unit'
3 | require 'tokenizer'
4 |
5 | class TestTokenizerDev < Test::Unit::TestCase
6 |
7 | def setup
8 | @ru_tokenizer = Tokenizer::Tokenizer.new(:ru)
9 | end
10 |
11 | def test_tokenization_001
12 | end
13 |
14 | private
15 | def compare(exp_result, input)
16 | act_result = @ru_tokenizer.tokenize(input)
17 | assert_equal(exp_result, act_result)
18 | end
19 | end
20 |
--------------------------------------------------------------------------------
/test/regression_tests/test_de_tokenizer.rb:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | require 'minitest/autorun'
3 | require 'minitest/spec'
4 | require 'tokenizer'
5 |
6 | class TestTokenizer < Minitest::Test
7 |
8 | def setup
9 | @t = Tokenizer::Tokenizer.new(:de)
10 | end
11 |
12 | def test_constants
13 | assert(Tokenizer::VERSION.is_a?(String) && !Tokenizer::VERSION.empty?)
14 | end
15 |
16 | def test_output_type
17 | output = @t.tokenize('ich gehe in die Schule')
18 | assert(output.is_a?(Array))
19 | end
20 |
21 | def test_tokenization_001
22 | input = 'Ich ging in die Schule!'
23 | etalon = %w(Ich ging in die Schule !)
24 | output = @t.tokenize(input)
25 | assert_equal(etalon, output)
26 | end
27 |
28 | def test_tokenization_002
29 | input = '" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .'
30 | etalon = %w(" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .)
31 | output = @t.tokenize(input)
32 | assert_equal(etalon, output)
33 | end
34 | end
35 |
36 | describe Tokenizer do
37 | describe 'empty input' do
38 | it 'should return an Array with an empty string' do
39 | tokens = Tokenizer::Tokenizer.new.tokenize('')
40 | tokens.must_equal([''])
41 | end
42 | end
43 | end
44 |
--------------------------------------------------------------------------------
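The regression test above runs under Minitest; it can be executed on its own from a source checkout (or via rake test, which picks up everything in test/regression_tests/):

    $ ruby -Ilib test/regression_tests/test_de_tokenizer.rb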
/tokenizer.gemspec:
--------------------------------------------------------------------------------
1 | lib_path = File.expand_path('../lib', __FILE__)
2 | $LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path)
3 |
4 | require 'tokenizer/version'
5 | require 'rake'
6 |
7 | Gem::Specification.new do |s|
8 | s.name = 'tokenizer'
9 | s.summary = 'Tokenizer is a tool intended to split a text into tokens.'
10 |
11 | s.description = 'A simple multilingual tokenizer for NLP tasks. This tool '\
12 | 'provides a CLI and a library for linguistic tokenization '\
13 | 'which is an unavoidable step for many HLT (human language '\
14 | 'technology) tasks in the preprocessing phase for further '\
15 | 'syntactic, semantic and other higher level processing '\
16 | 'goals. Use it for tokenization of German, '\
17 | 'English and French texts.'
18 | s.rubyforge_project = 'tokenizer'
19 | s.version = Tokenizer::VERSION
20 | s.author = 'Andrei Beliankou'
21 | s.email = 'arbox@yandex.ru'
22 | s.homepage = 'https://github.com/arbox/tokenizer'
23 | s.license = 'MIT'
24 | s.executables << 'tokenize'
25 | s.extra_rdoc_files = FileList['*.rdoc'].to_a
26 | s.required_ruby_version = '>= 1.9.3'
27 | s.files = FileList['lib/**/*.rb',
28 | 'README.rdoc',
29 | 'LICENSE.rdoc',
30 | 'CHANGELOG.rdoc',
31 | '.yardopts',
32 | 'test/**/*',
33 | 'bin/*'].to_a
34 | s.test_files = FileList['test/**/*'].to_a
35 | end
36 |
--------------------------------------------------------------------------------