├── .gem-version ├── .gitignore ├── .travis.yml ├── .yardopts ├── CHANGELOG.rdoc ├── Gemfile ├── LICENSE.rdoc ├── README.rdoc ├── Rakefile ├── bin └── tokenize ├── lib ├── tokenizer.rb └── tokenizer │ ├── tokenizer.rb │ └── version.rb ├── test ├── development_tests │ ├── test_by_tokenizer_dev.rb │ ├── test_de_tokenizer_dev.rb │ ├── test_en_tokenizer_dev.rb │ ├── test_fr_tokenizer_dev.rb │ ├── test_it_tokenizer_dev.rb │ ├── test_parameters.rb │ └── test_ru_tokenizer_dev.rb └── regression_tests │ └── test_de_tokenizer.rb └── tokenizer.gemspec /.gem-version: -------------------------------------------------------------------------------- 1 | 2016-03-28 13:30:07 +0200 0.3.0 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ydoc 2 | rdoc 3 | .yardoc 4 | *.gem 5 | .ruby-version 6 | Gemfile.lock 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | 3 | before_install: 4 | - gem install bundler 5 | 6 | rvm: 7 | - 1.9.3 8 | - jruby-19mode # JRuby in 1.9 mode 9 | - 2.0.0 10 | - 2.1.2 11 | - 2.2.0 12 | - 2.3.0 13 | - rbx-19mode 14 | - ruby-head 15 | 16 | 17 | script: "bundle exec rake test" 18 | 19 | matrix: 20 | allow_failures: 21 | - rvm: rbx-19mode 22 | - rvm: ruby-head 23 | 24 | notifications: 25 | email: 26 | on_success: always 27 | -------------------------------------------------------------------------------- /.yardopts: -------------------------------------------------------------------------------- 1 | --private 2 | --protected 3 | --title 'A simple tokenizer for NLP tasks.' 4 | --main README.rdoc 5 | - 6 | CHANGELOG.rdoc 7 | README.rdoc 8 | LICENSE.rdoc 9 | bin/* 10 | lib/**/* 11 | -------------------------------------------------------------------------------- /CHANGELOG.rdoc: -------------------------------------------------------------------------------- 1 | == COMPLETED 2 | === 0.1.1 3 | * Documentation and the whole project tree updated. No functional improvements. 4 | * Corrected typos. 5 | === 0.1.0 6 | * Introduced a binary tokenizer and a library for embedded tokenization. 7 | * Separation of punctuation marks. 8 | === 0.0.1 9 | * Initial release: simple tokenization. 
10 | 11 | == PLANNED 12 | === 0.2.0 13 | === 0.3.0 14 | === 0.4.0 15 | === 0.5.0 16 | === 0.6.0 17 | === 0.7.0 18 | === 0.8.0 19 | === 0.9.0 20 | === 1.0.0 21 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gemspec 4 | 5 | group :development do 6 | gem 'rdoc', '>= 3.9.1' 7 | gem 'rake', '~> 11.1' 8 | gem 'yard', '~> 0.8' 9 | gem 'bundler', '~> 1.7' 10 | gem 'minitest', '~> 5.8' 11 | gem 'travis', '~> 1.8' 12 | gem 'rubocop', '~> 0.38' 13 | end 14 | -------------------------------------------------------------------------------- /LICENSE.rdoc: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011- Andrei Beliankou, Sven Naumann 2 | University of Trier, Germany 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = Tokenizer 2 | 3 | {RubyGems}[http://rubygems.org/gems/tokenizer] | 4 | {Homepage}[http://bu.chsta.be/projects/tokenizer] | 5 | {Source Code}[https://github.com/arbox/tokenizer] | 6 | {Bug Tracker}[https://github.com/arbox/tokenizer/issues] 7 | 8 | {Gem Version}[https://rubygems.org/gems/tokenizer] 9 | {Build Status}[https://travis-ci.org/arbox/tokenizer] 10 | {Code Climate}[https://codeclimate.com/github/arbox/tokenizer] 11 | {Dependency Status}[https://gemnasium.com/arbox/tokenizer] 12 | 13 | == DESCRIPTION 14 | A simple multilingual tokenizer -- a linguistic tool intended to split a written text 15 | into tokens for NLP tasks. This tool provides a CLI and a library for 16 | linguistic tokenization, which is an unavoidable step for many HLT (Human 17 | Language Technology) tasks in the preprocessing phase for further syntactic, 18 | semantic and other higher-level processing goals. 19 | 20 | The tokenization task involves Sentence Segmentation, Word Segmentation and Boundary 21 | Disambiguation for both of these tasks. 22 | 23 | Use it for tokenization of German, English and Dutch texts. 24 | 25 | === Implemented Algorithms 26 | to be ... 27 | 28 | == INSTALLATION 29 | +Tokenizer+ is provided as a .gem package. Simply install it via 30 | {RubyGems}[http://rubygems.org/gems/tokenizer]. 
31 | 32 | To install +tokenizer+, issue the following command: 33 | $ gem install tokenizer 34 | 35 | If you want to do a system-wide installation, do this as root 36 | (possibly using +sudo+). 37 | 38 | Alternatively, use your Gemfile for dependency management. 39 | 40 | == SYNOPSIS 41 | 42 | You can use +Tokenizer+ in two ways. 43 | * As a command line tool: 44 | $ echo 'Hi, ich gehe in die Schule!' | tokenize 45 | 46 | * As a library for embedded tokenization: 47 | > require 'tokenizer' 48 | > de_tokenizer = Tokenizer::WhitespaceTokenizer.new 49 | > de_tokenizer.tokenize('Ich gehe in die Schule!') 50 | > => ["Ich", "gehe", "in", "die", "Schule", "!"] 51 | 52 | * Customizable PRE and POST lists: 53 | > require 'tokenizer' 54 | > de_tokenizer = Tokenizer::WhitespaceTokenizer.new(:de, { post: Tokenizer::WhitespaceTokenizer::SIMPLE_POST + Tokenizer::WhitespaceTokenizer::PAIR_POST + ['|'] }) 55 | > de_tokenizer.tokenize('Ich gehe|in die Schule!') 56 | > => ["Ich", "gehe", "|in", "die", "Schule", "!"] 57 | 58 | See documentation in the Tokenizer::WhitespaceTokenizer class for details 59 | on particular methods. 60 | 61 | == SUPPORT 62 | 63 | If you have questions, bug reports or any suggestions, please drop me an email :) 64 | Any help is deeply appreciated! 65 | 66 | == CHANGELOG 67 | For details on future plans and work in progress see CHANGELOG.rdoc. 68 | 69 | == CAUTION 70 | This library is a work in progress! Though the interface is mostly complete, 71 | you might encounter some unimplemented features. 72 | 73 | Please contact me with your suggestions, bug reports and feature requests. 74 | 75 | == LICENSE 76 | 77 | +Tokenizer+ is copyrighted software by Andrei Beliankou, 2011- 78 | 79 | You may use, redistribute and change it under the terms provided 80 | in the LICENSE.rdoc file. 81 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | lib_path = File.expand_path('../lib', __FILE__) 2 | $LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path) 3 | 4 | # Rake provides FileUtils and file lists. 5 | require 'rake' 6 | require 'rdoc' 7 | 8 | # clean and clobber tasks. 9 | require 'rake/clean' 10 | CLOBBER.include('rdoc', 11 | 'ydoc', 12 | '.yardoc', 13 | '**/*.gem') 14 | 15 | # Running tests. 16 | require 'rake/testtask' 17 | Rake::TestTask.new do |t| 18 | t.test_files = FileList.new('test/regression_tests/*.rb').to_a 19 | end 20 | 21 | # Generate documentation. 22 | require 'rdoc/task' 23 | RDoc::Task.new do |rdoc| 24 | rdoc.rdoc_files.include('README.rdoc', 25 | 'LICENSE.rdoc', 26 | 'CHANGELOG.rdoc', 27 | 'lib/**/*', 28 | 'bin/*' 29 | ) 30 | rdoc.rdoc_dir = 'rdoc' 31 | end 32 | 33 | require 'yard' 34 | YARD::Rake::YardocTask.new do |ydoc| 35 | ydoc.options += ['-o', 'ydoc', '--no-cache'] 36 | end 37 | 38 | desc 'Document the code using Yard and RDoc.' 39 | task doc: [:clobber, :rdoc, :yard] 40 | 41 | # Custom gem building and releasing tasks. 42 | require 'tokenizer/version' 43 | desc 'Commit pending changes.' 44 | task :commit do 45 | end 46 | 47 | desc 'Create a tag in the repository for the current release.' 48 | task :tag do 49 | end 50 | 51 | desc "Build the gem package tokenizer-#{Tokenizer::VERSION}.gem" 52 | task :build => :clobber do 53 | system 'bundle exec gem build tokenizer.gemspec' 54 | end 55 | 56 | desc 'Deploy the gem package to RubyGems.' 
57 | task release: [:commit, :tag, :build] do 58 | system "gem push tokenizer-#{Tokenizer::VERSION}.gem" 59 | end 60 | 61 | desc 'Open an irb session preloaded with this library.' 62 | task :console do 63 | sh 'irb -I lib -r tokenizer.rb' 64 | end 65 | 66 | task :travis do 67 | sh 'git pull' 68 | message = "#{Time.now}\t#{Tokenizer::VERSION}\n" 69 | File.open('.gem-version', 'w') do |file| 70 | file.write(message) 71 | end 72 | sh 'git add .gem-version' 73 | sh "git commit -m '#{message.chomp}'" 74 | sh 'git push origin master' 75 | end 76 | 77 | task :default => :test 78 | -------------------------------------------------------------------------------- /bin/tokenize: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'tokenizer' 4 | 5 | tokenizer = Tokenizer::WhitespaceTokenizer.new 6 | 7 | while (line = gets) 8 | puts tokenizer.tokenize(line).join("\n") 9 | end 10 | -------------------------------------------------------------------------------- /lib/tokenizer.rb: -------------------------------------------------------------------------------- 1 | require 'tokenizer/tokenizer' 2 | require 'tokenizer/version' 3 | -------------------------------------------------------------------------------- /lib/tokenizer/tokenizer.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # :title: A simple Tokenizer for NLP Tasks. 3 | # :main: README.rdoc 4 | 5 | # A namespace for all project-related stuff. 6 | module Tokenizer 7 | # Simple whitespace-based tokenizer with configurable punctuation detection. 8 | class WhitespaceTokenizer 9 | # Default whitespace separator. 10 | FS = Regexp.new('[[:blank:]]+') 11 | 12 | # Characters only in the role of splittable prefixes. 13 | SIMPLE_PRE = ['¿', '¡'] 14 | 15 | # Characters only in the role of splittable suffixes. 16 | SIMPLE_POST = ['!', '?', ',', ':', ';', '.'] 17 | 18 | # Characters as splittable prefixes with an optional matching suffix. 19 | PAIR_PRE = ['(', '{', '[', '<', '«', '„'] 20 | 21 | # Characters as splittable suffixes with an optional matching prefix. 22 | PAIR_POST = [')', '}', ']', '>', '»', '“'] 23 | 24 | # Characters which can be both prefixes AND suffixes. 25 | PRE_N_POST = ['"', "'"] 26 | 27 | private_constant :FS 28 | 29 | # @param [Symbol] lang Language identifier. 30 | # @param [Hash] options Additional options. 31 | # @option options [Array] :pre Array of splittable prefix characters. 32 | # @option options [Array] :post Array of splittable suffix characters. 33 | # @option options [Array] :pre_n_post Array of characters with 34 | # suffix AND prefix functions. 35 | def initialize(lang = :de, options = {}) 36 | @lang = lang 37 | @options = { 38 | pre: SIMPLE_PRE + PAIR_PRE, 39 | post: SIMPLE_POST + PAIR_POST, 40 | pre_n_post: PRE_N_POST 41 | }.merge(options) 42 | end 43 | 44 | # @param [String] str String to be tokenized. 45 | # @return [Array] Array of tokens. 46 | def tokenize(str) 47 | tokens = sanitize_input(str).split(FS) 48 | return [''] if tokens.empty? 49 | 50 | splittables = @options[:pre] + @options[:post] + @options[:pre_n_post] # use the configured :pre/:post/:pre_n_post lists 51 | pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+") 52 | output = [] 53 | tokens.each do |token| 54 | prefix, stem, suffix = token.partition(pattern) 55 | output << prefix.split('') unless prefix.empty? 56 | output << stem unless stem.empty? 57 | output << suffix.split('') unless suffix.empty? 
58 | end 59 | 60 | output.flatten 61 | end 62 | 63 | alias process tokenize 64 | 65 | private 66 | 67 | # @param [String] str User defined string to be tokenized. 68 | # @return [String] A new modified string. 69 | def sanitize_input(str) 70 | str.chomp.strip 71 | end 72 | end # class 73 | 74 | # @deprecated Use {WhitespaceTokenizer} instead. 75 | class Tokenizer < WhitespaceTokenizer 76 | def initialize(*args) 77 | warn '[Deprecated!] Use WhitespaceTokenizer instead.' 78 | super(*args) 79 | end 80 | end 81 | end # module 82 | -------------------------------------------------------------------------------- /lib/tokenizer/version.rb: -------------------------------------------------------------------------------- 1 | module Tokenizer 2 | VERSION = '0.3.0' 3 | end 4 | -------------------------------------------------------------------------------- /test/development_tests/test_by_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @by_tokenizer = Tokenizer::Tokenizer.new(:by) 9 | end 10 | 11 | def test_tokenization_001 12 | end 13 | 14 | private 15 | def compare(exp_result, input) 16 | act_result = @de_tokenizer.tokenize(input) 17 | assert_equal(exp_result, act_result) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /test/development_tests/test_de_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @de_tokenizer = Tokenizer::Tokenizer.new(:de) 9 | end 10 | 11 | def test_tokenization_001 12 | input = 'ich ging? du, und ich nicht (konnte nicht)? Warum?!!' 13 | etalon = %w{ ich ging ? du , und ich nicht ( konnte nicht ) ? Warum ? ! !} 14 | compare(etalon, input) 15 | end 16 | 17 | def test_tokenization_002 18 | input = "Die deutschen Umlaute und Sonderzeichen, wie in Mäuse, Scheiß und Tütchen, sind blöd!" 19 | etalon = %w{Die deutschen Umlaute und Sonderzeichen , wie in Mäuse , Scheiß und Tütchen , sind blöd !} 20 | compare(etalon, input) 21 | end 22 | 23 | def test_tokenization_003 24 | input = "Abkürzungen, wie z.B. usw. und d.h. können zu Problemem führen." 25 | etalon = %w{Abkürzungen , wie z.B. usw. und d.h. können zu Problemem führen .} 26 | compare(etalon, input) 27 | end 28 | 29 | def test_tokenization_004 30 | input = "Es gibt mehr als 1.023.345 Menschen in Deutschland, die keine Tausenderpunkte verstehen." 31 | etalon = %w{Es gibt mehr als 1.023.345 Menschen in Deutschland , die keine Tausenderpunkte verstehen .} 32 | compare(etalon, input) 33 | end 34 | 35 | def test_tokenization_005 36 | input = "Cocktails, wie Apfel-Martini, Rum-Kirsche-Cola und andere, bereiten nicht nur Menschen Probleme." 37 | etalon = %w{ Cocktails , wie Apfel-Martini , Rum-Kirsche-Cola und andere , bereiten nicht nur Menschen Probleme . } 38 | compare(etalon, input) 39 | end 40 | 41 | def test_tokenization_006 42 | input = 'Es gibt viele verschiedene Zeichen, die noch in Texten vorkommen können wie - zum Beispiel - diese hier "text" oder (text).' 
43 | etalon = %w{Es gibt viele verschiedene Zeichen , die noch in Texten vorkommen können wie - zum Beispiel - diese hier " text " oder ( text ) .} 44 | compare(etalon, input) 45 | end 46 | 47 | def test_tokenization_007 48 | input = "Abkürzungen sind immer ein Problem, da auch Leerzeichen dazwischen stehen können, wie z. B. hier." 49 | etalon = ["Abkürzungen", "sind", "immer", "ein", "Problem", ",", "da", "auch", "Leerzeichen", "dazwischen", "stehen", "können", ",", "wie", "z. B.", "hier", "."] 50 | compare(etalon, input) 51 | end 52 | 53 | def test_tokenization_008 54 | input = "Außerdem kann es nach Abkürzungen und Satzenden auch mit Großschreibung weiter gehen, bei z.B. Aufzählungen." 55 | etalon = %w{Außerdem kann es nach Abkürzungen und Satzenden auch mit Großschreibung weiter gehen , bei z.B. Aufzählungen .} 56 | compare(etalon, input) 57 | end 58 | 59 | def test_tokenization_009 60 | input = "Ein weiteres Problem sind solche Getrennt- und Zusammenschreibungen." 61 | etalon = %w{Ein weiteres Problem sind solche Getrenntschreibungen und Zusammenschreibungen .} 62 | compare(etalon, input) 63 | end 64 | 65 | def test_tokenization_010 66 | input = "In manchen Texten gibt es auch Worttrennung am Zeilen- ende." 67 | etalon = %w{In manchen Texten gibt es auch Worttrennung am Zeilenende .} 68 | compare(etalon, input) 69 | end 70 | 71 | def test_tokenization_011 #Ellipsis 72 | input = "Der Satz endet in einer Ellips..." 73 | etalon = %w{ Der Satz endet in einer Ellips... } #die elliptischen Punkte sollten nicht vom Wort getrennt werden 74 | compare(etalon, input) 75 | end 76 | 77 | def test_tokenization_012 #Fehlende Leerzeichen 78 | input = "Der Satz endet.Das Leerzeichen fehlt." 79 | etalon = %w{ Der Satz endet . Das Leerzeichen fehlt . } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen 80 | compare(etalon, input) 81 | end 82 | 83 | def test_tokenization_013 #Bindestriche 84 | input = "Das Bindeglied - manisch-depressives Verhalten, binden-verbinden" 85 | etalon = %w{ Das Bindeglied - manisch-depressives Verhalten , binden - verbinden} 86 | compare(etalon, input) 87 | end 88 | 89 | def test_tokenization_014 #Abkuerzungen 90 | input = "Der Satz enthielt z.B. Fehler" 91 | etalon = %w{ Der Satz enthielt z.B. Fehler } #/\.\s(?=[A-Z])/ wuerde hinter Punkt den Satz beenden 92 | compare(etalon, input) 93 | end 94 | 95 | def test_tokenization_015 #Fehlende Grossbuchstaben 96 | input = "Der Satz endet. der Satz beginnt" 97 | etalon = %w{ Der Satz endet . der Satz beginnt } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen 98 | compare(etalon, input) 99 | end 100 | 101 | def test_tokenization_016 #Franzoesisch 102 | input = "L'art de l'univers, c'est un art" 103 | etalon = %w{ L' art de l' univers , c'est un art } #Kontrovers! 104 | compare(etalon, input) 105 | end 106 | 107 | def test_tokenization_017 #James Bond 108 | input = "Bond,... James Bond." 109 | etalon = %w{ Bond , ... James Bond . } #Kontrovers! 110 | compare(etalon, input) 111 | end 112 | 113 | def test_tokenization_018 #Inches 114 | input = "The square had four 9\" sides" 115 | etalon = %w{ The square had four 9" sides } 116 | compare(etalon, input) 117 | end 118 | 119 | def test_tokenization_019 #Abkuerzung zugleich Lexikon-Eintrag 120 | input = "In fig. 3, a fig can be seen. Fig. no. 4 shows no fig." 121 | etalon = %w{ In fig. 3 , a fig can be seen . Fig. no. 4 shows no fig . 
} #fig sowohl als Abkuerzung als auch als Wort 122 | compare(etalon, input) 123 | end 124 | 125 | def test_tokenization_020 #Leerzeichen-getrennte Zusammengehörigkeiten 126 | input = "They booked the flight New York-Los Angeles" 127 | etalon = ["They", "booked", "the", "flight", "New York", "-", "Los Angeles"] #oder mit Bindestrich verbunden 128 | compare(etalon, input) 129 | end 130 | 131 | def test_tokenization_021 #Ordinale 132 | input = "Der 1. Platz ging an den Sieger" 133 | etalon = %w{ Der 1. Platz ging an den Sieger } 134 | compare(etalon, input) 135 | end 136 | 137 | def test_tokenization_022 #Klitika 138 | input = "Er war's, stimmt's?" 139 | etalon = %w{ Er war es , stimmt es ? } #Kontrovers! Benoetigt komplexere Analyse 140 | compare(etalon, input) 141 | end 142 | 143 | def test_tokenization_023 #Datums- und Zeitangaben 144 | input = "Es passierte am 13. Januar 2011 um 12:13 Uhr" 145 | etalon = [ "Es", "passierte", "am", "13. Januar 2011", "um", "12:13 Uhr"] 146 | compare(etalon, input) 147 | end 148 | 149 | def test_tokenization_024 #Eingebettete Saetze 150 | input = "\"This is all?\" George asked." 151 | etalon = %w{ This is all ? George asked . } #kann zu ungrammatischen Saetzen fuehren 152 | compare(etalon, input) 153 | end 154 | 155 | def test_tokenization_025 #Eingebettete Saetze 2 156 | input = "\"Das ist alles?\" fragte sie." 157 | etalon = %w{ Das ist alles ? fragte sie . } #ungrammatischer Satz "fragte sie." 158 | compare(etalon, input) 159 | end 160 | 161 | 162 | def test_tokenization_026 163 | input = "Die deutschen Umlaute und Sonderzeichen, wie in Mäuse, Scheiß und Tütchen, sind blöd!" 164 | etalon = %w{ Die deutschen Umlaute und Sonderzeichen , wie in Mäuse , Scheiß und Tütchen , sind blöd ! } 165 | compare(etalon, input) 166 | end 167 | 168 | def test_tokenization_027 169 | input = "Abkürzungen, wie z.B. usw. und d.h. können zu Problemem führen." 170 | etalon = %w{ Abkürzungen , wie z.B. usw. und d.h. können zu Problemem führen . } 171 | compare(etalon, input) 172 | end 173 | 174 | def test_tokenization_028 175 | input = "Es gibt mehr als 1.023.345 Menschen in Deutschland, die keine Tausenderpunkte verstehen." 176 | etalon = %w{ Es gibt mehr als 1.023.345 Menschen in Deutschland , die keine Tausenderpunkte verstehen . } 177 | compare(etalon, input) 178 | end 179 | 180 | def test_tokenization_029 181 | input = "Cocktails, wie Apfel-Martini, Rum-Kirsche-Cola und andere, bereiten nicht nur Menschen Probleme." 182 | etalon = %w{ Cocktails , wie Apfel-Martini , Rum-Kirsche-Cola und andere , bereiten nicht nur Menschen Probleme . } 183 | compare(etalon, input) 184 | end 185 | 186 | def test_tokenization_030 #Ellipsis 187 | input = "Der Satz endet in einer Ellips..." 188 | etalon = %w{ Der Satz endet in einer Ellips... } #die elliptischen Punkte sollten nicht vom Wort getrennt werden 189 | compare(etalon, input) 190 | end 191 | 192 | def test_tokenization_031 #Fehlende Leerzeichen 193 | input = "Der Satz endet.Das Leerzeichen fehlt." 194 | etalon = %w{ Der Satz endet . Das Leerzeichen fehlt . } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen 195 | compare(etalon, input) 196 | end 197 | 198 | def test_tokenization_032 #Bindestriche 199 | input = "Das Bindeglied - manisch-depressives Verhalten, binden-verbinden" 200 | etalon = %w{ Das Bindeglied - manisch-depressives Verhalten , binden - verbinden} 201 | compare(etalon, input) 202 | end 203 | 204 | def test_tokenization_033 #Abkuerzungen 205 | input = "Der Satz enthielt z.B. 
Fehler" 206 | etalon = %w{ Der Satz enthielt z.B. Fehler } #/\.\s(?=[A-Z])/ wuerde hinter Punkt den Satz beenden 207 | compare(etalon, input) 208 | end 209 | 210 | def test_tokenization_034 #Fehlende Grossbuchstaben 211 | input = "Der Satz endet. der Satz beginnt" 212 | etalon = %w{ Der Satz endet . der Satz beginnt } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen 213 | compare(etalon, input) 214 | end 215 | 216 | def test_tokenization_035 #Franzoesisch 217 | input = "L'art de l'univers, c'est un art" 218 | etalon = %w{ L' art de l' univers , c'est un art } #Kontrovers! 219 | compare(etalon, input) 220 | end 221 | 222 | def test_tokenization_036 #James Bond 223 | input = "Bond,... James Bond." 224 | etalon = %w{ Bond , ... James Bond . } #Kontrovers! 225 | compare(etalon, input) 226 | end 227 | 228 | def test_tokenization_037 #Inches 229 | input = "The square had four 9\" sides" 230 | etalon = %w{ The square had four 9" sides } 231 | compare(etalon, input) 232 | end 233 | 234 | def test_tokenization_039 #Abkuerzung zugleich Lexikon-Eintrag 235 | input = "In fig. 3, a fig can be seen. Fig. no. 4 shows no fig." 236 | etalon = %w{ In fig. 3 , a fig can be seen . Fig. no. 4 shows no fig . } #fig sowohl als Abkuerzung als auch als Wort 237 | compare(etalon, input) 238 | end 239 | 240 | def test_tokenization_040 #Leerzeichen-getrennte Zusammengehörigkeiten 241 | input = "They booked the flight New York-Los Angeles" 242 | etalon = ["They", "booked", "the", "flight", "New York", "-", "Los Angeles"] #oder mit Bindestrich verbunden 243 | compare(etalon, input) 244 | end 245 | 246 | def test_tokenization_041 #Ordinale 247 | input = "Der 1. Platz ging an den Sieger" 248 | etalon = %w{ Der 1. Platz ging an den Sieger } 249 | compare(etalon, input) 250 | end 251 | 252 | def test_tokenization_042 #Klitika 253 | input = "Er war's, stimmt's?" 254 | etalon = %w{ Er war es , stimmt es ? } #Kontrovers! Benoetigt komplexere Analyse 255 | compare(etalon, input) 256 | end 257 | 258 | #Datums- und Zeitangaben 259 | def test_tokenization_043 260 | input = "Es passierte am 13. Januar 2011 um 12:13 Uhr" 261 | etalon = ["Es", "passierte", "am", "13. Januar 2011", "um", "12:13 Uhr"] 262 | compare(etalon, input) 263 | end 264 | 265 | #Eingebettete Sätze 266 | def test_tokenization_044 267 | input = '"This is all?" George asked.' 268 | etalon = %w{ This is all ? George asked . } #kann zu ungrammatischen Saetzen fuehren 269 | compare(etalon, input) 270 | end 271 | 272 | def test_tokenization_046 #Eingebettete Saetze 2 273 | input = '"Das ist alles?" fragte sie.' 274 | etalon = %w{Das ist alles ? fragte sie .} #ungrammatischer Satz "fragte sie." 
275 | compare(etalon, input) 276 | end 277 | 278 | private 279 | def compare(exp_result, input) 280 | act_result = @de_tokenizer.tokenize(input) 281 | assert_equal(exp_result, act_result) 282 | end 283 | end 284 | -------------------------------------------------------------------------------- /test/development_tests/test_en_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @en_tokenizer = Tokenizer::Tokenizer.new(:en) 9 | end 10 | 11 | def test_tokenization_001 12 | result = @en_tokenizer.tokenize('testing normal, english sentence') 13 | assert_equal(['testing', 'normal', ',', 'english', 'sentence', ''], result) 14 | end 15 | 16 | private 17 | def compare(exp_result, input) 18 | act_result = @en_tokenizer.tokenize(input) 19 | assert_equal(exp_result, act_result) 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /test/development_tests/test_fr_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @fr_tokenizer = Tokenizer::Tokenizer.new(:fr) 9 | end 10 | 11 | def test_tokenization_001 12 | end 13 | 14 | private 15 | def compare(exp_result, input) 16 | act_result = @fr_tokenizer.tokenize(input) 17 | assert_equal(exp_result, act_result) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /test/development_tests/test_it_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @it_tokenizer = Tokenizer::Tokenizer.new(:it) 9 | end 10 | 11 | def test_tokenization_001 12 | end 13 | 14 | private 15 | def compare(exp_result, input) 16 | act_result = @it_tokenizer.tokenize(input) 17 | assert_equal(exp_result, act_result) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /test/development_tests/test_parameters.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @en_tokenizer = Tokenizer::Tokenizer.new(:en, { pre: [], post: ['|'] }) 9 | end 10 | 11 | def test_tokenization_001 12 | result = @en_tokenizer.tokenize('testing| new') 13 | assert_equal(['testing', '|', 'new', ''], result) 14 | end 15 | 16 | def test_tokenization_002 17 | result = @en_tokenizer.tokenize('testing, new') 18 | assert_equal(['testing,', 'new', ''], result) 19 | end 20 | 21 | private 22 | def compare(exp_result, input) 23 | act_result = @en_tokenizer.tokenize(input) 24 | assert_equal(exp_result, act_result) 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /test/development_tests/test_ru_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @ru_tokenizer = 
Tokenizer::Tokenizer.new(:ru) 9 | end 10 | 11 | def test_tokenization_001 12 | end 13 | 14 | private 15 | def compare(exp_result, input) 16 | act_result = @ru_tokenizer.tokenize(input) 17 | assert_equal(exp_result, act_result) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /test/regression_tests/test_de_tokenizer.rb: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | require 'minitest/autorun' 3 | require 'minitest/spec' 4 | require 'tokenizer' 5 | 6 | class TestTokenizer < Minitest::Test 7 | 8 | def setup 9 | @t = Tokenizer::Tokenizer.new(:de) 10 | end 11 | 12 | def test_constants 13 | assert(Tokenizer::VERSION.is_a?(String) && !Tokenizer::VERSION.empty?) 14 | end 15 | 16 | def test_output_type 17 | output = @t.tokenize('ich gehe in die Schule') 18 | assert(output.is_a?(Array)) 19 | end 20 | 21 | def test_tokenization_001 22 | input = 'Ich ging in die Schule!' 23 | etalon = %w(Ich ging in die Schule !) 24 | output = @t.tokenize(input) 25 | assert_equal(etalon, output) 26 | end 27 | 28 | def test_tokenization_002 29 | input = '" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .' 30 | etalon = %w(" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .) 31 | output = @t.tokenize(input) 32 | assert_equal(etalon, output) 33 | end 34 | end 35 | 36 | describe Tokenizer do 37 | describe 'empty input' do 38 | it 'should return an Array with an empty string' do 39 | tokens = Tokenizer::Tokenizer.new.tokenize('') 40 | tokens.must_equal(['']) 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /tokenizer.gemspec: -------------------------------------------------------------------------------- 1 | lib_path = File.expand_path('../lib', __FILE__) 2 | $LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path) 3 | 4 | require 'tokenizer/version' 5 | require 'rake' 6 | 7 | Gem::Specification.new do |s| 8 | s.name = 'tokenizer' 9 | s.summary = 'Tokenizer is a tool intended to split a text into tokens.' 10 | 11 | s.description = 'A simple multilingual tokenizer for NLP tasks. This tool '\ 12 | 'provides a CLI and a library for linguistic tokenization '\ 13 | 'which is an unavoidable step for many HLT (human language '\ 14 | 'technology) tasks in the preprocessing phase for further '\ 15 | 'syntactic, semantic and other higher-level processing '\ 16 | 'goals. Use it for tokenization of German, '\ 17 | 'English and French texts.' 18 | s.rubyforge_project = 'tokenizer' 19 | s.version = Tokenizer::VERSION 20 | s.author = 'Andrei Beliankou' 21 | s.email = 'arbox@yandex.ru' 22 | s.homepage = 'https://github.com/arbox/tokenizer' 23 | s.license = 'MIT' 24 | s.executables << 'tokenize' 25 | s.extra_rdoc_files = FileList['*.rdoc'].to_a 26 | s.required_ruby_version = '>= 1.9.3' 27 | s.files = FileList['lib/**/*.rb', 28 | 'README.rdoc', 29 | 'LICENSE.rdoc', 30 | 'CHANGELOG.rdoc', 31 | '.yardopts', 32 | 'test/**/*', 33 | 'bin/*'].to_a 34 | s.test_files = FileList['test/**/*'].to_a 35 | end 36 | --------------------------------------------------------------------------------
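A minimal usage sketch of the library shown above, for quick reference. It is illustrative only: it assumes the gem has been installed with "gem install tokenizer" as described in README.rdoc, the file name is hypothetical, and the second, customised tokenizer relies on the :post option documented in WhitespaceTokenizer#initialize, so its exact output depends on how the installed version applies that option.

  # usage_sketch.rb -- illustrative example, not part of the gem sources above.
  require 'tokenizer'

  # Default German tokenizer: splits on whitespace and detaches punctuation.
  de_tokenizer = Tokenizer::WhitespaceTokenizer.new(:de)
  p de_tokenizer.tokenize('Ich gehe in die Schule!')
  # => ["Ich", "gehe", "in", "die", "Schule", "!"]

  # Tokenizer with an extended suffix list: the default POST characters plus '|'.
  custom = Tokenizer::WhitespaceTokenizer.new(
    :de,
    post: Tokenizer::WhitespaceTokenizer::SIMPLE_POST +
          Tokenizer::WhitespaceTokenizer::PAIR_POST + ['|']
  )
  p custom.tokenize('Ich gehe| in die Schule!')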