├── .gem-version ├── .gitignore ├── .travis.yml ├── .yardopts ├── CHANGELOG.rdoc ├── Gemfile ├── LICENSE.rdoc ├── README.rdoc ├── Rakefile ├── bin └── tokenize ├── lib ├── tokenizer.rb └── tokenizer │ ├── tokenizer.rb │ └── version.rb ├── test ├── development_tests │ ├── test_by_tokenizer_dev.rb │ ├── test_de_tokenizer_dev.rb │ ├── test_en_tokenizer_dev.rb │ ├── test_fr_tokenizer_dev.rb │ ├── test_it_tokenizer_dev.rb │ ├── test_parameters.rb │ └── test_ru_tokenizer_dev.rb └── regression_tests │ └── test_de_tokenizer.rb └── tokenizer.gemspec /.gem-version: -------------------------------------------------------------------------------- 1 | 2016-03-28 13:30:07 +0200 0.3.0 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ydoc 2 | rdoc 3 | .yardoc 4 | *.gem 5 | .ruby-version 6 | Gemfile.lock 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | 3 | before_install: 4 | - gem install bundler 5 | 6 | rvm: 7 | - 1.9.3 8 | - jruby-19mode # JRuby in 1.9 mode 9 | - 2.0.0 10 | - 2.1.2 11 | - 2.2.0 12 | - 2.3.0 13 | - rbx-19mode 14 | - ruby-head 15 | 16 | 17 | script: "bundle exec rake test" 18 | 19 | matrix: 20 | allow_failures: 21 | - rvm: rbx-19mode 22 | - rvm: ruby-head 23 | 24 | notifications: 25 | email: 26 | on_success: always 27 | -------------------------------------------------------------------------------- /.yardopts: -------------------------------------------------------------------------------- 1 | --private 2 | --protected 3 | --title 'A simple tokenizer for NLP tasks.' 4 | --main README.rdoc 5 | - 6 | CHANGELOG.rdoc 7 | README.rdoc 8 | LICENSE.rdoc 9 | bin/* 10 | lib/**/* 11 | -------------------------------------------------------------------------------- /CHANGELOG.rdoc: -------------------------------------------------------------------------------- 1 | == COMPLETED 2 | === 0.1.1 3 | * Documentation and the whole project tree updated. No functional improvements. 4 | * Corrected typos. 5 | === 0.1.0 6 | * Introduced a binary tokenizer and a library for embedded tokenization. 7 | * Separation of punctuation marks. 8 | === 0.0.1 9 | * Initial release: simple tokenization. 
10 | 11 | == PLANNED 12 | === 0.2.0 13 | === 0.3.0 14 | === 0.4.0 15 | === 0.5.0 16 | === 0.6.0 17 | === 0.7.0 18 | === 0.8.0 19 | === 0.9.0 20 | === 1.0.0 21 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gemspec 4 | 5 | group :development do 6 | gem 'rdoc', '>= 3.9.1' 7 | gem 'rake', '~> 11.1' 8 | gem 'yard', '~> 0.8' 9 | gem 'bundler', '~> 1.7' 10 | gem 'minitest', '~> 5.8' 11 | gem 'travis', '~> 1.8' 12 | gem 'rubocop', '~> 0.38' 13 | end 14 | -------------------------------------------------------------------------------- /LICENSE.rdoc: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011- Andrei Beliankou, Sven Naumann 2 | University of Trier, Germany 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = Tokenizer 2 | 3 | {RubyGems}[http://rubygems.org/gems/tokenizer] | 4 | {Homepage}[http://bu.chsta.be/projects/tokenizer] | 5 | {Source Code}[https://github.com/arbox/tokenizer] | 6 | {Bug Tracker}[https://github.com/arbox/tokenizer/issues] 7 | 8 | {Gem Version}[https://rubygems.org/gems/tokenizer] 9 | {Build Status}[https://travis-ci.org/arbox/tokenizer] 10 | {Code Climate}[https://codeclimate.com/github/arbox/tokenizer] 11 | {Dependency Status}[https://gemnasium.com/arbox/tokenizer] 12 | 13 | == DESCRIPTION 14 | A simple multilingual tokenizer -- a linguistic tool intended to split a written text 15 | into tokens for NLP tasks. This tool provides a CLI and a library for 16 | linguistic tokenization, which is an unavoidable step for many HLT (Human 17 | Language Technology) tasks in the preprocessing phase for further syntactic, 18 | semantic and other higher-level processing goals. 19 | 20 | The tokenization task involves Sentence Segmentation, Word Segmentation and Boundary 21 | Disambiguation for both of these tasks. 22 | 23 | Use it for tokenization of German, English and Dutch texts. 24 | 25 | === Implemented Algorithms 26 | to be ... 27 | 28 | == INSTALLATION 29 | +Tokenizer+ is provided as a .gem package. Simply install it via 30 | {RubyGems}[http://rubygems.org/gems/tokenizer]. 
31 | 32 | To install +tokenizer+, issue the following command: 33 | $ gem install tokenizer 34 | 35 | If you want to do a system-wide installation, do this as root 36 | (possibly using +sudo+). 37 | 38 | Alternatively, use your Gemfile for dependency management. 39 | 40 | == SYNOPSIS 41 | 42 | You can use +Tokenizer+ in two ways. 43 | * As a command line tool: 44 | $ echo 'Hi, ich gehe in die Schule!' | tokenize 45 | 46 | * As a library for embedded tokenization: 47 | > require 'tokenizer' 48 | > de_tokenizer = Tokenizer::WhitespaceTokenizer.new 49 | > de_tokenizer.tokenize('Ich gehe in die Schule!') 50 | > => ["Ich", "gehe", "in", "die", "Schule", "!"] 51 | 52 | * Customizable PRE and POST lists: 53 | > require 'tokenizer' 54 | > de_tokenizer = Tokenizer::WhitespaceTokenizer.new(:de, { post: Tokenizer::WhitespaceTokenizer::SIMPLE_POST + Tokenizer::WhitespaceTokenizer::PAIR_POST + ['|'] }) 55 | > de_tokenizer.tokenize('Ich gehe|in die Schule!') 56 | > => ["Ich", "gehe", "|in", "die", "Schule", "!"] 57 | 58 | See documentation in the Tokenizer::WhitespaceTokenizer class for details 59 | on particular methods. 60 | 61 | == SUPPORT 62 | 63 | If you have questions, bug reports or any suggestions, please drop me an email :) 64 | Any help is deeply appreciated! 65 | 66 | == CHANGELOG 67 | For details on future plans and work in progress see CHANGELOG.rdoc. 68 | 69 | == CAUTION 70 | This library is a work in progress! Though the interface is mostly complete, 71 | you might encounter some unimplemented features. 72 | 73 | Please contact me with your suggestions, bug reports and feature requests. 74 | 75 | == LICENSE 76 | 77 | +Tokenizer+ is copyrighted software by Andrei Beliankou, 2011- 78 | 79 | You may use, redistribute and change it under the terms provided 80 | in the LICENSE.rdoc file. 81 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | lib_path = File.expand_path('../lib', __FILE__) 2 | $LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path) 3 | 4 | # Rake provides FileUtils and file lists. 5 | require 'rake' 6 | require 'rdoc' 7 | 8 | # clean and clobber tasks. 9 | require 'rake/clean' 10 | CLOBBER.include('rdoc', 11 | 'ydoc', 12 | '.yardoc', 13 | '**/*.gem') 14 | 15 | # Running tests. 16 | require 'rake/testtask' 17 | Rake::TestTask.new do |t| 18 | t.test_files = FileList.new('test/regression_tests/*.rb').to_a 19 | end 20 | 21 | # Generate documentation. 22 | require 'rdoc/task' 23 | RDoc::Task.new do |rdoc| 24 | rdoc.rdoc_files.include('README.rdoc', 25 | 'LICENSE.rdoc', 26 | 'CHANGELOG.rdoc', 27 | 'lib/**/*', 28 | 'bin/*' 29 | ) 30 | rdoc.rdoc_dir = 'rdoc' 31 | end 32 | 33 | require 'yard' 34 | YARD::Rake::YardocTask.new do |ydoc| 35 | ydoc.options += ['-o', 'ydoc', '--no-cache'] 36 | end 37 | 38 | desc 'Document the code using Yard and RDoc.' 39 | task doc: [:clobber, :rdoc, :yard] 40 | 41 | # Custom gem building and releasing tasks. 42 | require 'tokenizer/version' 43 | desc 'Commit pending changes.' 44 | task :commit do 45 | end 46 | 47 | desc 'Create a tag in the repository for the current release.' 48 | task :tag do 49 | end 50 | 51 | desc "Build the gem package tokenizer-#{Tokenizer::VERSION}.gem" 52 | task :build => :clobber do 53 | system 'bundle exec gem build tokenizer.gemspec' 54 | end 55 | 56 | desc 'Deploy the gem package to RubyGems.' 
57 | task release: [:commit, :tag, :build] do 58 | system "gem push tokenizer-#{Tokenizer::VERSION}.gem" 59 | end 60 | 61 | desc 'Open an irb session preloaded with this library.' 62 | task :console do 63 | sh 'irb -I lib -r tokenizer.rb' 64 | end 65 | 66 | task :travis do 67 | sh 'git pull' 68 | message = "#{Time.now}\t#{Tokenizer::VERSION}\n" 69 | File.open('.gem-version', 'w') do |file| 70 | file.write(message) 71 | end 72 | sh 'git add .gem-version' 73 | sh "git commit -m '#{message.chomp}'" 74 | sh 'git push origin master' 75 | end 76 | 77 | task :default => :test 78 | -------------------------------------------------------------------------------- /bin/tokenize: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'tokenizer' 4 | 5 | tokenizer = Tokenizer::WhitespaceTokenizer.new 6 | 7 | while (line = gets) 8 | puts tokenizer.tokenize(line).join("\n") 9 | end 10 | -------------------------------------------------------------------------------- /lib/tokenizer.rb: -------------------------------------------------------------------------------- 1 | require 'tokenizer/tokenizer' 2 | require 'tokenizer/version' 3 | -------------------------------------------------------------------------------- /lib/tokenizer/tokenizer.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # :title: A simple Tokenizer for NLP Tasks. 3 | # :main: README.rdoc 4 | 5 | # A namespace for all project-related stuff. 6 | module Tokenizer 7 | # Simple whitespace-based tokenizer with configurable punctuation detection. 8 | class WhitespaceTokenizer 9 | # Default whitespace separator. 10 | FS = Regexp.new('[[:blank:]]+') 11 | 12 | # Characters only in the role of splittable prefixes. 13 | SIMPLE_PRE = ['¿', '¡'] 14 | 15 | # Characters only in the role of splittable suffixes. 16 | SIMPLE_POST = ['!', '?', ',', ':', ';', '.'] 17 | 18 | # Characters as splittable prefixes with an optional matching suffix. 19 | PAIR_PRE = ['(', '{', '[', '<', '«', '„'] 20 | 21 | # Characters as splittable suffixes with an optional matching prefix. 22 | PAIR_POST = [')', '}', ']', '>', '»', '“'] 23 | 24 | # Characters which can be both prefixes AND suffixes. 25 | PRE_N_POST = ['"', "'"] 26 | 27 | private_constant :FS 28 | 29 | # @param [Symbol] lang Language identifier. 30 | # @param [Hash] options Additional options. 31 | # @option options [Array] :pre Array of splittable prefix characters. 32 | # @option options [Array] :post Array of splittable suffix characters. 33 | # @option options [Array] :pre_n_post Array of characters with 34 | # suffix AND prefix functions. 35 | def initialize(lang = :de, options = {}) 36 | @lang = lang 37 | @options = { 38 | pre: SIMPLE_PRE + PAIR_PRE, 39 | post: SIMPLE_POST + PAIR_POST, 40 | pre_n_post: PRE_N_POST 41 | }.merge(options) 42 | end 43 | 44 | # @param [String] str String to be tokenized. 45 | # @return [Array] Array of tokens. 46 | def tokenize(str) 47 | tokens = sanitize_input(str).split(FS) 48 | return [''] if tokens.empty? 49 | 50 | splittables = @options[:pre] + @options[:post] + @options[:pre_n_post] # use the configured :pre/:post/:pre_n_post lists 51 | pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+") 52 | output = [] 53 | tokens.each do |token| 54 | prefix, stem, suffix = token.partition(pattern) 55 | output << prefix.split('') unless prefix.empty? 56 | output << stem unless stem.empty? 57 | output << suffix.split('') unless suffix.empty? 
58 | end 59 | 60 | output.flatten 61 | end 62 | 63 | alias process tokenize 64 | 65 | private 66 | 67 | # @param [String] str User defined string to be tokenized. 68 | # @return [String] A new modified string. 69 | def sanitize_input(str) 70 | str.chomp.strip 71 | end 72 | end # class 73 | 74 | # @deprecated Use {WhitespaceTokenizer} instead. 75 | class Tokenizer < WhitespaceTokenizer 76 | def initialize(*args) 77 | warn '[Deprecated!] Use WhitespaceTokenizer instead.' 78 | super(*args) 79 | end 80 | end 81 | end # module 82 | -------------------------------------------------------------------------------- /lib/tokenizer/version.rb: -------------------------------------------------------------------------------- 1 | module Tokenizer 2 | VERSION = '0.3.0' 3 | end 4 | -------------------------------------------------------------------------------- /test/development_tests/test_by_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @by_tokenizer = Tokenizer::Tokenizer.new(:by) 9 | end 10 | 11 | def test_tokenization_001 12 | end 13 | 14 | private 15 | def compare(exp_result, input) 16 | act_result = @de_tokenizer.tokenize(input) 17 | assert_equal(exp_result, act_result) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /test/development_tests/test_de_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @de_tokenizer = Tokenizer::Tokenizer.new(:de) 9 | end 10 | 11 | def test_tokenization_001 12 | input = 'ich ging? du, und ich nicht (konnte nicht)? Warum?!!' 13 | etalon = %w{ ich ging ? du , und ich nicht ( konnte nicht ) ? Warum ? ! !} 14 | compare(etalon, input) 15 | end 16 | 17 | def test_tokenization_002 18 | input = "Die deutschen Umlaute und Sonderzeichen, wie in Mäuse, Scheiß und Tütchen, sind blöd!" 19 | etalon = %w{Die deutschen Umlaute und Sonderzeichen , wie in Mäuse , Scheiß und Tütchen , sind blöd !} 20 | compare(etalon, input) 21 | end 22 | 23 | def test_tokenization_003 24 | input = "Abkürzungen, wie z.B. usw. und d.h. können zu Problemem führen." 25 | etalon = %w{Abkürzungen , wie z.B. usw. und d.h. können zu Problemem führen .} 26 | compare(etalon, input) 27 | end 28 | 29 | def test_tokenization_004 30 | input = "Es gibt mehr als 1.023.345 Menschen in Deutschland, die keine Tausenderpunkte verstehen." 31 | etalon = %w{Es gibt mehr als 1.023.345 Menschen in Deutschland , die keine Tausenderpunkte verstehen .} 32 | compare(etalon, input) 33 | end 34 | 35 | def test_tokenization_005 36 | input = "Cocktails, wie Apfel-Martini, Rum-Kirsche-Cola und andere, bereiten nicht nur Menschen Probleme." 37 | etalon = %w{ Cocktails , wie Apfel-Martini , Rum-Kirsche-Cola und andere , bereiten nicht nur Menschen Probleme . } 38 | compare(etalon, input) 39 | end 40 | 41 | def test_tokenization_006 42 | input = 'Es gibt viele verschiedene Zeichen, die noch in Texten vorkommen können wie - zum Beispiel - diese hier "text" oder (text).' 
43 | etalon = %w{Es gibt viele verschiedene Zeichen , die noch in Texten vorkommen können wie - zum Beispiel - diese hier " text " oder ( text ) .} 44 | compare(etalon, input) 45 | end 46 | 47 | def test_tokenization_007 48 | input = "Abkürzungen sind immer ein Problem, da auch Leerzeichen dazwischen stehen können, wie z. B. hier." 49 | etalon = ["Abkürzungen", "sind", "immer", "ein", "Problem", ",", "da", "auch", "Leerzeichen", "dazwischen", "stehen", "können", ",", "wie", "z. B.", "hier", "."] 50 | compare(etalon, input) 51 | end 52 | 53 | def test_tokenization_008 54 | input = "Außerdem kann es nach Abkürzungen und Satzenden auch mit Großschreibung weiter gehen, bei z.B. Aufzählungen." 55 | etalon = %w{Außerdem kann es nach Abkürzungen und Satzenden auch mit Großschreibung weiter gehen , bei z.B. Aufzählungen .} 56 | compare(etalon, input) 57 | end 58 | 59 | def test_tokenization_009 60 | input = "Ein weiteres Problem sind solche Getrennt- und Zusammenschreibungen." 61 | etalon = %w{Ein weiteres Problem sind solche Getrenntschreibungen und Zusammenschreibungen .} 62 | compare(etalon, input) 63 | end 64 | 65 | def test_tokenization_010 66 | input = "In manchen Texten gibt es auch Worttrennung am Zeilen- ende." 67 | etalon = %w{In manchen Texten gibt es auch Worttrennung am Zeilenende .} 68 | compare(etalon, input) 69 | end 70 | 71 | def test_tokenization_011 #Ellipsis 72 | input = "Der Satz endet in einer Ellips..." 73 | etalon = %w{ Der Satz endet in einer Ellips... } #die elliptischen Punkte sollten nicht vom Wort getrennt werden 74 | compare(etalon, input) 75 | end 76 | 77 | def test_tokenization_012 #Fehlende Leerzeichen 78 | input = "Der Satz endet.Das Leerzeichen fehlt." 79 | etalon = %w{ Der Satz endet . Das Leerzeichen fehlt . } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen 80 | compare(etalon, input) 81 | end 82 | 83 | def test_tokenization_013 #Bindestriche 84 | input = "Das Bindeglied - manisch-depressives Verhalten, binden-verbinden" 85 | etalon = %w{ Das Bindeglied - manisch-depressives Verhalten , binden - verbinden} 86 | compare(etalon, input) 87 | end 88 | 89 | def test_tokenization_014 #Abkuerzungen 90 | input = "Der Satz enthielt z.B. Fehler" 91 | etalon = %w{ Der Satz enthielt z.B. Fehler } #/\.\s(?=[A-Z])/ wuerde hinter Punkt den Satz beenden 92 | compare(etalon, input) 93 | end 94 | 95 | def test_tokenization_015 #Fehlende Grossbuchstaben 96 | input = "Der Satz endet. der Satz beginnt" 97 | etalon = %w{ Der Satz endet . der Satz beginnt } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen 98 | compare(etalon, input) 99 | end 100 | 101 | def test_tokenization_016 #Franzoesisch 102 | input = "L'art de l'univers, c'est un art" 103 | etalon = %w{ L' art de l' univers , c'est un art } #Kontrovers! 104 | compare(etalon, input) 105 | end 106 | 107 | def test_tokenization_017 #James Bond 108 | input = "Bond,... James Bond." 109 | etalon = %w{ Bond , ... James Bond . } #Kontrovers! 110 | compare(etalon, input) 111 | end 112 | 113 | def test_tokenization_018 #Inches 114 | input = "The square had four 9\" sides" 115 | etalon = %w{ The square had four 9" sides } 116 | compare(etalon, input) 117 | end 118 | 119 | def test_tokenization_019 #Abkuerzung zugleich Lexikon-Eintrag 120 | input = "In fig. 3, a fig can be seen. Fig. no. 4 shows no fig." 121 | etalon = %w{ In fig. 3 , a fig can be seen . Fig. no. 4 shows no fig . 
} #fig sowohl als Abkuerzung als auch als Wort 122 | compare(etalon, input) 123 | end 124 | 125 | def test_tokenization_020 #Leerzeichen-getrennte Zusammengehörigkeiten 126 | input = "They booked the flight New York-Los Angeles" 127 | etalon = ["They", "booked", "the", "flight", "New York", "-", "Los Angeles"] #oder mit Bindestrich verbunden 128 | compare(etalon, input) 129 | end 130 | 131 | def test_tokenization_021 #Ordinale 132 | input = "Der 1. Platz ging an den Sieger" 133 | etalon = %w{ Der 1. Platz ging an den Sieger } 134 | compare(etalon, input) 135 | end 136 | 137 | def test_tokenization_022 #Klitika 138 | input = "Er war's, stimmt's?" 139 | etalon = %w{ Er war es , stimmt es ? } #Kontrovers! Benoetigt komplexere Analyse 140 | compare(etalon, input) 141 | end 142 | 143 | def test_tokenization_023 #Datums- und Zeitangaben 144 | input = "Es passierte am 13. Januar 2011 um 12:13 Uhr" 145 | etalon = [ "Es", "passierte", "am", "13. Januar 2011", "um", "12:13 Uhr"] 146 | compare(etalon, input) 147 | end 148 | 149 | def test_tokenization_024 #Eingebettete Saetze 150 | input = "\"This is all?\" George asked." 151 | etalon = %w{ This is all ? George asked . } #kann zu ungrammatischen Saetzen fuehren 152 | compare(etalon, input) 153 | end 154 | 155 | def test_tokenization_025 #Eingebettete Saetze 2 156 | input = "\"Das ist alles?\" fragte sie." 157 | etalon = %w{ Das ist alles ? fragte sie . } #ungrammatischer Satz "fragte sie." 158 | compare(etalon, input) 159 | end 160 | 161 | 162 | def test_tokenization_026 163 | input = "Die deutschen Umlaute und Sonderzeichen, wie in Mäuse, Scheiß und Tütchen, sind blöd!" 164 | etalon = %w{ Die deutschen Umlaute und Sonderzeichen , wie in Mäuse , Scheiß und Tütchen , sind blöd ! } 165 | compare(etalon, input) 166 | end 167 | 168 | def test_tokenization_027 169 | input = "Abkürzungen, wie z.B. usw. und d.h. können zu Problemem führen." 170 | etalon = %w{ Abkürzungen , wie z.B. usw. und d.h. können zu Problemem führen . } 171 | compare(etalon, input) 172 | end 173 | 174 | def test_tokenization_028 175 | input = "Es gibt mehr als 1.023.345 Menschen in Deutschland, die keine Tausenderpunkte verstehen." 176 | etalon = %w{ Es gibt mehr als 1.023.345 Menschen in Deutschland , die keine Tausenderpunkte verstehen . } 177 | compare(etalon, input) 178 | end 179 | 180 | def test_tokenization_029 181 | input = "Cocktails, wie Apfel-Martini, Rum-Kirsche-Cola und andere, bereiten nicht nur Menschen Probleme." 182 | etalon = %w{ Cocktails , wie Apfel-Martini , Rum-Kirsche-Cola und andere , bereiten nicht nur Menschen Probleme . } 183 | compare(etalon, input) 184 | end 185 | 186 | def test_tokenization_030 #Ellipsis 187 | input = "Der Satz endet in einer Ellips..." 188 | etalon = %w{ Der Satz endet in einer Ellips... } #die elliptischen Punkte sollten nicht vom Wort getrennt werden 189 | compare(etalon, input) 190 | end 191 | 192 | def test_tokenization_031 #Fehlende Leerzeichen 193 | input = "Der Satz endet.Das Leerzeichen fehlt." 194 | etalon = %w{ Der Satz endet . Das Leerzeichen fehlt . } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen 195 | compare(etalon, input) 196 | end 197 | 198 | def test_tokenization_032 #Bindestriche 199 | input = "Das Bindeglied - manisch-depressives Verhalten, binden-verbinden" 200 | etalon = %w{ Das Bindeglied - manisch-depressives Verhalten , binden - verbinden} 201 | compare(etalon, input) 202 | end 203 | 204 | def test_tokenization_033 #Abkuerzungen 205 | input = "Der Satz enthielt z.B. 
Fehler" 206 | etalon = %w{ Der Satz enthielt z.B. Fehler } #/\.\s(?=[A-Z])/ wuerde hinter Punkt den Satz beenden 207 | compare(etalon, input) 208 | end 209 | 210 | def test_tokenization_034 #Fehlende Grossbuchstaben 211 | input = "Der Satz endet. der Satz beginnt" 212 | etalon = %w{ Der Satz endet . der Satz beginnt } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen 213 | compare(etalon, input) 214 | end 215 | 216 | def test_tokenization_035 #Franzoesisch 217 | input = "L'art de l'univers, c'est un art" 218 | etalon = %w{ L' art de l' univers , c'est un art } #Kontrovers! 219 | compare(etalon, input) 220 | end 221 | 222 | def test_tokenization_036 #James Bond 223 | input = "Bond,... James Bond." 224 | etalon = %w{ Bond , ... James Bond . } #Kontrovers! 225 | compare(etalon, input) 226 | end 227 | 228 | def test_tokenization_037 #Inches 229 | input = "The square had four 9\" sides" 230 | etalon = %w{ The square had four 9" sides } 231 | compare(etalon, input) 232 | end 233 | 234 | def test_tokenization_039 #Abkuerzung zugleich Lexikon-Eintrag 235 | input = "In fig. 3, a fig can be seen. Fig. no. 4 shows no fig." 236 | etalon = %w{ In fig. 3 , a fig can be seen . Fig. no. 4 shows no fig . } #fig sowohl als Abkuerzung als auch als Wort 237 | compare(etalon, input) 238 | end 239 | 240 | def test_tokenization_040 #Leerzeichen-getrennte Zusammengehörigkeiten 241 | input = "They booked the flight New York-Los Angeles" 242 | etalon = ["They", "booked", "the", "flight", "New York", "-", "Los Angeles"] #oder mit Bindestrich verbunden 243 | compare(etalon, input) 244 | end 245 | 246 | def test_tokenization_041 #Ordinale 247 | input = "Der 1. Platz ging an den Sieger" 248 | etalon = %w{ Der 1. Platz ging an den Sieger } 249 | compare(etalon, input) 250 | end 251 | 252 | def test_tokenization_042 #Klitika 253 | input = "Er war's, stimmt's?" 254 | etalon = %w{ Er war es , stimmt es ? } #Kontrovers! Benoetigt komplexere Analyse 255 | compare(etalon, input) 256 | end 257 | 258 | #Datums- und Zeitangaben 259 | def test_tokenization_043 260 | input = "Es passierte am 13. Januar 2011 um 12:13 Uhr" 261 | etalon = ["Es", "passierte", "am", "13. Januar 2011", "um", "12:13 Uhr"] 262 | compare(etalon, input) 263 | end 264 | 265 | #Eingebettete Sätze 266 | def test_tokenization_044 267 | input = '"This is all?" George asked.' 268 | etalon = %w{ This is all ? George asked . } #kann zu ungrammatischen Saetzen fuehren 269 | compare(etalon, input) 270 | end 271 | 272 | def test_tokenization_046 #Eingebettete Saetze 2 273 | input = '"Das ist alles?" fragte sie.' 274 | etalon = %w{Das ist alles ? fragte sie .} #ungrammatischer Satz "fragte sie." 
275 | compare(etalon, input) 276 | end 277 | 278 | private 279 | def compare(exp_result, input) 280 | act_result = @de_tokenizer.tokenize(input) 281 | assert_equal(exp_result, act_result) 282 | end 283 | end 284 | -------------------------------------------------------------------------------- /test/development_tests/test_en_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @en_tokenizer = Tokenizer::Tokenizer.new(:en) 9 | end 10 | 11 | def test_tokenization_001 12 | result = @en_tokenizer.tokenize('testing normal, english sentence') 13 | assert_equal(['testing', 'normal', ',', 'english', 'sentence', ''], result) 14 | end 15 | 16 | private 17 | def compare(exp_result, input) 18 | act_result = @en_tokenizer.tokenize(input) 19 | assert_equal(exp_result, act_result) 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /test/development_tests/test_fr_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @fr_tokenizer = Tokenizer::Tokenizer.new(:fr) 9 | end 10 | 11 | def test_tokenization_001 12 | end 13 | 14 | private 15 | def compare(exp_result, input) 16 | act_result = @fr_tokenizer.tokenize(input) 17 | assert_equal(exp_result, act_result) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /test/development_tests/test_it_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @it_tokenizer = Tokenizer::Tokenizer.new(:it) 9 | end 10 | 11 | def test_tokenization_001 12 | end 13 | 14 | private 15 | def compare(exp_result, input) 16 | act_result = @it_tokenizer.tokenize(input) 17 | assert_equal(exp_result, act_result) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /test/development_tests/test_parameters.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @en_tokenizer = Tokenizer::Tokenizer.new(:en, { pre: [], post: ['|'] }) 9 | end 10 | 11 | def test_tokenization_001 12 | result = @en_tokenizer.tokenize('testing| new') 13 | assert_equal(['testing', '|', 'new', ''], result) 14 | end 15 | 16 | def test_tokenization_002 17 | result = @en_tokenizer.tokenize('testing, new') 18 | assert_equal(['testing,', 'new', ''], result) 19 | end 20 | 21 | private 22 | def compare(exp_result, input) 23 | act_result = @en_tokenizer.tokenize(input) 24 | assert_equal(exp_result, act_result) 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /test/development_tests/test_ru_tokenizer_dev.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'test/unit' 3 | require 'tokenizer' 4 | 5 | class TestTokenizerDev < Test::Unit::TestCase 6 | 7 | def setup 8 | @ru_tokenizer = 
Tokenizer::Tokenizer.new(:ru) 9 | end 10 | 11 | def test_tokenization_001 12 | end 13 | 14 | private 15 | def compare(exp_result, input) 16 | act_result = @ru_tokenizer.tokenize(input) 17 | assert_equal(exp_result, act_result) 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /test/regression_tests/test_de_tokenizer.rb: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | require 'minitest/autorun' 3 | require 'minitest/spec' 4 | require 'tokenizer' 5 | 6 | class TestTokenizer < Minitest::Test 7 | 8 | def setup 9 | @t = Tokenizer::Tokenizer.new(:de) 10 | end 11 | 12 | def test_constants 13 | assert(Tokenizer::VERSION.is_a?(String) && !Tokenizer::VERSION.empty?) 14 | end 15 | 16 | def test_output_type 17 | output = @t.tokenize('ich gehe in die Schule') 18 | assert(output.is_a?(Array)) 19 | end 20 | 21 | def test_tokenization_001 22 | input = 'Ich ging in die Schule!' 23 | etalon = %w(Ich ging in die Schule !) 24 | output = @t.tokenize(input) 25 | assert_equal(etalon, output) 26 | end 27 | 28 | def test_tokenization_002 29 | input = '" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .' 30 | etalon = %w(" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .) 31 | output = @t.tokenize(input) 32 | assert_equal(etalon, output) 33 | end 34 | end 35 | 36 | describe Tokenizer do 37 | describe 'empty input' do 38 | it 'should return an Array with an empty string' do 39 | tokens = Tokenizer::Tokenizer.new.tokenize('') 40 | tokens.must_equal(['']) 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /tokenizer.gemspec: -------------------------------------------------------------------------------- 1 | lib_path = File.expand_path('../lib', __FILE__) 2 | $LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path) 3 | 4 | require 'tokenizer/version' 5 | require 'rake' 6 | 7 | Gem::Specification.new do |s| 8 | s.name = 'tokenizer' 9 | s.summary = 'Tokenizer is a tool intended to split a text into tokens.' 10 | 11 | s.description = 'A simple multilingual tokenizer for NLP tasks. This tool '\ 12 | 'provides a CLI and a library for linguistic tokenization '\ 13 | 'which is an unavoidable step for many HLT (human language '\ 14 | 'technology) tasks in the preprocessing phase for further '\ 15 | 'syntactic, semantic and other higher-level processing '\ 16 | 'goals. Use it for tokenization of German, '\ 17 | 'English and French texts.' 18 | s.rubyforge_project = 'tokenizer' 19 | s.version = Tokenizer::VERSION 20 | s.author = 'Andrei Beliankou' 21 | s.email = 'arbox@yandex.ru' 22 | s.homepage = 'https://github.com/arbox/tokenizer' 23 | s.license = 'MIT' 24 | s.executables << 'tokenize' 25 | s.extra_rdoc_files = FileList['*.rdoc'].to_a 26 | s.required_ruby_version = '>= 1.9.3' 27 | s.files = FileList['lib/**/*.rb', 28 | 'README.rdoc', 29 | 'LICENSE.rdoc', 30 | 'CHANGELOG.rdoc', 31 | '.yardopts', 32 | 'test/**/*', 33 | 'bin/*'].to_a 34 | s.test_files = FileList['test/**/*'].to_a 35 | end 36 | --------------------------------------------------------------------------------
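A minimal usage sketch of the library shown above, for quick reference. It is illustrative only: it assumes the gem has been installed with "gem install tokenizer" as described in README.rdoc, the file name is hypothetical, and the second, customised tokenizer relies on the :post option documented in WhitespaceTokenizer#initialize, so its exact output depends on how the installed version applies that option.

  # usage_sketch.rb -- illustrative example, not part of the gem sources above.
  require 'tokenizer'

  # Default German tokenizer: splits on whitespace and detaches punctuation.
  de_tokenizer = Tokenizer::WhitespaceTokenizer.new(:de)
  p de_tokenizer.tokenize('Ich gehe in die Schule!')
  # => ["Ich", "gehe", "in", "die", "Schule", "!"]

  # Tokenizer with an extended suffix list: the default POST characters plus '|'.
  custom = Tokenizer::WhitespaceTokenizer.new(
    :de,
    post: Tokenizer::WhitespaceTokenizer::SIMPLE_POST +
          Tokenizer::WhitespaceTokenizer::PAIR_POST + ['|']
  )
  p custom.tokenize('Ich gehe| in die Schule!')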