├── .rspec ├── lib ├── unicode.rb ├── pragmatic_segmenter.rb └── pragmatic_segmenter │ ├── version.rb │ ├── types.rb │ ├── languages │ ├── armenian.rb │ ├── greek.rb │ ├── urdu.rb │ ├── amharic.rb │ ├── burmese.rb │ ├── hindi.rb │ ├── persian.rb │ ├── english.rb │ ├── bulgarian.rb │ ├── french.rb │ ├── russian.rb │ ├── arabic.rb │ ├── common │ │ ├── ellipsis.rb │ │ └── numbers.rb │ ├── chinese.rb │ ├── polish.rb │ ├── japanese.rb │ ├── spanish.rb │ ├── kazakh.rb │ ├── deutsch.rb │ ├── common.rb │ ├── danish.rb │ ├── dutch.rb │ └── italian.rb │ ├── exclamation_words.rb │ ├── segmenter.rb │ ├── languages.rb │ ├── punctuation_replacer.rb │ ├── cleaner │ └── rules.rb │ ├── between_punctuation.rb │ ├── cleaner.rb │ ├── processor.rb │ ├── abbreviation_replacer.rb │ └── list.rb ├── spec ├── spec_helper.rb ├── pragmatic_segmenter │ ├── languages │ │ ├── polish_spec.rb │ │ ├── burmese_spec.rb │ │ ├── amharic_spec.rb │ │ ├── chinese_spec.rb │ │ ├── urdu_spec.rb │ │ ├── persian_spec.rb │ │ ├── greek_spec.rb │ │ ├── hindi_spec.rb │ │ ├── dutch_spec.rb │ │ ├── bulgarian_spec.rb │ │ ├── french_spec.rb │ │ ├── kazakh_spec.rb │ │ ├── japanese_spec.rb │ │ ├── arabic_spec.rb │ │ ├── italian_spec.rb │ │ ├── russian_spec.rb │ │ ├── armenian_spec.rb │ │ └── spanish_spec.rb │ └── languages_spec.rb ├── pragmatic_segmenter_spec.rb └── performance_spec.rb ├── Rakefile ├── Gemfile ├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── pragmatic_segmenter.gemspec ├── CODE_OF_CONDUCT.md └── NEWS /.rspec: -------------------------------------------------------------------------------- 1 | --color -------------------------------------------------------------------------------- /lib/unicode.rb: -------------------------------------------------------------------------------- 1 | module Unicode 2 | def self.downcase(text) 3 | text.downcase 4 | end 5 | end -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require 'simplecov' 2 | SimpleCov.start 3 | require 'pragmatic_segmenter' 4 | 5 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter.rb: -------------------------------------------------------------------------------- 1 | require "set" 2 | require "pragmatic_segmenter/version" 3 | require "pragmatic_segmenter/segmenter" -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | VERSION = "0.3.24" 5 | end 6 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'bundler/gem_tasks' 2 | require 'rspec/core/rake_task' 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | task :default => :spec 6 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | group :test do 3 | gem 'simplecov' 4 | gem 'codeclimate-test-reporter' 5 | end 6 | # Specify your gem's dependencies in pragmatic_segmenter.gemspec 7 | gemspec 8 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /Gemfile.lock 4 | /_yardoc/ 5 | /coverage/ 6 | /doc/ 7 | /pkg/ 8 | /spec/reports/ 9 | /tmp/ 10 | *.bundle 11 | *.so 12 | *.o 13 | *.a 14 | mkmf.log 15 | .DS_Store 16 | .vscode/launch.json 17 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/types.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | class Rule < Struct.new(:pattern, :replacement) 5 | class << self 6 | def apply(str, *rules) 7 | rules.flatten.each do |rule| 8 | str.gsub!(rule.pattern, rule.replacement) 9 | end 10 | str 11 | end 12 | end 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/polish_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Polish, '(pl)' do 4 | 5 | describe '#segment' do 6 | it 'correctly segments text #001' do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "To słowo bałt. jestskrótem.", language: 'pl') 8 | expect(ps.segment).to eq(["To słowo bałt. jestskrótem."]) 9 | end 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/armenian.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Armenian 6 | include Languages::Common 7 | 8 | SENTENCE_BOUNDARY_REGEX = /.*?[։՜:]|.*?$/ 9 | Punctuations = ['։', '՜', ':'].freeze 10 | 11 | class AbbreviationReplacer < AbbreviationReplacer 12 | SENTENCE_STARTERS = [].freeze 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/greek.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Greek 6 | include Languages::Common 7 | 8 | SENTENCE_BOUNDARY_REGEX = /.*?[\.;!\?]|.*?$/ 9 | Punctuations = ['.', '!', ';', '?'].freeze 10 | 11 | class AbbreviationReplacer < AbbreviationReplacer 12 | SENTENCE_STARTERS = [].freeze 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/urdu.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Urdu 6 | include Languages::Common 7 | 8 | SENTENCE_BOUNDARY_REGEX = /.*?[۔؟!\?]|.*?$/ 9 | Punctuations = ['?', '!', '۔', '؟'].freeze 10 | 11 | class AbbreviationReplacer < AbbreviationReplacer 12 | SENTENCE_STARTERS = [].freeze 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/amharic.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Amharic 6 | include Languages::Common 7 | 8 | SENTENCE_BOUNDARY_REGEX = /.*?[፧።!\?]|.*?$/ 9 | Punctuations = ['።', '፧', '?', '!'].freeze 
10 | 11 | class AbbreviationReplacer < AbbreviationReplacer 12 | SENTENCE_STARTERS = [].freeze 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/burmese.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Burmese 6 | include Languages::Common 7 | 8 | SENTENCE_BOUNDARY_REGEX = /.*?[။၏!\?]|.*?$/ 9 | Punctuations = ['။', '၏', '?', '!'].freeze 10 | 11 | class AbbreviationReplacer < AbbreviationReplacer 12 | SENTENCE_STARTERS = [].freeze 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/hindi.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Hindi 6 | include Languages::Common 7 | 8 | SENTENCE_BOUNDARY_REGEX = /.*?[।\|!\?]|.*?$/ 9 | Punctuations = ['।', '|', '.', '!', '?'].freeze 10 | 11 | class AbbreviationReplacer < AbbreviationReplacer 12 | SENTENCE_STARTERS = [].freeze 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - "2.1.5" 4 | - "2.2.0" 5 | - "2.2.4" 6 | - "2.3.0" 7 | - "2.3.1" 8 | # uncomment this line if your project needs to run something other than `rake`: 9 | # script: bundle exec rspec spec 10 | addons: 11 | code_climate: 12 | repo_token: 13 | secure: "TDtg1SY+50yvYL8nRhf3YG2xtyS4b7wdJddGL7BRvYHkn5jhmGAXRU9F9+IRyPLPlwwd/VX2zxClmU4hr3DAbb7C/JUscNmVUcDeiwlMOIEUIjKXT+f+TFkLLjTsXjivdX7T9oD/pzHUHB5SjqWfWyZKIo2uAiTv6zt4PYvoeUQ=" -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/burmese_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Burmese, '(my)' do 4 | 5 | context "Golden Rules" do 6 | it "Sentence ending punctuation #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။", language: 'my') 8 | expect(ps.segment).to eq(["ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။", "၇ွင္ေနေကာင္းလား။"]) 9 | end 10 | end 11 | 12 | describe '#segment' do 13 | it 'correctly segments text #001' do 14 | ps = PragmaticSegmenter::Segmenter.new(text: "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။", language: 'my') 15 | expect(ps.segment).to eq(["ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။", "၇ွင္ေနေကာင္းလား။"]) 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/amharic_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Amharic, '(am)' do 4 | 5 | context "Golden Rules" do 6 | it "Sentence ending punctuation #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።", language: 'am') 8 | expect(ps.segment).to eq(["እንደምን አለህ፧", "መልካም ቀን ይሁንልህ።", "እባክሽ ያልሽዉን ድገሚልኝ።"]) 9 | end 10 | end 11 | 12 | describe '#segment' do 13 | it 'correctly segments text #001' do 14 | ps = 
PragmaticSegmenter::Segmenter.new(text: "እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።", language: 'am') 15 | expect(ps.segment).to eq(["እንደምን አለህ፧", "መልካም ቀን ይሁንልህ።", "እባክሽ ያልሽዉን ድገሚልኝ።"]) 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/chinese_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Chinese, '(zh)' do 4 | 5 | describe '#segment' do 6 | it 'correctly segments text #001' do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。", language: 'zh') 8 | expect(ps.segment).to eq(["安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。", "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"]) 9 | end 10 | 11 | it 'correctly segments text #002' do 12 | ps = PragmaticSegmenter::Segmenter.new(text: "我们明天一起去看《摔跤吧!爸爸》好吗?好!", language: 'zh') 13 | expect(ps.segment).to eq(["我们明天一起去看《摔跤吧!爸爸》好吗?", "好!"]) 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/urdu_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Urdu, '(ur)' do 4 | 5 | context "Golden Rules" do 6 | it "Sentence ending punctuation #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟", language: 'ur') 8 | expect(ps.segment).to eq(["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"]) 9 | end 10 | end 11 | 12 | describe '#segment' do 13 | it 'correctly segments text #001' do 14 | ps = PragmaticSegmenter::Segmenter.new(text: "کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟", language: 'ur') 15 | expect(ps.segment).to eq(["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"]) 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/persian.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Persian 6 | include Languages::Common 7 | 8 | SENTENCE_BOUNDARY_REGEX = /.*?[:\.!\?؟]|.*?\z|.*?$/ 9 | Punctuations = ['?', '!', ':', '.', '؟'].freeze 10 | 11 | ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭') 12 | ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬') 13 | 14 | class AbbreviationReplacer < AbbreviationReplacer 15 | SENTENCE_STARTERS = [].freeze 16 | 17 | private 18 | 19 | def scan_for_replacements(txt, am, index, character_array) 20 | txt.gsub!(/(?<=#{am})\./, '∯') 21 | txt 22 | end 23 | end 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/exclamation_words.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | require 'pragmatic_segmenter/punctuation_replacer' 5 | 6 | module PragmaticSegmenter 7 | # This class searches for exclamation points that 8 | # are part of words and not ending punctuation and replaces them. 
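# Illustrative usage sketch (not part of the library): apply_rules mutates the
# string in place, swapping the "!" inside a known word for a placeholder token
# (here "&ᓴ&", see PunctuationReplacer) so it is not treated as sentence-ending
# punctuation, e.g.
#
#   text = "I work at Yahoo! in California.".dup
#   PragmaticSegmenter::ExclamationWords.apply_rules(text)
#   text # => "I work at Yahoo&ᓴ& in California."
#
# The placeholder is restored to "!" later in the processing pipeline.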
9 | module ExclamationWords 10 | EXCLAMATION_WORDS = %w[!Xũ !Kung ǃʼOǃKung !Xuun !Kung-Ekoka ǃHu ǃKhung ǃKu ǃung ǃXo ǃXû ǃXung ǃXũ !Xun Yahoo! Y!J Yum!].freeze 11 | REGEXP = Regexp.new(EXCLAMATION_WORDS.map { |string| Regexp.escape(string) }.join('|')) 12 | 13 | def self.apply_rules(text) 14 | PragmaticSegmenter::PunctuationReplacer.new( 15 | matches_array: text.scan(REGEXP), 16 | text: text 17 | ).replace 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/persian_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Persian, '(fa)' do 4 | 5 | context "Golden Rules" do 6 | it "Sentence ending punctuation #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم.", language: 'fa') 8 | expect(ps.segment).to eq(["خوشبختم، آقای رضا.", "شما کجایی هستید؟", "من از تهران هستم."]) 9 | end 10 | end 11 | 12 | describe '#segment' do 13 | it 'correctly segments text #001' do 14 | ps = PragmaticSegmenter::Segmenter.new(text: "خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم.", language: 'fa') 15 | expect(ps.segment).to eq(["خوشبختم، آقای رضا.", "شما کجایی هستید؟", "من از تهران هستم."]) 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/english.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module English 6 | include Languages::Common 7 | 8 | class Cleaner < Cleaner 9 | def clean 10 | super 11 | clean_quotations 12 | end 13 | 14 | private 15 | 16 | def clean_quotations 17 | @text.gsub(/`/, "'") 18 | end 19 | 20 | def abbreviations 21 | [].freeze 22 | end 23 | end 24 | 25 | class AbbreviationReplacer < AbbreviationReplacer 26 | SENTENCE_STARTERS = %w( 27 | A Being Did For He How However I In It Millions More She That The 28 | There They We What When Where Who Why 29 | ).freeze 30 | end 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/greek_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Greek, '(el)' do 4 | 5 | context "Golden Rules" do 6 | it "Question mark to end sentence #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "Με συγχωρείτε· πού είναι οι τουαλέτες; Τις Κυριακές δε δούλευε κανένας. το κόστος του σπιτιού ήταν £260.950,00.", language: "el") 8 | expect(ps.segment).to eq(["Με συγχωρείτε· πού είναι οι τουαλέτες;", "Τις Κυριακές δε δούλευε κανένας.", "το κόστος του σπιτιού ήταν £260.950,00."]) 9 | end 10 | end 11 | 12 | describe '#segment' do 13 | it 'correctly segments text #001' do 14 | ps = PragmaticSegmenter::Segmenter.new(text: "Με συγχωρείτε· πού είναι οι τουαλέτες; Τις Κυριακές δε δούλευε κανένας. 
το κόστος του σπιτιού ήταν £260.950,00.", language: 'el') 15 | expect(ps.segment).to eq(["Με συγχωρείτε· πού είναι οι τουαλέτες;", "Τις Κυριακές δε δούλευε κανένας.", "το κόστος του σπιτιού ήταν £260.950,00."]) 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/hindi_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Hindi, '(hi)' do 4 | 5 | context "Golden Rules" do 6 | it "Full stop #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।", language: "hi") 8 | expect(ps.segment).to eq(["सच्चाई यह है कि इसे कोई नहीं जानता।", "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"]) 9 | end 10 | end 11 | 12 | describe '#segment' do 13 | it 'correctly segments text #001' do 14 | ps = PragmaticSegmenter::Segmenter.new(text: "सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।", language: 'hi') 15 | expect(ps.segment).to eq(["सच्चाई यह है कि इसे कोई नहीं जानता।", "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"]) 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/segmenter.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | require 'pragmatic_segmenter/languages' 5 | 6 | module PragmaticSegmenter 7 | # This class segments a text into an array of sentences. 8 | class Segmenter 9 | attr_reader :text, :language, :doc_type 10 | 11 | def initialize(text:, language: 'en', doc_type: nil, clean: true) 12 | return unless text 13 | @language = language 14 | @language_module = Languages.get_language_by_code(language) 15 | @doc_type = doc_type 16 | 17 | if clean 18 | @text = cleaner.new(text: text, doc_type: @doc_type, language: @language_module).clean 19 | else 20 | @text = text 21 | end 22 | end 23 | 24 | def segment 25 | return [] unless @text 26 | processor.new(language: @language_module).process(text: @text) 27 | end 28 | 29 | private 30 | 31 | def processor 32 | @language_module::Processor 33 | rescue 34 | Processor 35 | end 36 | 37 | def cleaner 38 | @language_module::Cleaner 39 | rescue 40 | Cleaner 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Kevin S. Dias 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/bulgarian.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Bulgarian 6 | include Languages::Common 7 | 8 | module Abbreviation 9 | ABBREVIATIONS = Set.new(["p.s", "акад", "ал", "б.р", "б.ред", "бел.а", "бел.пр", "бр", "бул", "в", "вж", "вкл", "вм", "вр", "г", "ген", "гр", "дж", "дм", "доц", "др", "ем", "заб", "зам", "инж", "к.с", "кв", "кв.м", "кг", "км", "кор", "куб", "куб.м", "л", "лв", "м", "м.г", "мин", "млн", "млрд", "мм", "н.с", "напр", "пл", "полк", "проф", "р", "рис", "с", "св", "сек", "см", "сп", "срв", "ст", "стр", "т", "т.г", "т.е", "т.н", "т.нар", "табл", "тел", "у", "ул", "фиг", "ха", "хил", "ч", "чл", "щ.д"]).freeze 10 | NUMBER_ABBREVIATIONS = [].freeze 11 | PREPOSITIVE_ABBREVIATIONS = [].freeze 12 | end 13 | 14 | class AbbreviationReplacer < AbbreviationReplacer 15 | SENTENCE_STARTERS = [].freeze 16 | 17 | private 18 | def replace_period_of_abbr(txt, abbr) 19 | txt.gsub!(/(?<=\s#{abbr.strip})\.|(?<=^#{abbr.strip})\./, '∯') 20 | txt 21 | end 22 | end 23 | end 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/dutch_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Dutch, '(nl)' do 4 | 5 | context "Golden Rules" do 6 | it "Sentence starting with a number #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen. 81 procent van de schoten was raak.", language: 'nl') 8 | expect(ps.segment).to eq(["Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen.", "81 procent van de schoten was raak."]) 9 | end 10 | 11 | it "Sentence starting with an ellipsis #002" do 12 | ps = PragmaticSegmenter::Segmenter.new(text: "81 procent van de schoten was raak. ...en toen barste de hel los.", language: 'nl') 13 | expect(ps.segment).to eq(["81 procent van de schoten was raak.", "...en toen barste de hel los."]) 14 | end 15 | end 16 | 17 | describe '#segment' do 18 | it 'correctly segments text #001' do 19 | ps = PragmaticSegmenter::Segmenter.new(text: "Afkorting aanw. vnw.", language: 'nl') 20 | expect(ps.segment).to eq(["Afkorting aanw. 
vnw."]) 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe PragmaticSegmenter::Languages do 4 | describe '.get_language_by_code' do 5 | context "when language code defined" do 6 | PragmaticSegmenter::Languages::LANGUAGE_CODES.each do |code, lang| 7 | it "returns '#{lang}' for '#{code}'" do 8 | expect(described_class.get_language_by_code(code)).to eql(lang) 9 | end 10 | end 11 | end 12 | 13 | context "when language code not defined" do 14 | it "returns 'PragmaticSegmenter::Languages::Common'" do 15 | expect(described_class.get_language_by_code('xxyyzz')).to eql(PragmaticSegmenter::Languages::Common) 16 | end 17 | end 18 | 19 | context "when language code empty string" do 20 | it "returns 'PragmaticSegmenter::Languages::Common'" do 21 | expect(described_class.get_language_by_code('')).to eql(PragmaticSegmenter::Languages::Common) 22 | end 23 | end 24 | 25 | context "when language code nil" do 26 | it "returns 'PragmaticSegmenter::Languages::Common'" do 27 | expect(described_class.get_language_by_code(nil)).to eql(PragmaticSegmenter::Languages::Common) 28 | end 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/french.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module French 6 | include Languages::Common 7 | 8 | module Abbreviation 9 | ABBREVIATIONS = Set.new(['a.c.n', 'a.m', 'al', 'ann', 'apr', 'art', 'auj', 'av', 'b.p', 'boul', 'c.-à-d', 'c.n', 'c.n.s', 'c.p.i', 'c.q.f.d', 'c.s', 'ca', 'cf', 'ch.-l', 'chap', 'co', 'co', 'contr', 'dir', 'e.g', 'e.v', 'env', 'etc', 'ex', 'fasc', 'fig', 'fr', 'fém', 'hab', 'i.e', 'ibid', 'id', 'inf', 'l.d', 'lib', 'll.aa', 'll.aa.ii', 'll.aa.rr', 'll.aa.ss', 'll.ee', 'll.mm', 'll.mm.ii.rr', 'loc.cit', 'ltd', 'ltd', 'masc', 'mm', 'ms', 'n.b', 'n.d', 'n.d.a', 'n.d.l.r', 'n.d.t', 'n.p.a.i', 'n.s', 'n/réf', 'nn.ss', 'p.c.c', 'p.ex', 'p.j', 'p.s', 'pl', 'pp', 'r.-v', 'r.a.s', 'r.i.p', 'r.p', 's.a', 's.a.i', 's.a.r', 's.a.s', 's.e', 's.m', 's.m.i.r', 's.s', 'sec', 'sect', 'sing', 'sq', 'sqq', 'ss', 'suiv', 'sup', 'suppl', 't.s.v.p', 'tél', 'vb', 'vol', 'vs', 'x.o', 'z.i', 'éd']).freeze 10 | PREPOSITIVE_ABBREVIATIONS = [].freeze 11 | NUMBER_ABBREVIATIONS = [].freeze 12 | end 13 | 14 | class AbbreviationReplacer < AbbreviationReplacer 15 | SENTENCE_STARTERS = [].freeze 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/russian.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Russian 6 | include Languages::Common 7 | 8 | module Abbreviation 9 | ABBREVIATIONS = Set.new(["y", "y.e", "а", "авт", "адм.-терр", "акад", "в", "вв", "вкз", "вост.-европ", "г", "гг", "гос", "гр", "д", "деп", "дисс", "дол", "долл", "ежедн", "ж", "жен", "з", "зап", "зап.-европ", "заруб", "и", "ин", "иностр", "инст", "к", "канд", "кв", "кг", "куб", "л", "л.h", "л.н", "м", "мин", "моск", "муж", "н", "нед", "о", "п", "пгт", "пер", "пп", "пр", "просп", "проф", "р", "руб", "с", "сек", "см", "спб", "стр", "т", 
"тел", "тов", "тт", "тыс", "у", "у.е", "ул", "ф", "ч"]).freeze 10 | PREPOSITIVE_ABBREVIATIONS = [].freeze 11 | NUMBER_ABBREVIATIONS = [].freeze 12 | end 13 | 14 | class AbbreviationReplacer < AbbreviationReplacer 15 | SENTENCE_STARTERS = [].freeze 16 | 17 | private 18 | 19 | def replace_period_of_abbr(txt, abbr) 20 | txt.gsub!(/(?<=\s#{abbr.strip})\./, '∯') 21 | txt.gsub!(/(?<=\A#{abbr.strip})\./, '∯') 22 | txt.gsub!(/(?<=^#{abbr.strip})\./, '∯') 23 | txt 24 | end 25 | end 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/arabic.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Arabic 6 | include Languages::Common 7 | 8 | Punctuations = ['?', '!', ':', '.', '؟', '،'].freeze 9 | SENTENCE_BOUNDARY_REGEX = /.*?[:\.!\?؟،]|.*?\z|.*?$/ 10 | 11 | module Abbreviation 12 | ABBREVIATIONS = Set.new(['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د‪']).freeze 13 | PREPOSITIVE_ABBREVIATIONS = [].freeze 14 | NUMBER_ABBREVIATIONS = [].freeze 15 | end 16 | 17 | # Rubular: http://rubular.com/r/RX5HpdDIyv 18 | ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭') 19 | 20 | # Rubular: http://rubular.com/r/kPRgApNHUg 21 | ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬') 22 | 23 | class AbbreviationReplacer < AbbreviationReplacer 24 | SENTENCE_STARTERS = [].freeze 25 | private 26 | 27 | def scan_for_replacements(txt, am, index, character_array) 28 | txt.gsub!(/(?<=#{am})\./, '∯') 29 | txt 30 | end 31 | end 32 | end 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/common/ellipsis.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | module PragmaticSegmenter 5 | module Languages 6 | module Common 7 | # This class searches for ellipses within a string and 8 | # replaces the periods. 
9 | 10 | # http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/ 11 | # http://www.thepunctuationguide.com/ellipses.html 12 | 13 | module EllipsisRules 14 | # Rubular: http://rubular.com/r/i60hCK81fz 15 | ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.') 16 | 17 | # Rubular: http://rubular.com/r/Hdqpd90owl 18 | FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ') 19 | 20 | # Rubular: http://rubular.com/r/YBG1dIHTRu 21 | ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟') 22 | 23 | # Rubular: http://rubular.com/r/2VvZ8wRbd8 24 | FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝') 25 | 26 | OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ') 27 | 28 | All = [ 29 | ThreeSpaceRule, 30 | FourSpaceRule, 31 | FourConsecutiveRule, 32 | ThreeConsecutiveRule, 33 | OtherThreePeriodRule 34 | ] 35 | end 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/chinese.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Chinese 6 | include Languages::Common 7 | 8 | class AbbreviationReplacer < AbbreviationReplacer 9 | SENTENCE_STARTERS = [].freeze 10 | end 11 | 12 | class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation 13 | BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = /《(?>[^》\\]+|\\{2}|\\.)*》/ 14 | BETWEEN_L_BRACKET_REGEX = /「(?>[^」\\]+|\\{2}|\\.)*」/ 15 | private 16 | 17 | def sub_punctuation_between_quotes_and_parens(txt) 18 | super 19 | sub_punctuation_between_double_angled_quotation_marks(txt) 20 | sub_punctuation_between_l_bracket(txt) 21 | end 22 | 23 | def sub_punctuation_between_double_angled_quotation_marks(txt) 24 | PunctuationReplacer.new( 25 | matches_array: txt.scan(BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX), 26 | text: txt 27 | ).replace 28 | end 29 | 30 | def sub_punctuation_between_l_bracket(txt) 31 | PunctuationReplacer.new( 32 | matches_array: txt.scan(BETWEEN_L_BRACKET_REGEX), 33 | text: txt 34 | ).replace 35 | end 36 | end 37 | end 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /pragmatic_segmenter.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'pragmatic_segmenter/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "pragmatic_segmenter" 8 | spec.version = PragmaticSegmenter::VERSION 9 | spec.authors = ["Kevin S. Dias"] 10 | spec.email = ["diasks2@gmail.com"] 11 | spec.summary = %q{A rule-based sentence boundary detection gem that works out-of-the-box across many languages} 12 | spec.description = %q{Pragmatic Segmenter is a sentence segmentation tool for Ruby. It allows you to split a text into an array of sentences. 
This gem provides 2 main benefits over other segmentation gems - 1) It works well even with ill-formatted text 2) It works for multiple languages } 13 | spec.homepage = "https://github.com/diasks2/pragmatic_segmenter" 14 | spec.license = "MIT" 15 | 16 | spec.files = `git ls-files -z`.split("\x0") 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ["lib"] 20 | 21 | spec.add_development_dependency "bundler", ">= 1.7" 22 | spec.add_development_dependency "rake", ">= 12.3.3" 23 | spec.add_development_dependency "rspec" 24 | spec.add_development_dependency "stackprof" 25 | end 26 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/polish.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Polish 6 | include Languages::Common 7 | 8 | module Abbreviation 9 | ABBREVIATIONS = Set.new(['ags', 'alb', 'ang', 'aor', 'awest', 'bałt', 'bojkow', 'bret', 'brus', 'bsł', 'bułg', 'c.b.d.o', 'c.b.d.u', 'celt', 'chorw', 'cs', 'czakaw', 'czerw', 'czes', 'dłuż', 'dniem', 'dor', 'dubrow', 'duń', 'ekaw', 'fiń', 'franc', 'gal', 'germ', 'głuż', 'gniem', 'goc', 'gr', 'grudz', 'hebr', 'het', 'hol', 'I cont', 'ie', 'ikaw', 'irań', 'irl', 'islandz', 'itd', 'itd.', 'itp', 'jekaw', 'kajkaw', 'kasz', 'kirg', 'kwiec', 'łac', 'lip', 'listop', 'lit', 'łot', 'lp', 'maced', 'mar', 'młpol', 'moraw', 'n.e', 'nb.', 'ngr', 'niem', 'nord', 'norw', 'np', 'np.', 'ok.', 'orm', 'oset', 'osk', 'p.n', 'p.n.e', 'p.o', 'pazdz', 'pers', 'pie', 'pod red.', 'podhal', 'pol', 'połab', 'port', 'prekm', 'pskow', 'psł', 'R cont', 'rez', 'rom', 'rozdz.', 'rum', 'rus', 'rys.', 'sas', 'sch', 'scs', 'serb', 'sierp', 'śl', 'sła', 'słe', 'słi', 'słow', 'sp. 
z o.o', 'śrdniem', 'śrgniem', 'śrirl', 'stbułg', 'stind', 'stpol', 'stpr', 'str.', 'strus', 'stwniem', 'stycz', 'sztokaw', 'szwedz', 't.', 'tj.', 'tłum.', 'toch', 'tur', 'tzn', 'ukr', 'ul', 'umbr', 'wed', 'węg', 'wlkpol', 'włos', 'wrzes', 'wyd.', 'zakarp']).freeze 10 | PREPOSITIVE_ABBREVIATIONS = [].freeze 11 | NUMBER_ABBREVIATIONS = [].freeze 12 | end 13 | 14 | class AbbreviationReplacer < AbbreviationReplacer 15 | SENTENCE_STARTERS = [].freeze 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/japanese.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Japanese 6 | include Languages::Common 7 | 8 | class Cleaner < PragmaticSegmenter::Cleaner 9 | # Rubular: http://rubular.com/r/N4kPuJgle7 10 | NewLineInMiddleOfWordRule = Rule.new(/(?<=の)\n(?=\S)/, '') 11 | 12 | def clean 13 | super 14 | remove_newline_in_middle_of_word 15 | end 16 | 17 | private 18 | 19 | def remove_newline_in_middle_of_word 20 | Rule.apply @text, NewLineInMiddleOfWordRule 21 | end 22 | end 23 | 24 | class AbbreviationReplacer < AbbreviationReplacer 25 | SENTENCE_STARTERS = [].freeze 26 | end 27 | 28 | class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation 29 | # Rubular: http://rubular.com/r/GnjOmry5Z2 30 | BETWEEN_QUOTE_JA_REGEX = /\u{300c}(?>[^\u{300c}\u{300d}\\]+|\\{2}|\\.)*\u{300d}/ 31 | 32 | # Rubular: http://rubular.com/r/EjHcZn5ZSG 33 | BETWEEN_PARENS_JA_REGEX = /\u{ff08}(?>[^\u{ff08}\u{ff09}\\]+|\\{2}|\\.)*\u{ff09}/ 34 | private 35 | 36 | def sub_punctuation_between_quotes_and_parens(txt) 37 | super 38 | sub_punctuation_between_parens_ja(txt) 39 | sub_punctuation_between_quotes_ja(txt) 40 | end 41 | 42 | def sub_punctuation_between_quotes_ja(txt) 43 | PunctuationReplacer.new( 44 | matches_array: txt.scan(BETWEEN_QUOTE_JA_REGEX), 45 | text: txt 46 | ).replace 47 | end 48 | 49 | def sub_punctuation_between_parens_ja(txt) 50 | PunctuationReplacer.new( 51 | matches_array: txt.scan(BETWEEN_PARENS_JA_REGEX), 52 | text: txt 53 | ).replace 54 | end 55 | end 56 | end 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. 4 | 5 | We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality. 
6 | 7 | Examples of unacceptable behavior by participants include: 8 | 9 | * The use of sexualized language or imagery 10 | * Personal attacks 11 | * Trolling or insulting/derogatory comments 12 | * Public or private harassment 13 | * Publishing other's private information, such as physical or electronic addresses, without explicit permission 14 | * Other unethical or unprofessional conduct 15 | 16 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team. 17 | 18 | This code of conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers. 21 | 22 | This Code of Conduct is adapted from the Contributor Covenant, version 1.2.0, available from http://contributor-covenant.org/version/1/2/0/ -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'pragmatic_segmenter/types' 4 | require 'pragmatic_segmenter/processor' 5 | require 'pragmatic_segmenter/cleaner' 6 | 7 | require 'pragmatic_segmenter/languages/common' 8 | 9 | require 'pragmatic_segmenter/languages/english' 10 | require 'pragmatic_segmenter/languages/deutsch' 11 | require 'pragmatic_segmenter/languages/hindi' 12 | require 'pragmatic_segmenter/languages/persian' 13 | require 'pragmatic_segmenter/languages/amharic' 14 | require 'pragmatic_segmenter/languages/arabic' 15 | require 'pragmatic_segmenter/languages/greek' 16 | require 'pragmatic_segmenter/languages/armenian' 17 | require 'pragmatic_segmenter/languages/burmese' 18 | require 'pragmatic_segmenter/languages/urdu' 19 | require 'pragmatic_segmenter/languages/french' 20 | require 'pragmatic_segmenter/languages/italian' 21 | require 'pragmatic_segmenter/languages/spanish' 22 | require 'pragmatic_segmenter/languages/russian' 23 | require 'pragmatic_segmenter/languages/japanese' 24 | require 'pragmatic_segmenter/languages/dutch' 25 | require 'pragmatic_segmenter/languages/polish' 26 | require 'pragmatic_segmenter/languages/chinese' 27 | require 'pragmatic_segmenter/languages/bulgarian' 28 | require 'pragmatic_segmenter/languages/danish' 29 | require 'pragmatic_segmenter/languages/kazakh' 30 | 31 | module PragmaticSegmenter 32 | module Languages 33 | LANGUAGE_CODES = { 34 | 'en' => English, 35 | 'bg' => Bulgarian, 36 | 'de' => Deutsch, 37 | 'es' => Spanish, 38 | 'fr' => French, 39 | 'it' => Italian, 40 | 'ja' => Japanese, 41 | 'el' => Greek, 42 | 'ru' => Russian, 43 | 'ar' => Arabic, 44 | 'am' => Amharic, 45 | 'hi' => Hindi, 46 | 'hy' => Armenian, 47 | 'fa' => Persian, 48 | 'my' => Burmese, 49 | 'ur' => Urdu, 50 | 'nl' => Dutch, 51 | 'pl' => Polish, 52 | 'zh' => Chinese, 53 | 'da' => Danish, 54 | 'kk' => Kazakh 55 | } 56 | 57 | def self.get_language_by_code(code) 58 | LANGUAGE_CODES[code] || Common 59 | end 60 | end 61 | end 62 | 
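# Illustrative usage sketch (mirroring the specs in this repository): a language
# code passed to the Segmenter is resolved through Languages.get_language_by_code,
# which falls back to Languages::Common for unknown or missing codes.
#
#   require 'pragmatic_segmenter'
#
#   PragmaticSegmenter::Languages.get_language_by_code('ja')     # => PragmaticSegmenter::Languages::Japanese
#   PragmaticSegmenter::Languages.get_language_by_code('xxyyzz') # => PragmaticSegmenter::Languages::Common
#
#   PragmaticSegmenter::Segmenter.new(text: "Hello world. Hello.", language: 'en').segment
#   # => ["Hello world.", "Hello."]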
-------------------------------------------------------------------------------- /lib/pragmatic_segmenter/punctuation_replacer.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | module PragmaticSegmenter 5 | # This class replaces punctuation that is typically a sentence boundary 6 | # but in this case is not a sentence boundary. 7 | class PunctuationReplacer 8 | module Rules 9 | module EscapeRegexReservedCharacters 10 | LeftParen = Rule.new('(', '\\(') 11 | RightParen = Rule.new(')', '\\)') 12 | LeftBracket = Rule.new('[', '\\[') 13 | RightBracket = Rule.new(']', '\\]') 14 | Dash = Rule.new('-', '\\-') 15 | 16 | All = [ LeftParen, RightParen, 17 | LeftBracket, RightBracket, Dash ] 18 | end 19 | 20 | module SubEscapedRegexReservedCharacters 21 | SubLeftParen = Rule.new('\\(', '(') 22 | SubRightParen = Rule.new('\\)', ')') 23 | SubLeftBracket = Rule.new('\\[', '[') 24 | SubRightBracket = Rule.new('\\]', ']') 25 | SubDash = Rule.new('\\-', '-') 26 | 27 | All = [ SubLeftParen, SubRightParen, 28 | SubLeftBracket, SubRightBracket, SubDash ] 29 | end 30 | 31 | end 32 | 33 | attr_reader :matches_array, :text, :match_type 34 | def initialize(text:, matches_array:, match_type: nil) 35 | @text = text 36 | @matches_array = matches_array 37 | @match_type = match_type 38 | end 39 | 40 | def replace 41 | replace_punctuation(matches_array) 42 | end 43 | 44 | private 45 | 46 | def replace_punctuation(array) 47 | return if !array || array.empty? 48 | Rule.apply(@text, Rules::EscapeRegexReservedCharacters::All) 49 | array.each do |a| 50 | Rule.apply(a, Rules::EscapeRegexReservedCharacters::All) 51 | sub = sub_characters(a, '.', '∯') 52 | sub_1 = sub_characters(sub, '。', '&ᓰ&') 53 | sub_2 = sub_characters(sub_1, '.', '&ᓱ&') 54 | sub_3 = sub_characters(sub_2, '!', '&ᓳ&') 55 | sub_4 = sub_characters(sub_3, '!', '&ᓴ&') 56 | sub_5 = sub_characters(sub_4, '?', '&ᓷ&') 57 | sub_6 = sub_characters(sub_5, '?', '&ᓸ&') 58 | unless match_type.eql?('single') 59 | sub_7 = sub_characters(sub_6, "'", '&⎋&') 60 | end 61 | end 62 | Rule.apply(@text, Rules::SubEscapedRegexReservedCharacters::All) 63 | end 64 | 65 | def sub_characters(string, char_a, char_b) 66 | sub = string.gsub(char_a, char_b) 67 | @text.gsub!(/#{Regexp.escape(string)}/, sub) 68 | sub 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/bulgarian_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Bulgarian, '(bg)' do 4 | 5 | describe '#segment' do 6 | 7 | it 'correctly segments text #001' do 8 | ps = PragmaticSegmenter::Segmenter.new(text: "В първата половина на ноември т.г. ще бъде свикан Консултативният съвет за национална сигурност, обяви държавният глава.", language: 'bg') 9 | expect(ps.segment).to eq(["В първата половина на ноември т.г. ще бъде свикан Консултативният съвет за национална сигурност, обяви държавният глава."]) 10 | end 11 | 12 | it 'correctly segments text #002' do 13 | ps = PragmaticSegmenter::Segmenter.new(text: "Компютърът е устройство с общо предназначение, което може да бъде програмирано да извършва набор от аритметични и/или логически операции. Възможността поредицата такива операции да бъде променяна позволява компютърът да се използва за решаването на теоретично всяка изчислителна/логическа задача. 
Обикновено целта на тези операции е обработката на въведена информация (данни), представена в цифров (дигитален) вид, резултатът от които може да се изведе в най-общо казано използваема форма.", language: 'bg') 14 | expect(ps.segment).to eq(["Компютърът е устройство с общо предназначение, което може да бъде програмирано да извършва набор от аритметични и/или логически операции.", "Възможността поредицата такива операции да бъде променяна позволява компютърът да се използва за решаването на теоретично всяка изчислителна/логическа задача.", "Обикновено целта на тези операции е обработката на въведена информация (данни), представена в цифров (дигитален) вид, резултатът от които може да се изведе в най-общо казано използваема форма."]) 15 | end 16 | 17 | it 'correctly segments text #003' do 18 | ps = PragmaticSegmenter::Segmenter.new(text: "Пл. \"20 Април\"", language: 'bg') 19 | expect(ps.segment).to eq(["Пл. \"20 Април\""]) 20 | end 21 | 22 | it 'correctly segments text #004' do 23 | ps = PragmaticSegmenter::Segmenter.new(text: "Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат. Стойностни, вкл. български и руски", language: 'bg') 24 | expect(ps.segment).to eq(["Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.", "Стойностни, вкл. български и руски"]) 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Segmenter do 4 | 5 | describe '#segment' do 6 | it 'handles nil' do 7 | ps = PragmaticSegmenter::Segmenter.new(text: nil) 8 | expect(ps.segment).to eq([]) 9 | end 10 | 11 | it 'handles no language' do 12 | ps = PragmaticSegmenter::Segmenter.new(text: 'Hello world. Hello.') 13 | expect(ps.segment).to eq(["Hello world.", "Hello."]) 14 | end 15 | 16 | it 'handles empty strings' do 17 | ps = PragmaticSegmenter::Segmenter.new(text: "\n") 18 | expect(ps.segment).to eq([]) 19 | end 20 | 21 | it 'handles empty strings' do 22 | ps = PragmaticSegmenter::Segmenter.new(text: "") 23 | expect(ps.segment).to eq([]) 24 | end 25 | 26 | it 'handles empty strings' do 27 | ps = PragmaticSegmenter::Segmenter.new(text: '') 28 | expect(ps.segment).to eq([]) 29 | end 30 | 31 | it 'has an option to not use the cleaner' do 32 | ps = PragmaticSegmenter::Segmenter.new(text: "It was a cold \nnight in the city.", language: "en", clean: false) 33 | expect(ps.segment).to eq(["It was a cold", "night in the city."]) 34 | end 35 | 36 | it 'does not mutate the input string' do 37 | text = "It was a cold \nnight in the city." 38 | PragmaticSegmenter::Segmenter.new(text: text, language: "en").segment 39 | expect(text).to eq("It was a cold \nnight in the city.") 40 | end 41 | 42 | describe '#clean' do 43 | it 'cleans the text #001' do 44 | ps = PragmaticSegmenter::Cleaner.new(text: "It was a cold \nnight in the city.", language: "en") 45 | expect(ps.clean).to eq("It was a cold night in the city.") 46 | end 47 | 48 | it 'cleans the text #002' do 49 | text = 'injections made by the Shareholder through the years. 7 (max.) 3. 
Specifications/4.Design and function The operating instructions are part of the product and must be kept in the immediate vicinity of the instrument and readily accessible to skilled "' 50 | ps = PragmaticSegmenter::Cleaner.new(text: text) 51 | expect(ps.clean).to eq("injections made by the Shareholder through the years. 7 (max.) 3. Specifications/4.Design and function The operating instructions are part of the product and must be kept in the immediate vicinity of the instrument and readily accessible to skilled \"") 52 | end 53 | 54 | it 'does not mutate the input string (cleaner)' do 55 | text = "It was a cold \nnight in the city." 56 | PragmaticSegmenter::Cleaner.new(text: text, language: "en").clean 57 | expect(text).to eq("It was a cold \nnight in the city.") 58 | end 59 | end 60 | end 61 | end 62 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/french_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::French, '(fr)' do 4 | 5 | describe '#segment' do 6 | it 'correctly segments text #001' do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "Après avoir été l'un des acteurs du projet génome humain, le Genoscope met aujourd'hui le cap vers la génomique environnementale. L'exploitation des données de séquences, prolongée par l'identification expérimentale des fonctions biologiques, notamment dans le domaine de la biocatalyse, ouvrent des perspectives de développements en biotechnologie industrielle.", language: 'fr') 8 | expect(ps.segment).to eq(["Après avoir été l'un des acteurs du projet génome humain, le Genoscope met aujourd'hui le cap vers la génomique environnementale.", "L'exploitation des données de séquences, prolongée par l'identification expérimentale des fonctions biologiques, notamment dans le domaine de la biocatalyse, ouvrent des perspectives de développements en biotechnologie industrielle."]) 9 | end 10 | 11 | it 'correctly segments text #002' do 12 | ps = PragmaticSegmenter::Segmenter.new(text: "\"Airbus livrera comme prévu 30 appareils 380 cette année avec en ligne de mire l'objectif d'équilibre financier du programme en 2015\", a-t-il ajouté.", language: 'fr') 13 | expect(ps.segment).to eq(["\"Airbus livrera comme prévu 30 appareils 380 cette année avec en ligne de mire l'objectif d'équilibre financier du programme en 2015\", a-t-il ajouté."]) 14 | end 15 | 16 | it 'correctly segments text #003' do 17 | ps = PragmaticSegmenter::Segmenter.new(text: "À 11 heures ce matin, la direction ne décomptait que douze grévistes en tout sur la France : ce sont ceux du site de Saran (Loiret), dont l’effectif est de 809 salariés, dont la moitié d’intérimaires. Elle assure que ce mouvement « n’aura aucun impact sur les livraisons ».", language: 'fr') 18 | expect(ps.segment).to eq(["À 11 heures ce matin, la direction ne décomptait que douze grévistes en tout sur la France : ce sont ceux du site de Saran (Loiret), dont l’effectif est de 809 salariés, dont la moitié d’intérimaires.", "Elle assure que ce mouvement « n’aura aucun impact sur les livraisons »."]) 19 | end 20 | 21 | it 'correctly segments text #004' do 22 | ps = PragmaticSegmenter::Segmenter.new(text: "Ce modèle permet d’afficher le texte « LL.AA.II.RR. » pour l’abréviation de « Leurs Altesses impériales et royales » avec son infobulle.", language: 'fr') 23 | expect(ps.segment).to eq(["Ce modèle permet d’afficher le texte « LL.AA.II.RR. 
» pour l’abréviation de « Leurs Altesses impériales et royales » avec son infobulle."]) 24 | end 25 | 26 | it 'correctly segments text #005' do 27 | ps = PragmaticSegmenter::Segmenter.new(text: "Les derniers ouvrages de Intercept Ltd. sont ici.", language: 'fr') 28 | expect(ps.segment).to eq(["Les derniers ouvrages de Intercept Ltd. sont ici."]) 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/spanish.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Spanish 6 | include Languages::Common 7 | 8 | module Abbreviation 9 | ABBREVIATIONS = Set.new(['a.c', 'a/c', 'abr', 'adj', 'admón', 'afmo', 'ago', 'almte', 'ap', 'apdo', 'arq', 'art', 'atte', 'av', 'avda', 'bco', 'bibl', 'bs. as', 'c', 'c.f', 'c.g', 'c/c', 'c/u', 'cap', 'cc.aa', 'cdad', 'cm', 'co', 'cra', 'cta', 'cv', 'd.e.p', 'da', 'dcha', 'dcho', 'dep', 'dic', 'dicc', 'dir', 'dn', 'doc', 'dom', 'dpto', 'dr', 'dra', 'dto', 'ee', 'ej', 'en', 'entlo', 'esq', 'etc', 'excmo', 'ext', 'f.c', 'fca', 'fdo', 'febr', 'ff. aa', 'ff.cc', 'fig', 'fil', 'fra', 'g.p', 'g/p', 'gob', 'gr', 'gral', 'grs', 'hnos', 'hs', 'igl', 'iltre', 'imp', 'impr', 'impto', 'incl', 'ing', 'inst', 'izdo', 'izq', 'izqdo', 'j.c', 'jue', 'jul', 'jun', 'kg', 'km', 'lcdo', 'ldo', 'let', 'lic', 'ltd', 'lun', 'mar', 'may', 'mg', 'min', 'mié', 'mm', 'máx', 'mín', 'mt', 'n. del t', 'n.b', 'no', 'nov', 'ntra. sra', 'núm', 'oct', 'p', 'p.a', 'p.d', 'p.ej', 'p.v.p', 'párrf', 'ppal', 'prev', 'prof', 'prov', 'ptas', 'pts', 'pza', 'pág', 'págs', 'párr', 'q.e.g.e', 'q.e.p.d', 'q.e.s.m', 'reg', 'rep', 'rr. hh', 'rte', 's', 's. 
a', 's.a.r', 's.e', 's.l', 's.r.c', 's.r.l', 's.s.s', 's/n', 'sdad', 'seg', 'sept', 'sig', 'sr', 'sra', 'sres', 'srta', 'sta', 'sto', 'sáb', 't.v.e', 'tamb', 'tel', 'tfno', 'ud', 'uu', 'uds', 'univ', 'v.b', 'v.e', 'vd', 'vds', 'vid', 'vie', 'vol', 'vs', 'vto', 'a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']).freeze 10 | PREPOSITIVE_ABBREVIATIONS = Set.new(['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']).freeze 11 | NUMBER_ABBREVIATIONS = Set.new(['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']).freeze 12 | end 13 | 14 | class AbbreviationReplacer < AbbreviationReplacer 15 | SENTENCE_STARTERS = [].freeze 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/cleaner/rules.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | # This is an opinionated class that removes errant newlines, 5 | # xhtml, inline formatting, etc. 
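# Illustrative usage sketch (taken from the spec suite): the Cleaner can also be
# run on its own to normalize a string before segmentation, e.g.
#
#   PragmaticSegmenter::Cleaner.new(text: "It was a cold \nnight in the city.", language: "en").clean
#   # => "It was a cold night in the city."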
6 | class Cleaner 7 | module Rules 8 | # Rubular: http://rubular.com/r/V57WnM9Zut 9 | NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '') 10 | 11 | # Rubular: http://rubular.com/r/dMxp5MixFS 12 | DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r") 13 | 14 | # Rubular: http://rubular.com/r/H6HOJeA8bq 15 | DoubleNewLineRule = Rule.new(/\n\n/, "\r") 16 | 17 | # Rubular: http://rubular.com/r/FseyMiiYFT 18 | NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '') 19 | 20 | 21 | ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r") 22 | 23 | EscapedNewLineRule = Rule.new(/\\n/, "\n") 24 | EscapedCarriageReturnRule = Rule.new(/\\r/, "\r") 25 | 26 | TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n") 27 | 28 | TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r") 29 | 30 | 31 | 32 | 33 | # Rubular: http://rubular.com/r/bAJrhyLNeZ 34 | InlineFormattingRule = Rule.new(/\{b\^>\d*<b\^\}|\{b\^>\d*\s]+))?)+\s*|\s*)\/?>/, '') 71 | 72 | # Rubular: http://rubular.com/r/XZVqMPJhea 73 | EscapedHTMLTagRule = Rule.new(/<\/?[^gt;]*gt;/, '') 74 | 75 | All = [HTMLTagRule, EscapedHTMLTagRule] 76 | end 77 | 78 | module PDF 79 | # Rubular: http://rubular.com/r/UZAVcwqck8 80 | NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '') 81 | 82 | # Rubular: http://rubular.com/r/eaNwGavmdo 83 | NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ') 84 | end 85 | 86 | end 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/kazakh.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Kazakh 6 | include Languages::Common 7 | 8 | MULTI_PERIOD_ABBREVIATION_REGEX = /\b\p{Cyrillic}(?:\.\s?\p{Cyrillic})+[.]|b[a-z](?:\.[a-z])+[.]/i 9 | 10 | module Abbreviation 11 | ABBREVIATIONS = Set.new(['afp', 'anp', 'atp', 'bae', 'bg', 'bp', 'cam', 'cctv', 'cd', 'cez', 'cgi', 'cnpc', 'farc', 'fbi', 'eiti', 'epo', 'er', 'gp', 'gps', 'has', 'hiv', 'hrh', 'http', 'icu', 'idf', 'imd', 'ime', 'icu', 'idf', 'ip', 'iso', 'kaz', 'kpo', 'kpa', 'kz', 'kz', 'mri', 'nasa', 'nba', 'nbc', 'nds', 'ohl', 'omlt', 'ppm', 'pda', 'pkk', 'psm', 'psp', 'raf', 'rss', 'rtl', 'sas', 'sme', 'sms', 'tnt', 'udf', 'uefa', 'usb', 'utc', 'x', 'zdf', 'әқбк', 'әқбк', 'аақ', 'авг.', 'aбб', 'аек', 'ак', 'ақ', 'акцион.', 'акср', 'ақш', 'англ', 'аөсшк', 'апр', 'м.', 'а.', 'р.', 'ғ.', 'апр.', 'аум.', 'ацат', 'әч', 'т. б.', 'б. з. б.', 'б. з. б.', 'б. з. д.', 'б. з. д.', 'биікт.', 'б. т.', 'биол.', 'биохим', 'бө', 'б. э. д.', 'бта', 'бұұ', 'вич', 'всоонл', 'геогр.', 'геол.', 'гленкор', 'гэс', 'қк', 'км', 'г', 'млн', 'млрд', 'т', 'ғ. с.', 'ғ.', 'қ.', 'ғ.', 'дек.', 'днқ', 'дсұ', 'еақк', 'еқыұ', 'ембімұнайгаз', 'ео', 'еуразэқ', 'еуроодақ', 'еұу', 'ж.', 'ж.', 'жж.', 'жоо', 'жіө', 'жсдп', 'жшс', 'іім', 'инта', 'исаф', 'камаз', 'кгб', 'кеу', 'кг', 'км²', 'км²', 'км³', 'км³', 'кимеп', 'кср', 'ксро', 'кокп', 'кхдр', 'қазатомпром', 'қазкср', 'қазұу', 'қазмұнайгаз', 'қазпошта', 'қазтаг', 'қазұу', 'қкп', 'қмдб', 'қр', 'қхр', 'лат.', 'м²', 'м²', 'м³', 'м³', 'магатэ', 'май.', 'максам', 'мб', 'мвт', 'мемл', 'м', 'мсоп', 'мтк', 'мыс.', 'наса', 'нато', 'нквд', 'нояб.', 'обл.', 'огпу', 'окт.', 'оңт.', 'опек', 'оеб', 'өзенмұнайгаз', 'өф', 'пәк', 'пед.', 'ркфср', 'рнқ', 'рсфср', 'рф', 'свс', 'сву', 'сду', 'сес', 'сент.', 'см', 'снпс', 'солт.', 'солт.', 'сооно', 'ссро', 'сср', 'ссср', 'ссс', 'сэс', 'дк', 'т. 
б.', 'т', 'тв', 'тереңд.', 'тех.', 'тжқ', 'тмд', 'төм.', 'трлн', 'тр', 'т.', 'и.', 'м.', 'с.', 'ш.', 'т.', 'т. с. с.', 'тэц', 'уаз', 'уефа', 'еқыұ', 'ұқк', 'ұқшұ', 'февр.', 'фққ', 'фсб', 'хим.', 'хқко', 'шұар', 'шыұ', 'экон.', 'экспо', 'цтп', 'цас', 'янв.', 'dvd', 'жкт', 'ққс', 'км', 'ацат', 'юнеско', 'ббс', 'mgm', 'жск', 'зоо', 'бсн', 'өұқ', 'оар', 'боак', 'эөкк', 'хтқо', 'әөк', 'жэк', 'хдо', 'спбму', 'аф', 'сбд', 'амт', 'гсдп', 'гсбп', 'эыдұ', 'нұсжп', 'шыұ', 'жтсх', 'хдп', 'эқк', 'фкққ', 'пиқ', 'өгк', 'мбф', 'маж', 'кота', 'тж', 'ук', 'обб', 'сбл', 'жхл', 'кмс', 'бмтрк', 'жққ', 'бхооо', 'мқо', 'ржмб', 'гулаг', 'жко', 'еэы', 'еаэы', 'кхдр', 'рфкп', 'рлдп', 'хвқ', 'мр', 'мт', 'кту', 'ртж', 'тим', 'мемдум', 'ксро', 'т.с.с', 'с.ш.', 'ш.б.', 'б.б.', 'руб', 'мин', 'акад.', 'ғ.', 'мм', 'мм.']).freeze 12 | PREPOSITIVE_ABBREVIATIONS = [].freeze 13 | NUMBER_ABBREVIATIONS = [].freeze 14 | end 15 | 16 | class Processor < PragmaticSegmenter::Processor 17 | private 18 | 19 | # Rubular: http://rubular.com/r/WRWy56Z5zp 20 | QuestionMarkFollowedByDashLowercaseRule = Rule.new(/(?<=\p{Ll})\?(?=\s*[-—]\s*\p{Ll})/, '&ᓷ&') 21 | # Rubular: http://rubular.com/r/lixxP7puSa 22 | ExclamationMarkFollowedByDashLowercaseRule = Rule.new(/(?<=\p{Ll})!(?=\s*[-—]\s*\p{Ll})/, '&ᓴ&') 23 | 24 | def between_punctuation(txt) 25 | super(txt) 26 | Rule.apply(txt, QuestionMarkFollowedByDashLowercaseRule, ExclamationMarkFollowedByDashLowercaseRule) 27 | end 28 | end 29 | 30 | class AbbreviationReplacer < AbbreviationReplacer 31 | SENTENCE_STARTERS = [].freeze 32 | 33 | SingleUpperCaseCyrillicLetterAtStartOfLineRule = Rule.new(/(?<=^[А-ЯЁ])\.(?=\s)/, '∯') 34 | SingleUpperCaseCyrillicLetterRule = Rule.new(/(?<=\s[А-ЯЁ])\.(?=\s)/, '∯') 35 | 36 | def replace 37 | super 38 | Rule.apply(@text, SingleUpperCaseCyrillicLetterAtStartOfLineRule, SingleUpperCaseCyrillicLetterRule) 39 | end 40 | end 41 | end 42 | end 43 | end 44 | 45 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/common/numbers.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | module PragmaticSegmenter 5 | module Languages 6 | module Common 7 | module Numbers 8 | # Rubular: http://rubular.com/r/oNyxBOqbyy 9 | PeriodBeforeNumberRule = Rule.new(/\.(?=\d)/, '∯') 10 | 11 | # Rubular: http://rubular.com/r/EMk5MpiUzt 12 | NumberAfterPeriodBeforeLetterRule = Rule.new(/(?<=\d)\.(?=\S)/, '∯') 13 | 14 | # Rubular: http://rubular.com/r/rf4l1HjtjG 15 | NewLineNumberPeriodSpaceLetterRule = Rule.new(/(?<=\r\d)\.(?=(\s\S)|\))/, '∯') 16 | 17 | # Rubular: http://rubular.com/r/HPa4sdc6b9 18 | StartLineNumberPeriodRule = Rule.new(/(?<=^\d)\.(?=(\s\S)|\))/, '∯') 19 | 20 | # Rubular: http://rubular.com/r/NuvWnKleFl 21 | StartLineTwoDigitNumberPeriodRule = Rule.new(/(?<=^\d\d)\.(?=(\s\S)|\))/, '∯') 22 | 23 | All = [ 24 | PeriodBeforeNumberRule, 25 | NumberAfterPeriodBeforeLetterRule, 26 | NewLineNumberPeriodSpaceLetterRule, 27 | StartLineNumberPeriodRule, 28 | StartLineTwoDigitNumberPeriodRule 29 | ] 30 | end 31 | 32 | 33 | SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/ 34 | 35 | # Rubular: http://rubular.com/r/NqCqv372Ix 36 | QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/ 37 | 38 | # 
Rubular: http://rubular.com/r/6flGnUMEVl 39 | PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/ 40 | 41 | # Rubular: http://rubular.com/r/TYzr4qOW1Q 42 | BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/ 43 | 44 | # Rubular: http://rubular.com/r/JMjlZHAT4g 45 | SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/ 46 | 47 | # Rubular: http://rubular.com/r/mQ8Es9bxtk 48 | CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/ 49 | 50 | NUMBERED_REFERENCE_REGEX = /(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)?\b\d{1,3}\])+|((\d{1,3}\s?){0,3}\d{1,3}))(\s)(?=[A-Z])/ 51 | 52 | # Rubular: http://rubular.com/r/yqa4Rit8EY 53 | PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯') 54 | 55 | # Rubular: http://rubular.com/r/NEv265G2X2 56 | KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯') 57 | 58 | # Rubular: http://rubular.com/r/xDkpFZ0EgH 59 | MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i 60 | 61 | module AmPmRules 62 | # Rubular: http://rubular.com/r/Vnx3m4Spc8 63 | UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.') 64 | 65 | # Rubular: http://rubular.com/r/AJMCotJVbW 66 | UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.') 67 | 68 | # Rubular: http://rubular.com/r/13q7SnOhgA 69 | LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.') 70 | 71 | # Rubular: http://rubular.com/r/DgUDq4mLz5 72 | LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.') 73 | 74 | All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule] 75 | end 76 | 77 | # This class searches for periods within an abbreviation and 78 | # replaces the periods. 79 | module SingleLetterAbbreviationRules 80 | # Rubular: http://rubular.com/r/e3H6kwnr6H 81 | SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=,?\s)/, '∯') 82 | 83 | # Rubular: http://rubular.com/r/gitvf0YWH4 84 | SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=,?\s)/, '∯') 85 | 86 | All = [ 87 | SingleUpperCaseLetterAtStartOfLineRule, 88 | SingleUpperCaseLetterRule 89 | ] 90 | end 91 | end 92 | end 93 | end 94 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/between_punctuation.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | module PragmaticSegmenter 5 | # This class searches for punctuation between quotes or parenthesis 6 | # and replaces it 7 | class BetweenPunctuation 8 | # Rubular: http://rubular.com/r/2YFrKWQUYi 9 | BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/ 10 | 11 | BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = /(?<=\s)‘(?:[^’]|’[a-zA-Z])*’/ 12 | 13 | # Rubular: http://rubular.com/r/3Pw1QlXOjd 14 | BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/ 15 | 16 | # Rubular: http://rubular.com/r/x6s4PZK8jc 17 | BETWEEN_QUOTE_ARROW_REGEX = /«(?>[^»\\]+|\\{2}|\\.)*»/ 18 | 19 | # Rubular: http://rubular.com/r/JbAIpKdlSq 20 | BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/ 21 | 22 | # Rubular: http://rubular.com/r/WX4AvnZvlX 23 | BETWEEN_SQUARE_BRACKETS_REGEX = /\[(?>[^\]\\]+|\\{2}|\\.)*\]/ 24 | 25 | # Rubular: http://rubular.com/r/6tTityPflI 26 | BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/ 27 | 28 | # Rubular: http://rubular.com/r/mXf8cW025o 29 | WORD_WITH_LEADING_APOSTROPHE = /(?<=\s)'(?:[^']|'[a-zA-Z])*'\S/ 30 | 31 | # Rubular: http://rubular.com/r/jTtDKfjxzr 32 | BETWEEN_EM_DASHES_REGEX = 
/\-\-(?>[^\-\-])*\-\-/ 33 | 34 | attr_reader :text 35 | def initialize(text:) 36 | @text = text 37 | end 38 | 39 | def replace 40 | sub_punctuation_between_quotes_and_parens(text) 41 | end 42 | 43 | private 44 | 45 | def sub_punctuation_between_quotes_and_parens(txt) 46 | sub_punctuation_between_single_quotes(txt) 47 | sub_punctuation_between_single_quote_slanted(txt) 48 | sub_punctuation_between_double_quotes(txt) 49 | sub_punctuation_between_square_brackets(txt) 50 | sub_punctuation_between_parens(txt) 51 | sub_punctuation_between_quotes_arrow(txt) 52 | sub_punctuation_between_em_dashes(txt) 53 | sub_punctuation_between_quotes_slanted(txt) 54 | end 55 | 56 | def sub_punctuation_between_parens(txt) 57 | PragmaticSegmenter::PunctuationReplacer.new( 58 | matches_array: txt.scan(BETWEEN_PARENS_REGEX), 59 | text: txt 60 | ).replace 61 | end 62 | 63 | def sub_punctuation_between_square_brackets(txt) 64 | PragmaticSegmenter::PunctuationReplacer.new( 65 | matches_array: txt.scan(BETWEEN_SQUARE_BRACKETS_REGEX), 66 | text: txt 67 | ).replace 68 | end 69 | 70 | def sub_punctuation_between_single_quotes(txt) 71 | unless !(txt !~ WORD_WITH_LEADING_APOSTROPHE) && txt !~ /'\s/ 72 | PragmaticSegmenter::PunctuationReplacer.new( 73 | matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX), 74 | text: txt, 75 | match_type: 'single' 76 | ).replace 77 | end 78 | end 79 | 80 | def sub_punctuation_between_single_quote_slanted(txt) 81 | PragmaticSegmenter::PunctuationReplacer.new( 82 | matches_array: txt.scan(BETWEEN_SINGLE_QUOTE_SLANTED_REGEX), 83 | text: txt 84 | ).replace 85 | end 86 | 87 | def sub_punctuation_between_double_quotes(txt) 88 | PragmaticSegmenter::PunctuationReplacer.new( 89 | matches_array: btwn_dbl_quote(txt), 90 | text: txt 91 | ).replace 92 | end 93 | 94 | def btwn_dbl_quote(txt) 95 | txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX) 96 | end 97 | 98 | def sub_punctuation_between_quotes_arrow(txt) 99 | PragmaticSegmenter::PunctuationReplacer.new( 100 | matches_array: txt.scan(BETWEEN_QUOTE_ARROW_REGEX), 101 | text: txt 102 | ).replace 103 | end 104 | 105 | def sub_punctuation_between_em_dashes(txt) 106 | PragmaticSegmenter::PunctuationReplacer.new( 107 | matches_array: txt.scan(BETWEEN_EM_DASHES_REGEX), 108 | text: txt 109 | ).replace 110 | end 111 | 112 | def sub_punctuation_between_quotes_slanted(txt) 113 | PragmaticSegmenter::PunctuationReplacer.new( 114 | matches_array: txt.scan(BETWEEN_QUOTE_SLANTED_REGEX), 115 | text: txt 116 | ).replace 117 | end 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/cleaner.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | require_relative 'cleaner/rules' 5 | 6 | module PragmaticSegmenter 7 | # This is an opinionated class that removes errant newlines, 8 | # xhtml, inline formatting, etc. 9 | class Cleaner 10 | include Rules 11 | 12 | attr_reader :text, :doc_type 13 | def initialize(text:, doc_type: nil, language: Languages::Common) 14 | @text = text.dup 15 | @doc_type = doc_type 16 | @language = language 17 | end 18 | 19 | # Clean text of unwanted formatting 20 | # 21 | # Example: 22 | # >> text = "This is a sentence\ncut off in the middle because pdf." 23 | # >> PragmaticSegmenter::Cleaner.new(text: text).clean 24 | # => "This is a sentence cut off in the middle because pdf."
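# (Passing doc_type: 'pdf' additionally applies the PDF-specific newline rules in Rules::PDF.)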
25 | # 26 | # Arguments: 27 | # text: (String) *required 28 | # language: (String) *optional 29 | # (two character ISO 639-1 code e.g. 'en') 30 | # doc_type: (String) *optional 31 | # (e.g. 'pdf') 32 | 33 | def clean 34 | return unless text 35 | remove_all_newlines 36 | replace_double_newlines 37 | replace_newlines 38 | replace_escaped_newlines 39 | 40 | Rule.apply(@text, HTML::All) 41 | 42 | replace_punctuation_in_brackets 43 | Rule.apply(@text, InlineFormattingRule) 44 | clean_quotations 45 | clean_table_of_contents 46 | check_for_no_space_in_between_sentences 47 | clean_consecutive_characters 48 | end 49 | 50 | private 51 | 52 | def abbreviations 53 | @language::Abbreviation::ABBREVIATIONS 54 | end 55 | 56 | def check_for_no_space_in_between_sentences 57 | words = @text.split(' ') 58 | words.each do |word| 59 | search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule) 60 | search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule) 61 | end 62 | @text 63 | end 64 | 65 | def replace_punctuation_in_brackets 66 | @text.dup.gsub!(/\[(?:[^\]])*\]/) do |match| 67 | @text.gsub!(/#{Regexp.escape(match)}/, match.dup.gsub!(/\?/, '&ᓷ&')) if match.include?('?') 68 | end 69 | end 70 | 71 | def search_for_connected_sentences(word, txt, regex, rule) 72 | if word =~ regex 73 | unless URL_EMAIL_KEYWORDS.any? { |web| word =~ /#{web}/ } 74 | unless abbreviations.any? { |abbr| word =~ /#{abbr}/i } 75 | new_word = Rule.apply(word.dup, rule) 76 | txt.gsub!(/#{Regexp.escape(word)}/, new_word) 77 | end 78 | end 79 | end 80 | end 81 | 82 | def remove_all_newlines 83 | remove_newline_in_middle_of_sentence 84 | remove_newline_in_middle_of_word 85 | end 86 | 87 | def remove_newline_in_middle_of_sentence 88 | @text.gsub!(/(?:[^\.])*/) do |match| 89 | match.gsub(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '') 90 | end 91 | @text 92 | end 93 | 94 | def remove_newline_in_middle_of_word 95 | Rule.apply @text, NewLineInMiddleOfWordRule 96 | end 97 | 98 | def replace_escaped_newlines 99 | Rule.apply @text, EscapedNewLineRule, EscapedCarriageReturnRule, 100 | TypoEscapedNewLineRule, TypoEscapedCarriageReturnRule 101 | end 102 | 103 | def replace_double_newlines 104 | Rule.apply @text, DoubleNewLineWithSpaceRule, DoubleNewLineRule 105 | end 106 | 107 | def replace_newlines 108 | if doc_type.eql?('pdf') 109 | remove_pdf_line_breaks 110 | else 111 | Rule.apply @text, NewLineFollowedByPeriodRule, 112 | ReplaceNewlineWithCarriageReturnRule 113 | end 114 | end 115 | 116 | def remove_pdf_line_breaks 117 | Rule.apply @text, NewLineFollowedByBulletRule, 118 | 119 | PDF::NewLineInMiddleOfSentenceRule, 120 | PDF::NewLineInMiddleOfSentenceNoSpacesRule 121 | end 122 | 123 | def clean_quotations 124 | Rule.apply @text, QuotationsFirstRule, QuotationsSecondRule 125 | end 126 | 127 | def clean_table_of_contents 128 | Rule.apply @text, TableOfContentsRule, ConsecutivePeriodsRule, 129 | ConsecutiveForwardSlashRule 130 | end 131 | 132 | def clean_consecutive_characters 133 | Rule.apply @text, ConsecutivePeriodsRule, ConsecutiveForwardSlashRule 134 | end 135 | end 136 | end 137 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/processor.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | require 'pragmatic_segmenter/punctuation_replacer' 5 | require 
'pragmatic_segmenter/between_punctuation' 6 | 7 | 8 | require 'pragmatic_segmenter/list' 9 | require 'pragmatic_segmenter/abbreviation_replacer' 10 | require 'pragmatic_segmenter/exclamation_words' 11 | 12 | module PragmaticSegmenter 13 | # This class processing segmenting the text. 14 | class Processor 15 | 16 | attr_reader :text 17 | def initialize(language: Languages::Common) 18 | @language = language 19 | end 20 | 21 | def process(text:) 22 | @text = List.new(text: text).add_line_break 23 | replace_abbreviations 24 | replace_numbers 25 | replace_continuous_punctuation 26 | replace_periods_before_numeric_references 27 | Rule.apply(@text, @language::Abbreviations::WithMultiplePeriodsAndEmailRule) 28 | Rule.apply(@text, @language::GeoLocationRule) 29 | Rule.apply(@text, @language::FileFormatRule) 30 | split_into_segments 31 | end 32 | 33 | private 34 | 35 | def split_into_segments 36 | check_for_parens_between_quotes(@text).split("\r") 37 | .map! { |segment| Rule.apply(segment, @language::SingleNewLineRule, @language::EllipsisRules::All) } 38 | .map { |segment| check_for_punctuation(segment) }.flatten 39 | .map! { |segment| Rule.apply(segment, @language::SubSymbolsRules::All) } 40 | .map { |segment| post_process_segments(segment) } 41 | .flatten.compact.delete_if(&:empty?) 42 | .map! { |segment| Rule.apply(segment, @language::SubSingleQuoteRule) } 43 | end 44 | 45 | def post_process_segments(txt) 46 | return txt if txt.length < 2 && txt =~ /\A[a-zA-Z]*\Z/ 47 | return if consecutive_underscore?(txt) || txt.length < 2 48 | Rule.apply( 49 | txt, 50 | @language::ReinsertEllipsisRules::All, 51 | @language::ExtraWhiteSpaceRule 52 | ) 53 | 54 | if txt =~ @language::QUOTATION_AT_END_OF_SENTENCE_REGEX 55 | txt.split(@language::SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX) 56 | else 57 | txt.tr("\n", '').strip 58 | end 59 | end 60 | 61 | def check_for_parens_between_quotes(txt) 62 | return txt unless txt =~ @language::PARENS_BETWEEN_DOUBLE_QUOTES_REGEX 63 | txt.gsub!(@language::PARENS_BETWEEN_DOUBLE_QUOTES_REGEX) do |match| 64 | match.gsub(/\s(?=\()/, "\r").gsub(/(?<=\))\s/, "\r") 65 | end 66 | end 67 | 68 | def replace_continuous_punctuation 69 | @text.gsub!(@language::CONTINUOUS_PUNCTUATION_REGEX) do |match| 70 | match.gsub(/!/, '&ᓴ&').gsub(/\?/, '&ᓷ&') 71 | end 72 | end 73 | 74 | def replace_periods_before_numeric_references 75 | @text.gsub!(@language::NUMBERED_REFERENCE_REGEX, "∯\\2\r\\7") 76 | end 77 | 78 | def consecutive_underscore?(txt) 79 | # Rubular: http://rubular.com/r/fTF2Ff3WBL 80 | txt.gsub(/_{3,}/, '').length.eql?(0) 81 | end 82 | 83 | def check_for_punctuation(txt) 84 | if @language::Punctuations.any? { |p| txt.include?(p) } 85 | process_text(txt) 86 | else 87 | txt 88 | end 89 | end 90 | 91 | def process_text(txt) 92 | txt << 'ȸ' unless @language::Punctuations.any? { |p| txt[-1].include?(p) } 93 | ExclamationWords.apply_rules(txt) 94 | between_punctuation(txt) 95 | txt = Rule.apply( 96 | txt, 97 | @language::DoublePunctuationRules::All, 98 | @language::QuestionMarkInQuotationRule, 99 | @language::ExclamationPointRules::All 100 | ) 101 | txt = List.new(text: txt).replace_parens 102 | sentence_boundary_punctuation(txt) 103 | end 104 | 105 | def replace_numbers 106 | Rule.apply @text, @language::Numbers::All 107 | end 108 | 109 | def abbreviations_replacer 110 | if defined? 
@language::AbbreviationReplacer 111 | @language::AbbreviationReplacer 112 | else 113 | AbbreviationReplacer 114 | end 115 | end 116 | 117 | def replace_abbreviations 118 | @text = abbreviations_replacer.new(text: @text, language: @language).replace 119 | end 120 | 121 | def between_punctuation_processor 122 | if defined? @language::BetweenPunctuation 123 | @language::BetweenPunctuation 124 | else 125 | BetweenPunctuation 126 | end 127 | end 128 | 129 | def between_punctuation(txt) 130 | between_punctuation_processor.new(text: txt).replace 131 | end 132 | 133 | def sentence_boundary_punctuation(txt) 134 | txt = Rule.apply txt, @language::ReplaceColonBetweenNumbersRule if defined? @language::ReplaceColonBetweenNumbersRule 135 | txt = Rule.apply txt, @language::ReplaceNonSentenceBoundaryCommaRule if defined? @language::ReplaceNonSentenceBoundaryCommaRule 136 | 137 | txt.scan(@language::SENTENCE_BOUNDARY_REGEX) 138 | end 139 | end 140 | end 141 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/deutsch.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Deutsch 6 | include Languages::Common 7 | 8 | module Abbreviation 9 | ABBREVIATIONS = Set.new(['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt', 'univ.-prof', 'o.univ.-prof', 'ao.univ.prof', 'ass.prof', 'hon.prof', 'univ.-doz', 'univ.ass', 'stud.ass', 'projektass', 'ass', 'di', 'dipl.-ing', 'mag']).freeze 10 | NUMBER_ABBREVIATIONS = Set.new(['art', 'ca', 'no', 'nos', 'nr', 'pp']).freeze 11 | PREPOSITIVE_ABBREVIATIONS = [].freeze 12 | end 13 | 14 | # Rubular: http://rubular.com/r/OdcXBsub0w 15 | BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/ 16 | 17 | # Rubular: http://rubular.com/r/2UskIupGgP 18 | SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/ 19 | 20 | # Rubular: http://rubular.com/r/TkZomF9tTM 21 | BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/ 22 | 23 | 24 | module Numbers 25 | # Rubular: http://rubular.com/r/hZxoyQwKT1 26 | NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯') 27 | 28 | # Rubular: http://rubular.com/r/ityNMwdghj 29 | NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯') 30 | 31 | All = [ 32 | Common::Numbers::All, 33 | NumberPeriodSpaceRule, 34 | NegativeNumberPeriodSpaceRule 35 | ] 36 | end 37 | 38 | MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 
'Dezember'].freeze 39 | 40 | # Rubular: http://rubular.com/r/B4X33QKIL8 41 | SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯') 42 | 43 | # Rubular: http://rubular.com/r/iUNSkCuso0 44 | SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯') 45 | 46 | class Processor < PragmaticSegmenter::Processor 47 | private 48 | 49 | def replace_numbers 50 | Rule.apply @text, Numbers::All 51 | 52 | replace_period_in_deutsch_dates 53 | end 54 | 55 | def replace_period_in_deutsch_dates 56 | MONTHS.each do |month| 57 | # Rubular: http://rubular.com/r/zlqgj7G5dA 58 | @text.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯') 59 | end 60 | end 61 | end 62 | 63 | class AbbreviationReplacer < AbbreviationReplacer 64 | 65 | SENTENCE_STARTERS = %w( 66 | Am Auch Auf Bei Da Das Der Die Ein Eine Es Für Heute Ich Im In 67 | Ist Jetzt Mein Mit Nach So Und Warum Was Wenn Wer Wie Wir 68 | ).freeze 69 | 70 | def replace 71 | @text = Rule.apply( 72 | text, 73 | @language::PossessiveAbbreviationRule, 74 | @language::SingleLetterAbbreviationRules::All, 75 | SingleLowerCaseLetterRule, 76 | SingleLowerCaseLetterAtStartOfLineRule) 77 | 78 | @text = search_for_abbreviations_in_string(@text) 79 | @text = replace_multi_period_abbreviations(@text) 80 | Rule.apply(@text, Languages::Common::AmPmRules::All) 81 | replace_abbreviation_as_sentence_boundary(@text) 82 | end 83 | 84 | private 85 | 86 | def scan_for_replacements(txt, am, index, character_array) 87 | txt.gsub!(/(?<=#{am})\.(?=\s)/, '∯') 88 | txt 89 | end 90 | end 91 | 92 | class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation 93 | private 94 | 95 | def btwn_dbl_quote(txt) 96 | if txt.include?('„') 97 | btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_DE_REGEX) 98 | txt.scan(SPLIT_DOUBLE_QUOTES_DE_REGEX).each do |q| 99 | btwn_dbl_quote << q 100 | end 101 | elsif txt.include?(',,') 102 | btwn_dbl_quote = txt.scan(BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX) 103 | end 104 | btwn_dbl_quote 105 | end 106 | end 107 | end 108 | end 109 | end 110 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/abbreviation_replacer.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | require 'unicode' 5 | 6 | module PragmaticSegmenter 7 | # This class searches for periods within an abbreviation and 8 | # replaces the periods. 9 | class AbbreviationReplacer 10 | 11 | attr_reader :text 12 | def initialize(text:, language: ) 13 | @text = text.dup 14 | @language = language 15 | end 16 | 17 | def replace 18 | Rule.apply(@text, 19 | @language::PossessiveAbbreviationRule, 20 | @language::KommanditgesellschaftRule, 21 | @language::SingleLetterAbbreviationRules::All) 22 | 23 | @text = search_for_abbreviations_in_string(@text) 24 | @text = replace_multi_period_abbreviations(@text) 25 | Rule.apply(@text, @language::AmPmRules::All) 26 | replace_abbreviation_as_sentence_boundary(@text) 27 | end 28 | 29 | private 30 | 31 | def search_for_abbreviations_in_string(txt) 32 | original = txt.dup 33 | downcased = Unicode::downcase(txt) 34 | @language::Abbreviation::ABBREVIATIONS.each do |abbreviation| 35 | stripped = abbreviation.strip 36 | next unless downcased.include?(stripped) 37 | abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(stripped)}/i) 38 | next if abbrev_match.empty? 
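# Capture the single character that follows each occurrence of the abbreviation; scan_for_replacements below checks its case to decide whether the period is a real sentence boundary.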
39 | next_word_start = /(?<=#{Regexp.escape(stripped)} ).{1}/ 40 | character_array = @text.scan(next_word_start) 41 | abbrev_match.each_with_index do |am, index| 42 | txt = scan_for_replacements(txt, am, index, character_array) 43 | end 44 | end 45 | txt 46 | end 47 | 48 | def scan_for_replacements(txt, am, index, character_array) 49 | character = character_array[index] 50 | prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS 51 | number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS 52 | upper = /[[:upper:]]/.match(character.to_s) 53 | if upper.nil? || prepositive.include?(Unicode::downcase(am.strip)) 54 | if prepositive.include?(Unicode::downcase(am.strip)) 55 | txt = replace_prepositive_abbr(txt, am) 56 | elsif number_abbr.include?(Unicode::downcase(am.strip)) 57 | txt = replace_pre_number_abbr(txt, am) 58 | else 59 | txt = replace_period_of_abbr(txt, am) 60 | end 61 | end 62 | txt 63 | end 64 | 65 | def replace_abbreviation_as_sentence_boundary(txt) 66 | # As we are being conservative and keeping ambiguous 67 | # sentence boundaries as one sentence instead of 68 | # splitting into two, we can split at words that 69 | # we know for certain never follow these abbreviations. 70 | # Some might say that the set of words that follow an 71 | # abbreviation such as U.S. (i.e. U.S. Government) is smaller than 72 | # the set of words that could start a sentence and 73 | # never follow U.S. However, we are being conservative 74 | # and not splitting by default, so we need to look for places 75 | # where we definitely can split. Obviously SENTENCE_STARTERS 76 | # will never cover all cases, but as the gem is named 77 | # 'Pragmatic Segmenter' we need to be pragmatic 78 | # and try to cover the words that most often start a 79 | # sentence but could never follow one of the abbreviations below. 80 | 81 | # Rubular: http://rubular.com/r/PkBQ3PVBS8 82 | @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word| 83 | escaped = Regexp.escape(word) 84 | regex = /(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s#{escaped}\s)/ 85 | txt.gsub!(regex, '\1.') 86 | end 87 | txt 88 | end 89 | 90 | def replace_multi_period_abbreviations(txt) 91 | mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX) 92 | return txt if mpa.empty? 
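# Swap the periods inside each multi-period match for the placeholder ∯ (e.g. "U.S.A." becomes "U∯S∯A∯"); SubSymbolsRules restores them to periods later in processing.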
93 | mpa.each do |r| 94 | txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}") 95 | end 96 | txt 97 | end 98 | 99 | def replace_pre_number_abbr(txt, abbr) 100 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯') 101 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯') 102 | txt 103 | end 104 | 105 | def replace_prepositive_abbr(txt, abbr) 106 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯') 107 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯') 108 | txt 109 | end 110 | 111 | def replace_period_of_abbr(txt, abbr) 112 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯') 113 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯') 114 | txt 115 | end 116 | 117 | def replace_possessive_abbreviations(txt) 118 | txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯') 119 | txt 120 | end 121 | end 122 | end 123 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/kazakh_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Kazakh, "(kk)" do 4 | 5 | context "Golden Rules" do 6 | it "Simple period to end sentence #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "Мұхитқа тікелей шыға алмайтын мемлекеттердің ішінде Қазақстан - ең үлкені.", language: "kk") 8 | expect(ps.segment).to eq(["Мұхитқа тікелей шыға алмайтын мемлекеттердің ішінде Қазақстан - ең үлкені."]) 9 | end 10 | 11 | it "Question mark to end sentence #002" do 12 | ps = PragmaticSegmenter::Segmenter.new(text: "Оқушылар үйі, Достық даңғылы, Абай даналығы, ауыл шаруашылығы – кім? не?", language: "kk") 13 | expect(ps.segment).to eq(["Оқушылар үйі, Достық даңғылы, Абай даналығы, ауыл шаруашылығы – кім?", "не?"]) 14 | end 15 | 16 | it "Parenthetical inside sentence #003" do 17 | ps = PragmaticSegmenter::Segmenter.new(text: "Әр түрлі өлшемнің атауы болып табылатын м (метр), см (сантиметр), кг (киллограмм), т (тонна), га (гектар), ц (центнер), т. б. (тағы басқа), тәрізді белгілер де қысқарған сөздер болып табылады.", language: "kk") 18 | expect(ps.segment).to eq(["Әр түрлі өлшемнің атауы болып табылатын м (метр), см (сантиметр), кг (киллограмм), т (тонна), га (гектар), ц (центнер), т. б. (тағы басқа), тәрізді белгілер де қысқарған сөздер болып табылады."]) 19 | end 20 | 21 | it "Two letter abbreviation to end sentence #004" do 22 | ps = PragmaticSegmenter::Segmenter.new(text: "Мысалы: обкомға (облыстық комитетке) барды, ауаткомда (аудандық атқару комитетінде) болды, педучилищеге (педагогтік училищеге) түсті, медпункттің (медициналық пункттің) алдында т. б.", language: "kk") 23 | expect(ps.segment).to eq(["Мысалы: обкомға (облыстық комитетке) барды, ауаткомда (аудандық атқару комитетінде) болды, педучилищеге (педагогтік училищеге) түсті, медпункттің (медициналық пункттің) алдында т. 
б."]) 24 | end 25 | 26 | it "Number as non sentence boundary #005" do 27 | ps = PragmaticSegmenter::Segmenter.new(text: "Елдің жалпы ішкі өнімі ЖІӨ (номинал) = $225.619 млрд (2014)", language: "kk") 28 | expect(ps.segment).to eq(["Елдің жалпы ішкі өнімі ЖІӨ (номинал) = $225.619 млрд (2014)"]) 29 | end 30 | 31 | it "No whitespace between sentence boundary #006" do 32 | ps = PragmaticSegmenter::Segmenter.new(text: "Ресейдiң әлеуметтiк-экономикалық жағдайы.XVIII ғасырдың бiрiншi ширегiнде Ресейге тән нәрсе.", language: "kk") 33 | expect(ps.segment).to eq(["Ресейдiң әлеуметтiк-экономикалық жағдайы.", "XVIII ғасырдың бiрiншi ширегiнде Ресейге тән нәрсе."]) 34 | end 35 | 36 | it "Dates within sentence #007" do 37 | ps = PragmaticSegmenter::Segmenter.new(text: "(«Егемен Қазақстан», 7 қыркүйек 2012 жыл. №590-591); Бұл туралы кеше санпедқадағалау комитетінің облыыстық департаменті хабарлады. («Айқын», 23 сəуір 2010 жыл. № 70).", language: "kk") 38 | expect(ps.segment).to eq(["(«Егемен Қазақстан», 7 қыркүйек 2012 жыл. №590-591); Бұл туралы кеше санпедқадағалау комитетінің облыыстық департаменті хабарлады.", "(«Айқын», 23 сəуір 2010 жыл. № 70)."]) 39 | end 40 | 41 | it "Multi period abbreviation within sentence #008" do 42 | ps = PragmaticSegmenter::Segmenter.new(text: "Иран революциясы (1905 — 11) және азаматтық қозғалыс (1918 — 21) кезінде А. Фарахани, М. Кермани, М. Т. Бехар, т.б. ақындар демократиялық идеяның жыршысы болды.", language: "kk") 43 | expect(ps.segment).to eq(["Иран революциясы (1905 — 11) және азаматтық қозғалыс (1918 — 21) кезінде А. Фарахани, М. Кермани, М. Т. Бехар, т.б. ақындар демократиялық идеяның жыршысы болды."]) 44 | end 45 | 46 | it "Web addresses #009" do 47 | ps = PragmaticSegmenter::Segmenter.new(text: "Владимир Федосеев: Аттар магиясы енді жоқ http://www.vremya.ru/2003/179/10/80980.html", language: "kk") 48 | expect(ps.segment).to eq(["Владимир Федосеев: Аттар магиясы енді жоқ http://www.vremya.ru/2003/179/10/80980.html"]) 49 | end 50 | 51 | it "Question mark not at end of sentence #010" do 52 | ps = PragmaticSegmenter::Segmenter.new(text: "Бірақ оның енді не керегі бар? — деді.", language: "kk") 53 | expect(ps.segment).to eq(["Бірақ оның енді не керегі бар? — деді."]) 54 | end 55 | 56 | it "Exclamation mark not at end of sentence #011" do 57 | ps = PragmaticSegmenter::Segmenter.new(text: "Сондықтан шапаныма жегізіп отырғаным! - деп, жауап береді.", language: "kk") 58 | expect(ps.segment).to eq(["Сондықтан шапаныма жегізіп отырғаным! - деп, жауап береді."]) 59 | end 60 | end 61 | 62 | describe '#segment' do 63 | it 'correctly segments text #001' do 64 | ps = PragmaticSegmenter::Segmenter.new(text: "Б.з.б. 6 – 3 ғасырларда конфуцийшілдік, моизм, легизм мектептерінің қалыптасуы нәтижесінде Қытай философиясы пайда болды.", language: 'kk') 65 | expect(ps.segment).to eq(["Б.з.б. 
6 – 3 ғасырларда конфуцийшілдік, моизм, легизм мектептерінің қалыптасуы нәтижесінде Қытай философиясы пайда болды."]) 66 | end 67 | 68 | it 'correctly segments text #002' do 69 | ps = PragmaticSegmenter::Segmenter.new(text: "'Та марбута' тек сөз соңында екі түрде жазылады:", language: "kk") 70 | expect(ps.segment).to eq(["'Та марбута' тек сөз соңында екі түрде жазылады:"]) 71 | end 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/japanese_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Japanese, "(ja)" do 4 | 5 | context "Golden Rules" do 6 | it "Simple period to end sentence #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "これはペンです。それはマーカーです。", language: "ja") 8 | expect(ps.segment).to eq(["これはペンです。", "それはマーカーです。"]) 9 | end 10 | 11 | it "Question mark to end sentence #002" do 12 | ps = PragmaticSegmenter::Segmenter.new(text: "それは何ですか?ペンですか?", language: "ja") 13 | expect(ps.segment).to eq(["それは何ですか?", "ペンですか?"]) 14 | end 15 | 16 | it "Exclamation point to end sentence #003" do 17 | ps = PragmaticSegmenter::Segmenter.new(text: "良かったね!すごい!", language: "ja") 18 | expect(ps.segment).to eq(["良かったね!", "すごい!"]) 19 | end 20 | 21 | it "Quotation #004" do 22 | ps = PragmaticSegmenter::Segmenter.new(text: "自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。2%台後半を目指すとする方向で最終調整に入りました。", language: "ja") 23 | expect(ps.segment).to eq(["自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。", "2%台後半を目指すとする方向で最終調整に入りました。"]) 24 | end 25 | 26 | it "Errant newlines in the middle of sentences #005" do 27 | ps = PragmaticSegmenter::Segmenter.new(text: "これは父の\n家です。", language: "ja") 28 | expect(ps.segment).to eq(["これは父の家です。"]) 29 | end 30 | end 31 | 32 | describe '#segment' do 33 | it 'correctly segments text #001' do 34 | ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です。 \nこれは山です(これは山です。これは山です)。これは山です、これは山です、これは山です、これは山です(これは山です。これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です。 \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です。 \n※1 これは山です。 \n2.)これは山です、これは山です、これは山です、これは山です。 \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です。 \n4.)これは山です、これは山です(これは山です、これは山です、これは山です。これは山です)これは山です、これは山です(これは山です、これは山です)。 \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です。 \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja') 35 | expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です。", "これは山です(これは山です。これは山です)。", "これは山です、これは山です、これは山です、これは山です(これは山です。これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です。", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です。", "※1 これは山です。", "2.)これは山です、これは山です、これは山です、これは山です。", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です。", "4.)これは山です、これは山です(これは山です、これは山です、これは山です。これは山です)これは山です、これは山です(これは山です、これは山です)。", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です。", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"]) 36 | end 37 | 38 | it 'correctly segments text #002' do 39 | ps = PragmaticSegmenter::Segmenter.new(text: "フフーの\n主たる債務", language: 'ja') 40 | expect(ps.segment).to eq(["フフーの主たる債務"]) 41 | end 42 | 43 | it 'correctly segments text #003' do 44 | ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です 
\nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です. \nこれは山です(これは山です.これは山です).これは山です、これは山です、これは山です、これは山です(これは山です.これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です. \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です. \n※1 これは山です. \n2.)これは山です、これは山です、これは山です、これは山です. \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です. \n4.)これは山です、これは山です(これは山です、これは山です、これは山です.これは山です)これは山です、これは山です(これは山です、これは山です). \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です. \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja') 45 | expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です.", "これは山です(これは山です.これは山です).", "これは山です、これは山です、これは山です、これは山です(これは山です.これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です.", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です.", "※1 これは山です.", "2.)これは山です、これは山です、これは山です、これは山です.", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です.", "4.)これは山です、これは山です(これは山です、これは山です、これは山です.これは山です)これは山です、これは山です(これは山です、これは山です).", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です.", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"]) 46 | end 47 | 48 | it 'correctly segments text #004' do 49 | ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です! \nこれは山です(これは山です!これは山です)!これは山です、これは山です、これは山です、これは山です(これは山です!これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です! \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です! \n※1 これは山です! \n2.)これは山です、これは山です、これは山です、これは山です! \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です! \n4.)これは山です、これは山です(これは山です、これは山です、これは山です!これは山です)これは山です、これは山です(これは山です、これは山です)! \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です! \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja') 50 | expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です!", "これは山です(これは山です!これは山です)!", "これは山です、これは山です、これは山です、これは山です(これは山です!これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です!", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です!", "※1 これは山です!", "2.)これは山です、これは山です、これは山です、これは山です!", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です!", "4.)これは山です、これは山です(これは山です、これは山です、これは山です!これは山です)これは山です、これは山です(これは山です、これは山です)!", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です!", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"]) 51 | end 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /spec/performance_spec.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | require 'benchmark' 3 | require 'spec_helper' 4 | require 'stackprof' 5 | 6 | RSpec.describe PragmaticSegmenter::Segmenter do 7 | 8 | # Speed benchmarks tests 9 | 10 | # it 'is fast' do 11 | # string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. 
She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 100 12 | # benchmark do 13 | # PragmaticSegmenter::Segmenter.new(text: string, language: 'en').segment 14 | # end 15 | # data = StackProf.run(mode: :cpu, interval: 1000) do 16 | # string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. 
The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 10 17 | # PragmaticSegmenter::Segmenter.new(text: string, language: 'en').segment 18 | # end 19 | # puts StackProf::Report.new(data).print_text 20 | # end 21 | 22 | end 23 | 24 | def benchmark(&block) 25 | block.call 26 | time = Benchmark.realtime { block.call } 27 | puts "RUNTIME: #{time}" 28 | end 29 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | 0.3.24 (2024-08-12): 2 | 3 | * Bug Fix: Catastrophic backtracking in regular expression for numerical references 4 | * Improvement: Remove unicode dependency 5 | 6 | 0.3.23 (2021-05-03): 7 | 8 | * Improvement: Refactor for Ruby 3.0 compatibility 9 | 10 | 0.3.22 (2018-09-23): 11 | 12 | * Improvement: Initial support for Kazakh 13 | 14 | 0.3.21 (2018-08-30): 15 | 16 | * Improvement: Add support for file formats 17 | * Improvement: Add support for numeric references at the end of a sentence (i.e. Wikipedia references) 18 | 19 | 0.3.20 (2018-08-28): 20 | 21 | * Improvement: Handle slanted single quotation as a single quote 22 | * Bug Fix: The text contains a single character abbreviation as part of a list 23 | * Bug Fix: Chinese book quotes 24 | * Improvement: Add viz as abbreviation 25 | 26 | 0.3.19 (2018-07-19): 27 | 28 | * Bug Fix: A parenthetical following an abbreviation is now included as part of the same segment. Example: "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)." is now treated as one segment. 29 | 30 | 0.3.18 (2018-03-27): 31 | 32 | * Improvement: Performance optimizations 33 | 34 | 0.3.17 (2017-12-07): 35 | 36 | * Bug Fix: Regex for parsing HTML 37 | 38 | 0.3.16 (2017-11-13): 39 | 40 | * Improvement: Support for Danish 41 | 42 | 0.3.15 (2017-06-28): 43 | 44 | * Improvement: Handle em dashes that appear in the middle of a sentence and include a sentence ending punctuation mark 45 | 46 | 0.3.14 (2017-06-28): 47 | 48 | * Improvement: Add English abbreviation Rs. 
to denote the Indian currency 49 | 50 | 0.3.13 (2017-01-17): 51 | 52 | * Bug Fix: Unexpected sentence break between abbreviation and hyphen 53 | 54 | 0.3.12 (2016-12-12): 55 | 56 | * Bug Fix: Issue with words with leading apostrophes 57 | 58 | 0.3.11 (2016-11-08): 59 | 60 | * Improvement: Update German abbreviation list 61 | * Bug Fix: Refactor 'remove_newline_in_middle_of_sentence' method 62 | 63 | 0.3.10 (2016-07-01): 64 | 65 | * Bug Fix: Change load order of dependencies 66 | 67 | 0.3.9 (2016-06-16): 68 | 69 | * Improvement: Remove `guard-rspec` development dependency 70 | 71 | 0.3.8 (2016-03-03): 72 | 73 | * Bug Fix: Fix bug that cleaned away single letter segments 74 | 75 | 0.3.7 (2016-01-12): 76 | 77 | * Improvement: Add `unicode` gem and use it for downcasing to better handle Cyrillic languages 78 | 79 | 0.3.6 (2016-01-05): 80 | 81 | * Improvement: Refactor SENTENCE_STARTERS to each individual language and add SENTENCE_STARTERS for German 82 | 83 | 0.3.5 (2016-01-04): 84 | 85 | * Performance: Reduce GC by replacing #gsub with #gsub! where possible 86 | 87 | 0.3.4 (2015-12-22): 88 | 89 | * Improvement: Large refactor 90 | 91 | 0.3.3 (2015-05-27): 92 | 93 | * Bug Fix: Fix cleaner bug 94 | 95 | 0.3.2 (2015-05-27): 96 | 97 | * Improvement: Add English abbreviations 98 | 99 | 0.3.1 (2015-03-02): 100 | 101 | * Bug Fix: Fix undefined method 'gsub!' for nil:NilClass issue 102 | 103 | 0.3.0 (2015-02-04): 104 | 105 | * Improvement: Add support for square brackets 106 | * Improvement: Add support for continuous exclamation points or question marks or combinations of both 107 | * Bug Fix: Fix Roman numeral support 108 | * Improvement: Add English abbreviations 109 | 110 | 0.2.0 (2015-01-26): 111 | 112 | * Improvement: Add Dutch Golden Rules and abbreviations 113 | * Improvement: Update README with additional tools 114 | * Improvement: Update segmentation test scores in README with results of new Golden Rule tests 115 | * Improvement: Add Polish abbreviations 116 | 117 | 0.1.8 (2015-01-22): 118 | 119 | * Bug Fix: Fix bug in splitting new sentence after single quotes 120 | 121 | 0.1.7 (2015-01-22): 122 | 123 | * Improvement: Add Alice in Wonderland specs 124 | * Bug Fix: Fix parenthesis between double quotations bug 125 | * Bug Fix: Fix split after quotation ending in dash bug 126 | 127 | 0.1.6 (2015-01-16): 128 | 129 | * Bug Fix: Fix bug in numbered list finder (ignore longer digits) 130 | 131 | 0.1.5 (2015-01-13): 132 | 133 | * Bug Fix: Fix comma at end of quotation bug 134 | 135 | 0.1.4 (2015-01-13): 136 | 137 | * Bug Fix: Fix missing abbreviations 138 | 139 | 0.1.3 (2015-01-13): 140 | 141 | * Improvement: Improve punctuation in bracket replacement 142 | 143 | 0.1.2 (2015-01-13): 144 | 145 | * Bug Fix: Fix missing abbreviations 146 | * Improvement: Add footnote rule to `cleaner.rb` 147 | 148 | 0.1.1 (2015-01-12): 149 | 150 | * Bug Fix: Fix handling of German dates 151 | 152 | 0.1.0 (2015-01-12): 153 | 154 | * Improvement: Add Kommanditgesellschaft Rule 155 | 156 | 0.0.9 (2015-01-12): 157 | 158 | * Improvement: Improve handling of alphabetical and Roman numeral lists 159 | 160 | 0.0.8 (2015-01-12): 161 | 162 | * Bug Fix: Fix error in `list.rb` 163 | 164 | 0.0.7 (2015-01-12): 165 | 166 | * Improvement: Add change log to README 167 | * Improvement: Add passing spec for new end of sentence abbreviation (EN) 168 | * Improvement: Add Roman numeral list support 169 | 170 | 0.0.6 (2015-01-11): 171 | 172 | * Improvement: Add rule for escaped newlines that include a space between the slash and
character 173 | * Improvement: Add Golden Rule #52 and code to make it pass 174 | 175 | 0.0.5 (2015-01-10): 176 | 177 | * Improvement: Make symbol substitution safer 178 | * Improvement: Refactor `process.rb` 179 | * Improvement: Update cleaner with escaped newline rules 180 | 181 | 0.0.4 (2015-01-10): 182 | 183 | * Improvement: Add `ConsecutiveForwardSlashRule` to cleaner 184 | * Improvement: Refactor `segmenter.rb` and `process.rb` 185 | 186 | 0.0.3 (2015-01-07): 187 | 188 | * Improvement: Add travis.yml 189 | * Improvement: Add Code Climate 190 | * Improvement: Update README 191 | 192 | 0.0.2 (2015-01-07): 193 | 194 | * Improvement: Major design refactor 195 | 196 | 0.0.1 (2015-01-07): 197 | 198 | * Initial Release -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/common.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative 'common/numbers' 4 | require_relative 'common/ellipsis' 5 | 6 | module PragmaticSegmenter 7 | module Languages 8 | module Common 9 | # This class holds the punctuation marks. 10 | Punctuations = ['。', '.', '.', '!', '!', '?', '?'].freeze 11 | 12 | # Defines the abbreviations for each language (if available) 13 | module Abbreviation 14 | ABBREVIATIONS = Set.new(['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']).freeze 15 | PREPOSITIVE_ABBREVIATIONS = Set.new(['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']).freeze 16 | NUMBER_ABBREVIATIONS = Set.new(['art', 'ext', 'no', 'nos', 'p', 'pp']).freeze 17 | end 18 | 19 | module Abbreviations 20 | # Rubular: http://rubular.com/r/EUbZCNfgei 21 | WithMultiplePeriodsAndEmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3') 22 | end 23 | 24 | # Rubular: http://rubular.com/r/G2opjedIm9 25 | GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯') 26 | 27 | FileFormatRule = Rule.new(/(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)/, '∯') 
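# e.g. FileFormatRule above protects a bare extension such as " .mp3 " from being read as a sentence-ending period.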
28 | 29 | SingleNewLineRule = Rule.new(/\n/, 'ȹ') 30 | 31 | module DoublePunctuationRules 32 | FirstRule = Rule.new(/\?!/, '☉') 33 | SecondRule = Rule.new(/!\?/, '☈') 34 | ThirdRule = Rule.new(/\?\?/, '☇') 35 | ForthRule = Rule.new(/!!/, '☄') 36 | 37 | All = [ FirstRule, SecondRule, ThirdRule, ForthRule ] 38 | end 39 | 40 | 41 | # Rubular: http://rubular.com/r/aXPUGm6fQh 42 | QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&') 43 | 44 | 45 | module ExclamationPointRules 46 | # Rubular: http://rubular.com/r/XS1XXFRfM2 47 | InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&') 48 | 49 | # Rubular: http://rubular.com/r/sl57YI8LkA 50 | BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&') 51 | 52 | # Rubular: http://rubular.com/r/f9zTjmkIPb 53 | MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&') 54 | 55 | All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ] 56 | end 57 | 58 | module SubSymbolsRules 59 | Period = Rule.new(/∯/, '.') 60 | ArabicComma = Rule.new(/♬/, '،') 61 | SemiColon = Rule.new(/♭/, ':') 62 | FullWidthPeriod = Rule.new(/&ᓰ&/, '。') 63 | SpecialPeriod = Rule.new(/&ᓱ&/, '.') 64 | FullWidthExclamation = Rule.new(/&ᓳ&/, '!') 65 | ExclamationPoint = Rule.new(/&ᓴ&/, '!') 66 | QuestionMark = Rule.new(/&ᓷ&/, '?') 67 | FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?') 68 | MixedDoubleQE = Rule.new(/☉/, '?!') 69 | MixedDoubleQQ = Rule.new(/☇/, '??') 70 | MixedDoubleEQ = Rule.new(/☈/, '!?') 71 | MixedDoubleEE = Rule.new(/☄/, '!!') 72 | LeftParens = Rule.new(/&✂&/, '(') 73 | RightParens = Rule.new(/&⌬&/, ')') 74 | TemporaryEndingPunctutation = Rule.new('ȸ', '') 75 | Newline = Rule.new(/ȹ/, "\n") 76 | 77 | All = [ Period, ArabicComma, 78 | SemiColon, FullWidthPeriod, 79 | SpecialPeriod, FullWidthExclamation, 80 | ExclamationPoint, QuestionMark, 81 | FullWidthQuestionMark, MixedDoubleQE, 82 | MixedDoubleQQ, MixedDoubleEQ, 83 | MixedDoubleEE, LeftParens, 84 | RightParens, TemporaryEndingPunctutation, 85 | Newline ] 86 | end 87 | 88 | 89 | module ReinsertEllipsisRules 90 | SubThreeConsecutivePeriod = Rule.new(/ƪ/, '...') 91 | SubThreeSpacePeriod = Rule.new(/♟/, ' . . . ') 92 | SubFourSpacePeriod = Rule.new(/♝/, '. . . .') 93 | SubTwoConsecutivePeriod = Rule.new(/☏/, '..') 94 | SubOnePeriod = Rule.new(/∮/, '.') 95 | 96 | All = [ SubThreeConsecutivePeriod, SubThreeSpacePeriod, 97 | SubFourSpacePeriod, SubTwoConsecutivePeriod, 98 | SubOnePeriod ] 99 | end 100 | 101 | ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ') 102 | 103 | SubSingleQuoteRule = Rule.new(/&⎋&/, "'") 104 | 105 | class AbbreviationReplacer < AbbreviationReplacer 106 | SENTENCE_STARTERS = %w( 107 | A Being Did For He How However I In It Millions More She That The 108 | There They We What When Where Who Why 109 | ).freeze 110 | end 111 | 112 | end 113 | end 114 | end 115 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/arabic_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Arabic, '(ar)' do 4 | 5 | context "Golden Rules" do 6 | it "Regular punctuation #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "سؤال وجواب: ماذا حدث بعد الانتخابات الايرانية؟ طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن. 
يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب.", language: "ar") 8 | expect(ps.segment).to eq(["سؤال وجواب:", "ماذا حدث بعد الانتخابات الايرانية؟", "طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن.", "يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب."]) 9 | end 10 | 11 | it "Abbreviations #002" do 12 | ps = PragmaticSegmenter::Segmenter.new(text: "وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى. وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير.", language: "ar") 13 | expect(ps.segment).to eq(["وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى.", "وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير."]) 14 | end 15 | 16 | it "Numbers and Dates #003" do 17 | ps = PragmaticSegmenter::Segmenter.new(text: "ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار. ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية.", language: "ar") 18 | expect(ps.segment).to eq(["ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار.", "ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية."]) 19 | end 20 | 21 | it "Time #004" do 22 | ps = PragmaticSegmenter::Segmenter.new(text: "الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز: رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه. العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي.", language: "ar") 23 | expect(ps.segment).to eq(["الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز:", "رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه.", "العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي."]) 24 | end 25 | 26 | it "Comma #005" do 27 | ps = PragmaticSegmenter::Segmenter.new(text: "عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب، زرعها عملاء الموساد كما تقول مصادر إسرائيلية، وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية، وبدأت مراسم الحداد عليه", language: "ar") 28 | expect(ps.segment).to eq(["عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب،", "زرعها عملاء الموساد كما تقول مصادر إسرائيلية،", "وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية،", "وبدأت مراسم الحداد عليه"]) 29 | end 30 | end 31 | 32 | # Thanks to Mahmoud Holmez for the Arabic test examples. 33 | describe '#segment' do 34 | it 'correctly segments text #001' do 35 | ps = PragmaticSegmenter::Segmenter.new(text: "سؤال وجواب: ماذا حدث بعد الانتخابات الايرانية؟ طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن. 
يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب.", language: 'ar') 36 | expect(ps.segment).to eq(["سؤال وجواب:", "ماذا حدث بعد الانتخابات الايرانية؟", "طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن.", "يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب."]) 37 | end 38 | 39 | it 'correctly segments text #002' do 40 | ps = PragmaticSegmenter::Segmenter.new(text: "وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى. وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير.", language: 'ar') 41 | expect(ps.segment).to eq(["وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى.", "وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير."]) 42 | end 43 | 44 | it 'correctly segments text #003' do 45 | ps = PragmaticSegmenter::Segmenter.new(text: "ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار. ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية.", language: 'ar') 46 | expect(ps.segment).to eq(["ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار.", "ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية."]) 47 | end 48 | 49 | it 'correctly segments text #004' do 50 | ps = PragmaticSegmenter::Segmenter.new(text: "الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز: رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه. 
العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي.", language: 'ar') 51 | expect(ps.segment).to eq(["الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز:", "رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه.", "العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي."]) 52 | end 53 | 54 | it 'correctly segments text #005' do 55 | ps = PragmaticSegmenter::Segmenter.new(text: "عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب، زرعها عملاء الموساد كما تقول مصادر إسرائيلية، وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية، وبدأت مراسم الحداد عليه", language: 'ar') 56 | expect(ps.segment).to eq(["عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب،", "زرعها عملاء الموساد كما تقول مصادر إسرائيلية،", "وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية،", "وبدأت مراسم الحداد عليه"]) 57 | end 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/danish.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Danish 6 | include Languages::Common 7 | 8 | class Cleaner < Cleaner 9 | def clean 10 | super 11 | clean_quotations 12 | end 13 | 14 | private 15 | 16 | def clean_quotations 17 | @text.gsub(/`/, "'") 18 | end 19 | 20 | def abbreviations 21 | [].freeze 22 | end 23 | end 24 | 25 | module Abbreviation 26 | ABBREVIATIONS = Set.new(['adm', 'adr', 'afd', 'afs', 'al', 'alm', 'alm', 'ang', 'ank', 'anm', 'ann', 'ansvh', 'apr', 'arr', 'ass', 'att', 'aud', 'aug', 'aut', 'bd', 'bdt', 'bet', 'bhk', 'bio', 'biol', 'bk', 'bl.a', 'bot', 'br', 'bto', 'ca', 'cal', 'cirk', 'cit', 'co', 'cpr-nr', 'cvr-nr', 'd.d', 'd.e', 'd.m', 'd.s', 'd.s.s', 'd.y', 'd.å', 'd.æ', 'da', 'dav', 'dec', 'def', 'del', 'dep', 'diam', 'din', 'dir', 'disp', 'distr', 'do', 'dobb', 'dr', 'ds', 'dvs', 'e.b', 'e.kr', 'e.l', 'e.o', 'e.v.t', 'eftf', 'eftm', 'egl', 'eks', 'eksam', 'ekskl', 'eksp', 'ekspl', 'el', 'emer', 'endv', 'eng', 'enk', 'etc', 'eur', 'evt', 'exam', 'f', 'f', 'f.eks', 'f.kr', 'f.m', 'f.n', 'f.o', 'f.o.m', 'f.s.v', 'f.t', 'f.v.t', 'f.å', 'fa', 'fakt', 'feb', 'fec', 'ff', 'fg', 'fg', 'fhv', 'fig', 'fl', 'flg', 'fm', 'fm', 'fmd', 'forb', 'foreg', 'foren', 'forf', 'forh', 'fork', 'form', 'forr', 'fors', 'forsk', 'forts', 'fp', 'fr', 'frk', 'fuldm', 'fuldm', 'fung', 'fung', 'fys', 'fær', 'g', 'g.d', 'g.m', 'gd', 'gdr', 'gg', 'gh', 'gl', 'gn', 'gns', 'gr', 'grdl', 'gross', 'h.a', 'h.c', 'hdl', 'henh', 'henv', 'hf', 'hft', 'hhv', 'hort', 'hosp', 'hpl', 'hr', 'hrs', 'hum', 'i', 'i.e', 'ib', 'ibid', 'if', 'ifm', 'ill', 'indb', 'indreg', 'ing', 'inkl', 'insp', 'instr', 'isl', 'istf', 'jan', 'jf', 'jfr', 'jnr', 'jr', 'jul', 'jun', 'jur', 'jvf', 'kal', 'kap', 'kat', 'kbh', 'kem', 'kgl', 'kin', 'kl', 'kld', 'km/t', 'knsp', 'komm', 'kons', 'korr', 'kp', 'kr', 'kr', 'kst', 'kt', 'ktr', 'kv', 'kvt', 'l', 'l.c', 'lab', 'lat', 'lb', 'lb.', 'lb.nr', 'lejl', 'lgd', 'lic', 'lign', 'lin', 'ling.merc', 'litt', 'lok', 'lrs', 'ltr', 'lø', 'm', 'm.a.o', 'm.fl.st', 'm.m', 'm/', 'ma', 'mag', 'maks', 'mar', 'mat', 'matr.nr', 'md', 'mdl', 'mdr', 'mdtl', 'med', 'medd', 'medflg', 'medl', 'merc', 'mezz', 'mf', 'mfl', 'mgl', 'mhp', 'mht', 'mi', 'mia', 'mio', 'ml', 'mods', 'modsv', 'modt', 'mr', 'mrk', 'mrs', 'ms', 'mul', 'mv', 'mvh', 'n', 'n.br', 'n.f', 
'nat', 'ned', 'nedenn', 'nedenst', 'nederl', 'nkr', 'nl', 'no', 'nord', 'nov', 'nr', 'nr', 'nto', 'nuv', 'o', 'o.a', 'o.fl.st', 'o.g', 'o.h', 'o.m.a', 'obj', 'obl', 'obs', 'odont', 'oecon', 'off', 'ofl', 'okt', 'omg', 'omr', 'omtr', 'on', 'op.cit', 'opg', 'opl', 'opr', 'org', 'orig', 'osfr', 'osv', 'ovenn', 'ovenst', 'overs', 'ovf', 'oz', 'p', 'p.a', 'p.b.v', 'p.c', 'p.m.v', 'p.p', 'p.s', 'p.t', 'p.v.a', 'p.v.c', 'par', 'partc', 'pass', 'pct', 'pd', 'pens', 'perf', 'pers', 'pg', 'pga', 'pgl', 'ph', 'ph.d', 'pharm', 'phil', 'pinx', 'pk', 'pkt', 'pl', 'pluskv', 'polit', 'polyt', 'port', 'pos', 'pp', 'pr', 'prc', 'priv', 'prod', 'prof', 'pron', 'præd', 'præf', 'præp', 'præs', 'præt', 'psych', 'pt', 'pæd', 'q.e.d', 'rad', 'red', 'ref', 'reg', 'regn', 'rel', 'rep', 'repr', 'rest', 'rk', 'russ', 's', 's.br', 's.d', 's.e', 's.f', 's.m.b.a', 's.u', 's.å', 's/', 'sa', 'sb', 'sc', 'scient', 'sek', 'sek', 'sekr', 'sem', 'sen', 'sep', 'sept', 'sg', 'sign', 'sj', 'skr', 'skt', 'slutn', 'sml', 'smp', 'sms', 'smst', 'soc', 'soc', 'sort', 'sp', 'spec', 'spm', 'spr', 'spsk', 'st', 'stk', 'str', 'stud', 'subj', 'subst', 'suff', 'sup', 'suppl', 'sv', 'såk', 'sædv', 'sø', 't', 't.h', 't.o.m', 't.v', 'tab', 'td', 'tdl', 'tdr', 'techn', 'tekn', 'temp', 'th', 'ti', 'tidl', 'tilf', 'tilh', 'till', 'tilsv', 'tjg', 'tlf', 'tlgr', 'to', 'tr', 'trp', 'tv', 'ty', 'u', 'u.p', 'u.st', 'u.å', 'uafh', 'ubf', 'ubøj', 'udb', 'udbet', 'udd', 'udg', 'uds', 'ugtl', 'ulin', 'ult', 'undt', 'univ', 'v.f', 'var', 'vb', 'vbsb', 'vedk', 'vedl', 'vedr', 'vejl', 'vh', 'vol', 'vs', 'vsa', 'vær', 'zool', 'årg', 'årh', 'årl', 'ø.f', 'øv', 'øvr']).freeze 27 | NUMBER_ABBREVIATIONS = Set.new(['nr', 's']).freeze 28 | PREPOSITIVE_ABBREVIATIONS = Set.new(['adm', 'skt', 'dr', 'hr', 'fru', 'st']).freeze 29 | end 30 | 31 | # This handles the case where a dot is used to denote and ordinal (5. Juni) 32 | module Numbers 33 | # Rubular: http://rubular.com/r/hZxoyQwKT1 34 | NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯') 35 | 36 | # Rubular: http://rubular.com/r/ityNMwdghj 37 | NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯') 38 | 39 | All = [ 40 | Common::Numbers::All, 41 | NumberPeriodSpaceRule, 42 | NegativeNumberPeriodSpaceRule 43 | ] 44 | end 45 | 46 | MONTHS = ['Januar', 'Februar', 'Marts', 'April', 'Maj', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'December'].freeze 47 | 48 | class AbbreviationReplacer < AbbreviationReplacer 49 | SENTENCE_STARTERS = %w( 50 | At De Dem Den Der Det Du En Et For Få Gjorde Han Hun Hvad Hvem Hvilke 51 | Hvor Hvordan Hvorfor Hvorledes Hvornår I Jeg Mange Vi Være 52 | ).freeze 53 | 54 | def replace_abbreviation_as_sentence_boundary(txt) 55 | # As we are being conservative and keeping ambiguous 56 | # sentence boundaries as one sentence instead of 57 | # splitting into two, we can split at words that 58 | # we know for certain never follow these abbreviations. 59 | # Some might say that the set of words that follow an 60 | # abbreviation such as U.S. (i.e. U.S. Government) is smaller than 61 | # the set of words that could start a sentence and 62 | # never follow U.S. However, we are being conservative 63 | # and not splitting by default, so we need to look for places 64 | # where we definitely can split. 
Obviously SENTENCE_STARTERS 65 | # will never cover all cases, but as the gem is named 66 | # 'Pragmatic Segmenter' we need to be pragmatic 67 | # and try to cover the words that most often start a 68 | # sentence but could never follow one of the abbreviations below. 69 | 70 | @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word| 71 | escaped = Regexp.escape(word) 72 | txt.gsub!(/U∯S∯\s#{escaped}\s/, "U∯S\.\s#{escaped}\s") 73 | txt.gsub!(/U\.S∯\s#{escaped}\s/, "U\.S\.\s#{escaped}\s") 74 | txt.gsub!(/U∯K∯\s#{escaped}\s/, "U∯K\.\s#{escaped}\s") 75 | txt.gsub!(/U\.K∯\s#{escaped}\s/, "U\.K\.\s#{escaped}\s") 76 | txt.gsub!(/E∯U∯\s#{escaped}\s/, "E∯U\.\s#{escaped}\s") 77 | txt.gsub!(/E\.U∯\s#{escaped}\s/, "E\.U\.\s#{escaped}\s") 78 | txt.gsub!(/U∯S∯A∯\s#{escaped}\s/, "U∯S∯A\.\s#{escaped}\s") 79 | txt.gsub!(/U\.S\.A∯\s#{escaped}\s/, "U\.S\.A\.\s#{escaped}\s") 80 | txt.gsub!(/I∯\s#{escaped}\s/, "I\.\s#{escaped}\s") 81 | txt.gsub!(/s.u∯\s#{escaped}\s/, "s\.u\.\s#{escaped}\s") 82 | txt.gsub!(/S.U∯\s#{escaped}\s/, "S\.U\.\s#{escaped}\s") 83 | end 84 | txt 85 | end 86 | end 87 | end 88 | end 89 | end 90 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/list.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | module PragmaticSegmenter 5 | # This class searches for a list within a string and adds 6 | # newlines before each list item. 7 | class List 8 | ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx) 9 | LATIN_NUMERALS = ('a'..'z').to_a 10 | 11 | # Rubular: http://rubular.com/r/XcpaJKH0sz 12 | ALPHABETICAL_LIST_WITH_PERIODS = 13 | /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/ 14 | 15 | # Rubular: http://rubular.com/r/Gu5rQapywf 16 | ALPHABETICAL_LIST_WITH_PARENS = 17 | /(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i 18 | 19 | SubstituteListPeriodRule = Rule.new(/♨/, '∯') 20 | ListMarkerRule = Rule.new(/☝/, '') 21 | 22 | # Rubular: http://rubular.com/r/Wv4qLdoPx7 23 | SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d{1,2}♨)/, "\r") 24 | 25 | # Rubular: http://rubular.com/r/AizHXC6HxK 26 | SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}♨)/, "\r") 27 | 28 | # Rubular: http://rubular.com/r/GE5q6yID2j 29 | SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}☝)/, "\r") 30 | 31 | NUMBERED_LIST_REGEX_1 = 32 | /\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))/ 33 | NUMBERED_LIST_REGEX_2 = 34 | /(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))/ 35 | NUMBERED_LIST_PARENS_REGEX = /\d{1,2}(?=\)\s)/ 36 | 37 | # Rubular: http://rubular.com/r/NsNFSqrNvJ 38 | EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX = 39 | /\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i 40 | 41 | # Rubular: http://rubular.com/r/wMpnVedEIb 42 | ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX = 43 | /(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\./i 44 | 45 | # Rubular: 
http://rubular.com/r/GcnmQt4a3I 46 | ROMAN_NUMERALS_IN_PARENTHESES = 47 | /\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])/ 48 | 49 | attr_reader :text 50 | def initialize(text:) 51 | @text = text.dup 52 | end 53 | 54 | def add_line_break 55 | format_alphabetical_lists 56 | format_roman_numeral_lists 57 | format_numbered_list_with_periods 58 | format_numbered_list_with_parens 59 | end 60 | 61 | def replace_parens 62 | text.gsub!(ROMAN_NUMERALS_IN_PARENTHESES, '&✂&\1&⌬&'.freeze) 63 | text 64 | end 65 | 66 | private 67 | 68 | def format_numbered_list_with_parens 69 | replace_parens_in_numbered_list 70 | add_line_breaks_for_numbered_list_with_parens 71 | Rule.apply(@text, ListMarkerRule) 72 | end 73 | 74 | def format_numbered_list_with_periods 75 | replace_periods_in_numbered_list 76 | add_line_breaks_for_numbered_list_with_periods 77 | Rule.apply(@text, SubstituteListPeriodRule) 78 | end 79 | 80 | def format_alphabetical_lists 81 | add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false) 82 | add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false) 83 | end 84 | 85 | def format_roman_numeral_lists 86 | add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: true) 87 | add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: true) 88 | end 89 | 90 | def replace_periods_in_numbered_list 91 | scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', strip: true) 92 | end 93 | 94 | def add_line_breaks_for_numbered_list_with_periods 95 | if @text.include?('♨') && @text !~ /♨.+\n.+♨|♨.+\r.+♨/ && @text !~ /for\s\d{1,2}♨\s[a-z]/ 96 | Rule.apply(@text, SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule) 97 | end 98 | end 99 | 100 | def replace_parens_in_numbered_list 101 | scan_lists( 102 | NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝') 103 | scan_lists(NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝') 104 | end 105 | 106 | def add_line_breaks_for_numbered_list_with_parens 107 | if @text.include?('☝') && @text !~ /☝.+\n.+☝|☝.+\r.+☝/ 108 | Rule.apply(@text, SpaceBetweenListItemsThirdRule) 109 | end 110 | end 111 | 112 | def scan_lists(regex1, regex2, replacement, strip: false) 113 | list_array = @text.scan(regex1).map(&:to_i) 114 | list_array.each_with_index do |a, i| 115 | next unless (a + 1).eql?(list_array[i + 1]) || 116 | (a - 1).eql?(list_array[i - 1]) || 117 | (a.eql?(0) && list_array[i - 1].eql?(9)) || 118 | (a.eql?(9) && list_array[i + 1].eql?(0)) 119 | substitute_found_list_items(regex2, a, strip, replacement) 120 | end 121 | end 122 | 123 | def substitute_found_list_items(regex, a, strip, replacement) 124 | @text.gsub!(regex).with_index do |m| 125 | if a.to_s.eql?(strip ? m.strip.chop : m) 126 | "#{Regexp.escape(a.to_s)}" + replacement 127 | else 128 | "#{m}" 129 | end 130 | end 131 | end 132 | 133 | def add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false) 134 | iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, roman_numeral: roman_numeral) 135 | end 136 | 137 | def add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false) 138 | iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS, 139 | parens: true, 140 | roman_numeral: roman_numeral) 141 | end 142 | 143 | def replace_alphabet_list(a) 144 | @text.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m| 145 | a.eql?(m.chomp('.')) ? 
"\r#{Regexp.escape(a.to_s)}∯" : "#{m}" 146 | end 147 | end 148 | 149 | def replace_alphabet_list_parens(a) 150 | @text.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m| 151 | if m.include?('(') 152 | a.eql?(Unicode::downcase(m.dup).gsub!(/\(/, '')) ? "\r&✂&#{Regexp.escape(m.gsub!(/\(/, ''))}" : "#{m}" 153 | else 154 | a.eql?(Unicode::downcase(m.dup)) ? "\r#{Regexp.escape(m)}" : "#{m}" 155 | end 156 | end 157 | end 158 | 159 | def replace_correct_alphabet_list(a, parens) 160 | if parens 161 | replace_alphabet_list_parens(a) 162 | else 163 | replace_alphabet_list(a) 164 | end 165 | end 166 | 167 | def last_array_item_replacement(a, i, alphabet, list_array, parens) 168 | return if alphabet & list_array == [] || 169 | !alphabet.include?(list_array[i - 1]) || 170 | !alphabet.include?(a) 171 | return if (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1 172 | replace_correct_alphabet_list(a, parens) 173 | end 174 | 175 | def other_items_replacement(a, i, alphabet, list_array, parens) 176 | return if alphabet & list_array == [] || 177 | !alphabet.include?(list_array[i - 1]) || 178 | !alphabet.include?(a) || 179 | !alphabet.include?(list_array[i + 1]) 180 | return if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 && 181 | (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1 182 | replace_correct_alphabet_list(a, parens) 183 | end 184 | 185 | def iterate_alphabet_array(regex, parens: false, roman_numeral: false) 186 | list_array = @text.scan(regex).map { |s| Unicode::downcase(s) } 187 | if roman_numeral 188 | alphabet = ROMAN_NUMERALS 189 | else 190 | alphabet = LATIN_NUMERALS 191 | end 192 | list_array.delete_if { |item| !alphabet.any? { |a| a.include?(item) } } 193 | list_array.each_with_index do |a, i| 194 | if i.eql?(list_array.length - 1) 195 | last_array_item_replacement(a, i, alphabet, list_array, parens) 196 | else 197 | other_items_replacement(a, i, alphabet, list_array, parens) 198 | end 199 | end 200 | end 201 | end 202 | end 203 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/italian_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Italian, "(it)" do 4 | 5 | context "Golden Rules" do 6 | it "Abbreviations #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "Salve Sig.ra Mengoni! Come sta oggi?", language: "it") 8 | expect(ps.segment).to eq(["Salve Sig.ra Mengoni!", "Come sta oggi?"]) 9 | end 10 | 11 | it "Quotations #002" do 12 | ps = PragmaticSegmenter::Segmenter.new(text: "Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.».", language: "it") 13 | expect(ps.segment).to eq(["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.»."]) 14 | end 15 | 16 | it "Numbers #003" do 17 | ps = PragmaticSegmenter::Segmenter.new(text: "La casa costa 170.500.000,00€!", language: "it") 18 | expect(ps.segment).to eq(["La casa costa 170.500.000,00€!"]) 19 | end 20 | end 21 | 22 | # Thanks to Davide Fornelli for the Italian test examples. 23 | describe '#segment' do 24 | 25 | it 'correctly segments text #001' do 26 | ps = PragmaticSegmenter::Segmenter.new(text: "Salve Sig.ra Mengoni! Come sta oggi?", language: 'it') 27 | expect(ps.segment).to eq(["Salve Sig.ra Mengoni!", "Come sta oggi?"]) 28 | end 29 | 30 | it 'correctly segments text #002' do 31 | ps = PragmaticSegmenter::Segmenter.new(text: "Buongiorno! Sono l'Ing. Mengozzi. 
È presente l'Avv. Cassioni?", language: 'it') 32 | expect(ps.segment).to eq(["Buongiorno!", "Sono l'Ing. Mengozzi.", "È presente l'Avv. Cassioni?"]) 33 | end 34 | 35 | it 'correctly segments text #003' do 36 | ps = PragmaticSegmenter::Segmenter.new(text: "Mi fissi un appuntamento per mar. 23 Nov.. Grazie.", language: 'it') 37 | expect(ps.segment).to eq(["Mi fissi un appuntamento per mar. 23 Nov..", "Grazie."]) 38 | end 39 | 40 | it 'correctly segments text #004' do 41 | ps = PragmaticSegmenter::Segmenter.new(text: "Ecco il mio tel.:01234567. Mi saluti la Sig.na Manelli. Arrivederci.", language: 'it') 42 | expect(ps.segment).to eq(["Ecco il mio tel.:01234567.", "Mi saluti la Sig.na Manelli.", "Arrivederci."]) 43 | end 44 | 45 | it 'correctly segments text #005' do 46 | ps = PragmaticSegmenter::Segmenter.new(text: "La centrale meteor. si è guastata. Gli idraul. son dovuti andare a sistemarla.", language: 'it') 47 | expect(ps.segment).to eq(["La centrale meteor. si è guastata.", "Gli idraul. son dovuti andare a sistemarla."]) 48 | end 49 | 50 | it 'correctly segments text #006' do 51 | ps = PragmaticSegmenter::Segmenter.new(text: "Hanno creato un algoritmo allo st. d. arte. Si ringrazia lo psicol. Serenti.", language: 'it') 52 | expect(ps.segment).to eq(["Hanno creato un algoritmo allo st. d. arte.", "Si ringrazia lo psicol. Serenti."]) 53 | end 54 | 55 | it 'correctly segments text #007' do 56 | ps = PragmaticSegmenter::Segmenter.new(text: "Chiamate il V.Cte. delle F.P., adesso!", language: 'it') 57 | expect(ps.segment).to eq(["Chiamate il V.Cte. delle F.P., adesso!"]) 58 | end 59 | 60 | it 'correctly segments text #008' do 61 | ps = PragmaticSegmenter::Segmenter.new(text: "Giancarlo ha sostenuto l'esame di econ. az..", language: 'it') 62 | expect(ps.segment).to eq(["Giancarlo ha sostenuto l'esame di econ. az.."]) 63 | end 64 | 65 | it 'correctly segments text #009' do 66 | ps = PragmaticSegmenter::Segmenter.new(text: "Stava viaggiando a 90 km/h verso la provincia di TR quando il Dott. Mesini ha sentito un rumore e si fermò!", language: 'it') 67 | expect(ps.segment).to eq(["Stava viaggiando a 90 km/h verso la provincia di TR quando il Dott. Mesini ha sentito un rumore e si fermò!"]) 68 | end 69 | 70 | it 'correctly segments text #010' do 71 | ps = PragmaticSegmenter::Segmenter.new(text: "Egregio Dir. Amm., le faccio sapere che l'ascensore non funziona.", language: 'it') 72 | expect(ps.segment).to eq(["Egregio Dir. Amm., le faccio sapere che l'ascensore non funziona."]) 73 | end 74 | 75 | it 'correctly segments text #011' do 76 | ps = PragmaticSegmenter::Segmenter.new(text: "Stava mangiando e/o dormendo.", language: 'it') 77 | expect(ps.segment).to eq(["Stava mangiando e/o dormendo."]) 78 | end 79 | 80 | it 'correctly segments text #012' do 81 | ps = PragmaticSegmenter::Segmenter.new(text: "Ricordatevi che dom 25 Set. sarà il compleanno di Maria; dovremo darle un regalo.", language: 'it') 82 | expect(ps.segment).to eq(["Ricordatevi che dom 25 Set. sarà il compleanno di Maria; dovremo darle un regalo."]) 83 | end 84 | 85 | it 'correctly segments text #013' do 86 | ps = PragmaticSegmenter::Segmenter.new(text: "La politica è quella della austerità; quindi verranno fatti tagli agli sprechi.", language: 'it') 87 | expect(ps.segment).to eq(["La politica è quella della austerità; quindi verranno fatti tagli agli sprechi."]) 88 | end 89 | 90 | it 'correctly segments text #014' do 91 | ps = PragmaticSegmenter::Segmenter.new(text: "Nel tribunale, l'Avv. 
Fabrizi ha urlato \"Io, l'illustrissimo Fabrizi, vi si oppone!\".", language: 'it') 92 | expect(ps.segment).to eq(["Nel tribunale, l'Avv. Fabrizi ha urlato \"Io, l'illustrissimo Fabrizi, vi si oppone!\"."]) 93 | end 94 | 95 | it 'correctly segments text #015' do 96 | ps = PragmaticSegmenter::Segmenter.new(text: "Le parti fisiche di un computer (ad es. RAM, CPU, tastiera, mouse, etc.) sono definiti HW.", language: 'it') 97 | expect(ps.segment).to eq(["Le parti fisiche di un computer (ad es. RAM, CPU, tastiera, mouse, etc.) sono definiti HW."]) 98 | end 99 | 100 | it 'correctly segments text #016' do 101 | ps = PragmaticSegmenter::Segmenter.new(text: "La parola 'casa' è sinonimo di abitazione.", language: 'it') 102 | expect(ps.segment).to eq(["La parola 'casa' è sinonimo di abitazione."]) 103 | end 104 | 105 | it 'correctly segments text #017' do 106 | ps = PragmaticSegmenter::Segmenter.new(text: "La \"Mulino Bianco\" fa alimentari pre-confezionati.", language: 'it') 107 | expect(ps.segment).to eq(["La \"Mulino Bianco\" fa alimentari pre-confezionati."]) 108 | end 109 | 110 | it 'correctly segments text #018' do 111 | ps = PragmaticSegmenter::Segmenter.new(text: "\"Ei fu. Siccome immobile / dato il mortal sospiro / stette la spoglia immemore / orba di tanto spiro / [...]\" (Manzoni).", language: 'it') 112 | expect(ps.segment).to eq(["\"Ei fu. Siccome immobile / dato il mortal sospiro / stette la spoglia immemore / orba di tanto spiro / [...]\" (Manzoni)."]) 113 | end 114 | 115 | it 'correctly segments text #019' do 116 | ps = PragmaticSegmenter::Segmenter.new(text: "Una lettera si può iniziare in questo modo «Il/la sottoscritto/a ... nato/a a ...».", language: 'it') 117 | expect(ps.segment).to eq(["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a ... 
nato/a a ...»."]) 118 | end 119 | 120 | it 'correctly segments text #020' do 121 | ps = PragmaticSegmenter::Segmenter.new(text: "Per casa, in uno degli esercizi per i bambini c'era \"3 + (14/7) = 5\"", language: 'it') 122 | expect(ps.segment).to eq(["Per casa, in uno degli esercizi per i bambini c'era \"3 + (14/7) = 5\""]) 123 | end 124 | 125 | it 'correctly segments text #021' do 126 | ps = PragmaticSegmenter::Segmenter.new(text: "Ai bambini è stato chiesto di fare \"4:2*2\"", language: 'it') 127 | expect(ps.segment).to eq(["Ai bambini è stato chiesto di fare \"4:2*2\""]) 128 | end 129 | 130 | it 'correctly segments text #022' do 131 | ps = PragmaticSegmenter::Segmenter.new(text: "La maestra esclamò: \"Bambini, quanto fa '2/3 + 4/3?'\".", language: 'it') 132 | expect(ps.segment).to eq(["La maestra esclamò: \"Bambini, quanto fa \'2/3 + 4/3?\'\"."]) 133 | end 134 | 135 | it 'correctly segments text #023' do 136 | ps = PragmaticSegmenter::Segmenter.new(text: "Il motore misurava 120°C.", language: 'it') 137 | expect(ps.segment).to eq(["Il motore misurava 120°C."]) 138 | end 139 | 140 | it 'correctly segments text #024' do 141 | ps = PragmaticSegmenter::Segmenter.new(text: "Il volume era di 3m³.", language: 'it') 142 | expect(ps.segment).to eq(["Il volume era di 3m³."]) 143 | end 144 | 145 | it 'correctly segments text #025' do 146 | ps = PragmaticSegmenter::Segmenter.new(text: "La stanza misurava 20m².", language: 'it') 147 | expect(ps.segment).to eq(["La stanza misurava 20m²."]) 148 | end 149 | 150 | it 'correctly segments text #026' do 151 | ps = PragmaticSegmenter::Segmenter.new(text: "1°C corrisponde a 33.8°F.", language: 'it') 152 | expect(ps.segment).to eq(["1°C corrisponde a 33.8°F."]) 153 | end 154 | 155 | it 'correctly segments text #027' do 156 | ps = PragmaticSegmenter::Segmenter.new(text: "Oggi è il 27-10-14.", language: 'it') 157 | expect(ps.segment).to eq(["Oggi è il 27-10-14."]) 158 | end 159 | 160 | it 'correctly segments text #028' do 161 | ps = PragmaticSegmenter::Segmenter.new(text: "La casa costa 170.500.000,00€!", language: 'it') 162 | expect(ps.segment).to eq(["La casa costa 170.500.000,00€!"]) 163 | end 164 | 165 | it 'correctly segments text #029' do 166 | ps = PragmaticSegmenter::Segmenter.new(text: "Il corridore 103 è arrivato 4°.", language: 'it') 167 | expect(ps.segment).to eq(["Il corridore 103 è arrivato 4°."]) 168 | end 169 | 170 | it 'correctly segments text #030' do 171 | ps = PragmaticSegmenter::Segmenter.new(text: "Oggi è il 27/10/2014.", language: 'it') 172 | expect(ps.segment).to eq(["Oggi è il 27/10/2014."]) 173 | end 174 | 175 | it 'correctly segments text #031' do 176 | ps = PragmaticSegmenter::Segmenter.new(text: "Ecco l'elenco: 1.gelato, 2.carne, 3.riso.", language: 'it') 177 | expect(ps.segment).to eq(["Ecco l'elenco: 1.gelato, 2.carne, 3.riso."]) 178 | end 179 | 180 | it 'correctly segments text #032' do 181 | ps = PragmaticSegmenter::Segmenter.new(text: "Devi comprare : 1)pesce 2)sale.", language: 'it') 182 | expect(ps.segment).to eq(["Devi comprare : 1)pesce 2)sale."]) 183 | end 184 | 185 | it 'correctly segments text #033' do 186 | ps = PragmaticSegmenter::Segmenter.new(text: "La macchina viaggiava a 100 km/h.", language: 'it') 187 | expect(ps.segment).to eq(["La macchina viaggiava a 100 km/h."]) 188 | end 189 | end 190 | end 191 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/russian_spec.rb: -------------------------------------------------------------------------------- 1 | 
require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Russian, "(ru)" do 4 | 5 | context "Golden Rules" do 6 | it "Abbreviations #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5 куб.м.", language: "ru") 8 | expect(ps.segment).to eq(["Объем составляет 5 куб.м."]) 9 | end 10 | 11 | it "Quotations #002" do 12 | ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?».", language: "ru") 13 | expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»."]) 14 | end 15 | 16 | it "Numbers #003" do 17 | ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27.10.14", language: "ru") 18 | expect(ps.segment).to eq(["Сегодня 27.10.14"]) 19 | end 20 | end 21 | 22 | # Thanks to Anastasiia Tsvitailo for the Russian test examples. 23 | describe '#segment' do 24 | it 'correctly segments text #001' do 25 | ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?».", language: 'ru') 26 | expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»."]) 27 | end 28 | 29 | it 'correctly segments text #002' do 30 | ps = PragmaticSegmenter::Segmenter.new(text: "«Я приду поздно», — сказал Андрей.", language: 'ru') 31 | expect(ps.segment).to eq(["«Я приду поздно», — сказал Андрей."]) 32 | end 33 | 34 | it 'correctly segments text #003' do 35 | ps = PragmaticSegmenter::Segmenter.new(text: "«К чему ты готовишься? – спросила мама. – Завтра ведь выходной».", language: 'ru') 36 | expect(ps.segment).to eq(["«К чему ты готовишься? – спросила мама. – Завтра ведь выходной»."]) 37 | end 38 | 39 | it 'correctly segments text #004' do 40 | ps = PragmaticSegmenter::Segmenter.new(text: "По словам Пушкина, «Привычка свыше дана, замена счастью она».", language: 'ru') 41 | expect(ps.segment).to eq(["По словам Пушкина, «Привычка свыше дана, замена счастью она»."]) 42 | end 43 | 44 | it 'correctly segments text #005' do 45 | ps = PragmaticSegmenter::Segmenter.new(text: "Он сказал: «Я очень устал», и сразу же замолчал.", language: 'ru') 46 | expect(ps.segment).to eq(["Он сказал: «Я очень устал», и сразу же замолчал."]) 47 | end 48 | 49 | it 'correctly segments text #006' do 50 | ps = PragmaticSegmenter::Segmenter.new(text: "Мне стало как-то ужасно грустно в это мгновение; однако что-то похожее на смех зашевелилось в душе моей.", language: 'ru') 51 | expect(ps.segment).to eq(["Мне стало как-то ужасно грустно в это мгновение; однако что-то похожее на смех зашевелилось в душе моей."]) 52 | end 53 | 54 | it 'correctly segments text #007' do 55 | ps = PragmaticSegmenter::Segmenter.new(text: "Шухов как был в ватных брюках, не снятых на ночь (повыше левого колена их тоже был пришит затасканный, погрязневший лоскут, и на нем выведен черной, уже поблекшей краской номер Щ-854), надел телогрейку…", language: 'ru') 56 | expect(ps.segment).to eq(["Шухов как был в ватных брюках, не снятых на ночь (повыше левого колена их тоже был пришит затасканный, погрязневший лоскут, и на нем выведен черной, уже поблекшей краской номер Щ-854), надел телогрейку…"]) 57 | end 58 | 59 | it 'correctly segments text #008' do 60 | ps = PragmaticSegmenter::Segmenter.new(text: "Слово «дом» является синонимом жилища", language: 'ru') 61 | expect(ps.segment).to eq(["Слово «дом» является синонимом жилища"]) 62 | end 63 | 64 | it 'correctly segments text #009' do 65 | ps = PragmaticSegmenter::Segmenter.new(text: "В Санкт-Петербург на гастроли приехал театр «Современник»", language: 'ru') 66 | 
expect(ps.segment).to eq(["В Санкт-Петербург на гастроли приехал театр «Современник»"]) 67 | end 68 | 69 | it 'correctly segments text #010' do 70 | ps = PragmaticSegmenter::Segmenter.new(text: "Машина едет со скоростью 100 км/ч.", language: 'ru') 71 | expect(ps.segment).to eq(["Машина едет со скоростью 100 км/ч."]) 72 | end 73 | 74 | it 'correctly segments text #011' do 75 | ps = PragmaticSegmenter::Segmenter.new(text: "Я поем и/или лягу спать.", language: 'ru') 76 | expect(ps.segment).to eq(["Я поем и/или лягу спать."]) 77 | end 78 | 79 | it 'correctly segments text #012' do 80 | ps = PragmaticSegmenter::Segmenter.new(text: "Он не мог справиться с примером \"3 + (14:7) = 5\"", language: 'ru') 81 | expect(ps.segment).to eq(["Он не мог справиться с примером \"3 + (14:7) = 5\""]) 82 | end 83 | 84 | it 'correctly segments text #013' do 85 | ps = PragmaticSegmenter::Segmenter.new(text: "Вот список: 1.мороженое, 2.мясо, 3.рис.", language: 'ru') 86 | expect(ps.segment).to eq(["Вот список: 1.мороженое, 2.мясо, 3.рис."]) 87 | end 88 | 89 | it 'correctly segments text #014' do 90 | ps = PragmaticSegmenter::Segmenter.new(text: "Квартира 234 находится на 4-ом этаже.", language: 'ru') 91 | expect(ps.segment).to eq(["Квартира 234 находится на 4-ом этаже."]) 92 | end 93 | 94 | it 'correctly segments text #015' do 95 | ps = PragmaticSegmenter::Segmenter.new(text: "В это время года температура может подниматься до 40°C.", language: 'ru') 96 | expect(ps.segment).to eq(["В это время года температура может подниматься до 40°C."]) 97 | end 98 | 99 | it 'correctly segments text #016' do 100 | ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5м³.", language: 'ru') 101 | expect(ps.segment).to eq(["Объем составляет 5м³."]) 102 | end 103 | 104 | it 'correctly segments text #017' do 105 | ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5 куб.м.", language: 'ru') 106 | expect(ps.segment).to eq(["Объем составляет 5 куб.м."]) 107 | end 108 | 109 | it 'correctly segments text #018' do 110 | ps = PragmaticSegmenter::Segmenter.new(text: "Площадь комнаты 14м².", language: 'ru') 111 | expect(ps.segment).to eq(["Площадь комнаты 14м²."]) 112 | end 113 | 114 | it 'correctly segments text #019' do 115 | ps = PragmaticSegmenter::Segmenter.new(text: "Площадь комнаты 14 кв.м.", language: 'ru') 116 | expect(ps.segment).to eq(["Площадь комнаты 14 кв.м."]) 117 | end 118 | 119 | it 'correctly segments text #020' do 120 | ps = PragmaticSegmenter::Segmenter.new(text: "1°C соответствует 33.8°F.", language: 'ru') 121 | expect(ps.segment).to eq(["1°C соответствует 33.8°F."]) 122 | end 123 | 124 | it 'correctly segments text #021' do 125 | ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27.10.14", language: 'ru') 126 | expect(ps.segment).to eq(["Сегодня 27.10.14"]) 127 | end 128 | 129 | it 'correctly segments text #022' do 130 | ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27 октября 2014 года.", language: 'ru') 131 | expect(ps.segment).to eq(["Сегодня 27 октября 2014 года."]) 132 | end 133 | 134 | it 'correctly segments text #023' do 135 | ps = PragmaticSegmenter::Segmenter.new(text: "Эта машина стоит 150 000 дол.!", language: 'ru') 136 | expect(ps.segment).to eq(["Эта машина стоит 150 000 дол.!"]) 137 | end 138 | 139 | it 'correctly segments text #024' do 140 | ps = PragmaticSegmenter::Segmenter.new(text: "Эта машина стоит $150 000!", language: 'ru') 141 | expect(ps.segment).to eq(["Эта машина стоит $150 000!"]) 142 | end 143 | 144 | it 'correctly segments text #025' do 145 | ps = 
PragmaticSegmenter::Segmenter.new(text: "Вот номер моего телефона: +39045969798. Передавайте привет г-ну Шапочкину. До свидания.", language: 'ru') 146 | expect(ps.segment).to eq(["Вот номер моего телефона: +39045969798.", "Передавайте привет г-ну Шапочкину.", "До свидания."]) 147 | end 148 | 149 | it 'correctly segments text #026' do 150 | ps = PragmaticSegmenter::Segmenter.new(text: "Постойте, разве можно указывать цены в у.е.!", language: 'ru') 151 | expect(ps.segment).to eq(["Постойте, разве можно указывать цены в у.е.!"]) 152 | end 153 | 154 | it 'correctly segments text #027' do 155 | ps = PragmaticSegmenter::Segmenter.new(text: "Едем на скорости 90 км/ч в сторону пгт. Брагиновка, о котором мы так много слышали по ТВ!", language: 'ru') 156 | expect(ps.segment).to eq(["Едем на скорости 90 км/ч в сторону пгт. Брагиновка, о котором мы так много слышали по ТВ!"]) 157 | end 158 | 159 | it 'correctly segments text #028' do 160 | ps = PragmaticSegmenter::Segmenter.new(text: "Д-р ветеринарных наук А. И. Семенов и пр. выступали на этом семинаре.", language: 'ru') 161 | expect(ps.segment).to eq(["Д-р ветеринарных наук А. И. Семенов и пр. выступали на этом семинаре."]) 162 | end 163 | 164 | it 'correctly segments text #029' do 165 | ps = PragmaticSegmenter::Segmenter.new(text: "Уважаемый проф. Семенов! Просьба до 20.10 сдать отчет на кафедру.", language: 'ru') 166 | expect(ps.segment).to eq(["Уважаемый проф. Семенов!", "Просьба до 20.10 сдать отчет на кафедру."]) 167 | end 168 | 169 | it 'correctly segments text #030' do 170 | ps = PragmaticSegmenter::Segmenter.new(text: "Первоначальная стоимость этого комплекта 30 долл., но сейчас действует скидка. Предъявите дисконтную карту, пожалуйста!", language: 'ru') 171 | expect(ps.segment).to eq(["Первоначальная стоимость этого комплекта 30 долл., но сейчас действует скидка.", "Предъявите дисконтную карту, пожалуйста!"]) 172 | end 173 | 174 | it 'correctly segments text #031' do 175 | ps = PragmaticSegmenter::Segmenter.new(text: "Виктор съел пол-лимона и ушел по-английски из дома на ул. 1 Мая.", language: 'ru') 176 | expect(ps.segment).to eq(["Виктор съел пол-лимона и ушел по-английски из дома на ул. 1 Мая."]) 177 | end 178 | 179 | it 'correctly segments text #032' do 180 | ps = PragmaticSegmenter::Segmenter.new(text: "Напоминаю Вам, что 25.10 день рождения у Маши К., нужно будет купить ей подарок.", language: 'ru') 181 | expect(ps.segment).to eq(["Напоминаю Вам, что 25.10 день рождения у Маши К., нужно будет купить ей подарок."]) 182 | end 183 | 184 | it 'correctly segments text #033' do 185 | ps = PragmaticSegmenter::Segmenter.new(text: "В 2010-2012 гг. Виктор посещал г. Волгоград неоднократно.", language: 'ru') 186 | expect(ps.segment).to eq(["В 2010-2012 гг. Виктор посещал г. Волгоград неоднократно."]) 187 | end 188 | 189 | it 'correctly segments text #034' do 190 | ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?»", language: 'ru') 191 | expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»"]) 192 | end 193 | 194 | it 'correctly segments text #035' do 195 | ps = PragmaticSegmenter::Segmenter.new(text: "Кв. 234 находится на 4 этаже.", language: 'ru') 196 | expect(ps.segment).to eq(["Кв. 
234 находится на 4 этаже."]) 197 | end 198 | 199 | it 'correctly segments text #036' do 200 | ps = PragmaticSegmenter::Segmenter.new(text: "В это время года температура может подниматься до 40°C.", language: 'ru') 201 | expect(ps.segment).to eq(["В это время года температура может подниматься до 40°C."]) 202 | end 203 | 204 | it 'correctly segments text #037' do 205 | ps = PragmaticSegmenter::Segmenter.new(text: "Нужно купить 1)рыбу 2)соль.", language: 'ru') 206 | expect(ps.segment).to eq(["Нужно купить 1)рыбу 2)соль."]) 207 | end 208 | 209 | it 'correctly segments text #038' do 210 | ps = PragmaticSegmenter::Segmenter.new(text: "Машина едет со скоростью 100 км/ч.", language: 'ru') 211 | expect(ps.segment).to eq(["Машина едет со скоростью 100 км/ч."]) 212 | end 213 | 214 | it 'correctly segments text #039' do 215 | ps = PragmaticSegmenter::Segmenter.new(text: "Л.Н. Толстой написал \"Войну и мир\". Кроме Волконских, Л. Н. Толстой состоял в близком родстве с некоторыми другими аристократическими родами. Дом, где родился Л.Н.Толстой, 1898 г. В 1854 году дом продан по распоряжению писателя на вывоз в село Долгое.", language: 'ru') 216 | expect(ps.segment).to eq(["Л.Н. Толстой написал \"Войну и мир\".", "Кроме Волконских, Л. Н. Толстой состоял в близком родстве с некоторыми другими аристократическими родами.", "Дом, где родился Л.Н.Толстой, 1898 г. В 1854 году дом продан по распоряжению писателя на вывоз в село Долгое."]) 217 | end 218 | end 219 | end 220 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/armenian_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Armenian, '(hy)' do 4 | 5 | context "Golden Rules" do 6 | it "Sentence ending punctuation #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "Ի՞նչ ես մտածում: Ոչինչ:", language: "hy") 8 | expect(ps.segment).to eq(["Ի՞նչ ես մտածում:", "Ոչինչ:"]) 9 | end 10 | 11 | it "Ellipsis #002" do 12 | ps = PragmaticSegmenter::Segmenter.new(text: "Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", language: "hy") 13 | expect(ps.segment).to eq(["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"]) 14 | end 15 | 16 | it "Period is not a sentence boundary #003" do 17 | ps = PragmaticSegmenter::Segmenter.new(text: "Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", language: "hy") 18 | expect(ps.segment).to eq(["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"]) 19 | end 20 | end 21 | 22 | describe '#segment' do 23 | # Thanks to Armine Abelyan for the Armenian test examples. 
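    # Note (editorial): Armenian writes its full stop as a colon-like mark, which is
    # why the fixtures in this file treat ':' as the sentence terminator while a
    # Latin '.' mid-sentence (as in "հետևյալն է. պարզություն") stays inside the
    # same segment.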
24 | 25 | it 'correctly segments text #001' do 26 | ps = PragmaticSegmenter::Segmenter.new(text: "Սա այն փուլն է, երբ տեղի է ունենում Համակարգի մշակումը: Համաձայն Փուլ 2-ի, Մատակարարը մշակում և/կամ հարմարեցնում է համապատասխան ծրագիրը, տեղադրում ծրագրի բաղկացուցիչները, կատարում առանձին բլոկի և համակարգի թեստավորում և ներառում տարբեր մոդուլներ եզակի աշխատանքային համակարգում, որը կազմում է այս Փուլի արդյունքը:", language: 'hy') 27 | expect(ps.segment).to eq(["Սա այն փուլն է, երբ տեղի է ունենում Համակարգի մշակումը:", "Համաձայն Փուլ 2-ի, Մատակարարը մշակում և/կամ հարմարեցնում է համապատասխան ծրագիրը, տեղադրում ծրագրի բաղկացուցիչները, կատարում առանձին բլոկի և համակարգի թեստավորում և ներառում տարբեր մոդուլներ եզակի աշխատանքային համակարգում, որը կազմում է այս Փուլի արդյունքը:"]) 28 | end 29 | 30 | it 'correctly segments text #002' do 31 | ps = PragmaticSegmenter::Segmenter.new(text: "Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար: 2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ: Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի: Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից: Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում: Մատակարարը պետք է տրամադրի հետևյալը`", language: 'hy') 32 | expect(ps.segment).to eq(["Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար:", "2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ:", "Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի:", "Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից:", "Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում:", "Մատակարարը պետք է տրամադրի հետևյալը`"]) 33 | end 34 | 35 | it 'correctly segments text #003' do 36 | ps = PragmaticSegmenter::Segmenter.new(text: "Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար: 2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ: Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի: Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից: Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում: Մատակարարը պետք է տրամադրի հետևյալը`", language: 'hy') 37 | expect(ps.segment).to eq(["Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար:", "2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ:", "Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի:", "Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից:", "Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում:", "Մատակարարը պետք է տրամադրի 
հետևյալը`"]) 38 | end 39 | 40 | it 'correctly segments text #004' do 41 | # "Hello world. My name is Armine." ==> ["Hello world.", "My name is Armine."] 42 | ps = PragmaticSegmenter::Segmenter.new(text: "Բարև Ձեզ: Իմ անունն էԱրմինե:", language: 'hy') 43 | expect(ps.segment).to eq(["Բարև Ձեզ:", "Իմ անունն էԱրմինե:"]) 44 | end 45 | 46 | it 'correctly segments text #005' do 47 | # "Today is Monday. I am going to work." ==> ["Today is Monday.", "I am going to work."] 48 | ps = PragmaticSegmenter::Segmenter.new(text: "Այսօր երկուշաբթի է: Ես գնում եմ աշխատանքի:", language: 'hy') 49 | expect(ps.segment).to eq(["Այսօր երկուշաբթի է:", "Ես գնում եմ աշխատանքի:"]) 50 | end 51 | 52 | it 'correctly segments text #006' do 53 | # "Tomorrow is September 1st. We are going to school." ==> ["Tomorrow is September 1st.", "We are going to school."] 54 | ps = PragmaticSegmenter::Segmenter.new(text: "Վաղը սեպտեմբերի 1-ն է: Մենք գնում ենք դպրոց:", language: 'hy') 55 | expect(ps.segment).to eq(["Վաղը սեպտեմբերի 1-ն է:", "Մենք գնում ենք դպրոց:"]) 56 | end 57 | 58 | it 'correctly segments text #007' do 59 | # "Yes, I understood. I really love you." ==> ["Yes, I understood.", "I really love you."] 60 | ps = PragmaticSegmenter::Segmenter.new(text: "Այո, ես հասկացա: Ես իսկապես քեզ սիրում եմ:", language: 'hy') 61 | expect(ps.segment).to eq(["Այո, ես հասկացա:", "Ես իսկապես քեզ սիրում եմ:"]) 62 | end 63 | 64 | it 'correctly segments text #008' do 65 | # "Close the windows. It is raining in the evening." ==> ["Close the windows.", "It is raining in the evening."] 66 | ps = PragmaticSegmenter::Segmenter.new(text: "Փակիր պատուհանները: Երեկոյան անձրևում է:", language: 'hy') 67 | expect(ps.segment).to eq(["Փակիր պատուհանները:", "Երեկոյան անձրևում է:"]) 68 | end 69 | 70 | it 'correctly segments text #009' do 71 | # "It is dark. I should go home." ==> ["It is dark.", "I should go home."] 72 | ps = PragmaticSegmenter::Segmenter.new(text: "Մութ է: Ես պետք է տուն վերադառնամ:", language: 'hy') 73 | expect(ps.segment).to eq(["Մութ է:", "Ես պետք է տուն վերադառնամ:"]) 74 | end 75 | 76 | it 'correctly segments text #010' do 77 | # "You know, I am starting to believe. Everything is changing." ==> ["You know, I am starting to believe.", "Everything is changing."] 78 | ps = PragmaticSegmenter::Segmenter.new(text: "Գիտես, սկսել եմ հավատալ: Ամեն ինչ փոխվում է:", language: 'hy') 79 | expect(ps.segment).to eq(["Գիտես, սկսել եմ հավատալ:", "Ամեն ինչ փոխվում է:"]) 80 | end 81 | 82 | it 'correctly segments text #011' do 83 | # "It is a new Christmas tree. We should decorate it." ==> ["It is a new Christmas tree.", "We should decorate it."] 84 | ps = PragmaticSegmenter::Segmenter.new(text: "Տոնածառը նոր է: Պետք է այն զարդարել:", language: 'hy') 85 | expect(ps.segment).to eq(["Տոնածառը նոր է:", "Պետք է այն զարդարել:"]) 86 | end 87 | 88 | it 'correctly segments text #012' do 89 | # "I am in hurry. I could not wait you." ==> ["I am in hurry.", "I could not wait you."] 90 | ps = PragmaticSegmenter::Segmenter.new(text: "Ես շտապում եմ: Ես քեզ չեմ կարող սպասել:", language: 'hy') 91 | expect(ps.segment).to eq(["Ես շտապում եմ:", "Ես քեզ չեմ կարող սպասել:"]) 92 | end 93 | 94 | it 'correctly segments text #013' do 95 | # "Wait, we love each other. I want us to live together." 
==> ["Wait, we love each other.", "I want us to live together."] 96 | ps = PragmaticSegmenter::Segmenter.new(text: "Սպասիր, մենք իրար սիրում ենք: Ցանկանում եմ միասին ապրենք:", language: 'hy') 97 | expect(ps.segment).to eq(["Սպասիր, մենք իրար սիրում ենք:", "Ցանկանում եմ միասին ապրենք:"]) 98 | end 99 | 100 | it 'correctly segments text #014' do 101 | # "No, I do not think so. It is not true." ==> ["No, I do not think so.", "It is not true."] 102 | ps = PragmaticSegmenter::Segmenter.new(text: "Ոչ, այդպես չեմ կարծում: Դա ճիշտ չէ:", language: 'hy') 103 | expect(ps.segment).to eq(["Ոչ, այդպես չեմ կարծում:", "Դա ճիշտ չէ:"]) 104 | end 105 | 106 | it 'correctly segments text #015' do 107 | # "April 24 it has started to rain... I was thinking about." ==> ["April 24 it has started to rain... I was thinking about."] 108 | ps = PragmaticSegmenter::Segmenter.new(text: "Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", language: 'hy') 109 | expect(ps.segment).to eq(["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"]) 110 | end 111 | 112 | it 'correctly segments text #016' do 113 | # "It was 1960...it was winter...it was night. It was cold...emptiness." ==> ["It was 1960...it was winter...it was night.", "It was cold...emptiness."] 114 | ps = PragmaticSegmenter::Segmenter.new(text: "1960 թվական…ձմեռ…գիշեր: Սառն էր…դատարկություն:", language: 'hy') 115 | expect(ps.segment).to eq(["1960 թվական…ձմեռ…գիշեր:", "Սառն էր…դատարկություն:"]) 116 | end 117 | 118 | it 'correctly segments text #017' do 119 | # "Why a computer could not do what a man could do? Simply it doesn't have a human brain." ==> ["Why a computer could not do what a man could do?", "Simply it doesn't have a human brain."] 120 | ps = PragmaticSegmenter::Segmenter.new(text: "Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը: Պարզապես չունի մարդկային ուղեղ:", language: 'hy') 121 | expect(ps.segment).to eq(["Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը:", "Պարզապես չունի մարդկային ուղեղ:"]) 122 | end 123 | 124 | it 'correctly segments text #018' do 125 | # "Numerate for me 3 things that are important for you - I answer love, knowledge, sincerity." ==> ["Numerate for me 3 things that are important for you - I answer love, knowledge, sincerity."] 126 | ps = PragmaticSegmenter::Segmenter.new(text: "Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:", language: 'hy') 127 | expect(ps.segment).to eq(["Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:"]) 128 | end 129 | 130 | it 'correctly segments text #019' do 131 | # "So, we are coming to the end. The logic is...simplicity and work" ==> ["So, we are coming to the end.", "Simplicity and work."] 132 | ps = PragmaticSegmenter::Segmenter.new(text: "Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", language: 'hy') 133 | expect(ps.segment).to eq(["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"]) 134 | end 135 | 136 | it 'correctly segments text #020' do 137 | # "What are you thinking? Nothing!" ==> ["What are you thinking?", "Nothing!"] 138 | ps = PragmaticSegmenter::Segmenter.new(text: "Ի՞նչ ես մտածում: Ոչինչ:", language: 'hy') 139 | expect(ps.segment).to eq(["Ի՞նչ ես մտածում:", "Ոչինչ:"]) 140 | end 141 | 142 | it 'correctly segments text #021' do 143 | # "Can we work together ?. May be what you are thinking, is possible." 
==> ["Can we work together?.", "May be what you are thinking is possible."] 144 | ps = PragmaticSegmenter::Segmenter.new(text: "Կարող ե՞նք միասին աշխատել: Գուցե այն ինչ մտածում ես, իրականանալի է:", language: 'hy') 145 | expect(ps.segment).to eq(["Կարող ե՞նք միասին աշխատել:", "Գուցե այն ինչ մտածում ես, իրականանալի է:"]) 146 | end 147 | 148 | it 'correctly segments text #022' do 149 | # "Now what we have started, comes to the end. However the questions are numerous... ." ==> ["Now what we have started, comes to the end.", "However the questions are numerous... ."] 150 | ps = PragmaticSegmenter::Segmenter.new(text: "Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում: Հարցերը սակայն շատ են...:", language: 'hy') 151 | expect(ps.segment).to eq(["Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում:", "Հարցերը սակայն շատ են...:"]) 152 | end 153 | 154 | it 'correctly segments text #023' do 155 | # "Honey... I am waiting. Shall I go... or?" ==> ["Honey... I am waiting.", "Shall I go... or?"] 156 | ps = PragmaticSegmenter::Segmenter.new(text: "Սիրելիս...սպասում եմ: Գնամ թ՟ե …:", language: 'hy') 157 | expect(ps.segment).to eq(["Սիրելիս...սպասում եմ:", "Գնամ թ՟ե …:"]) 158 | end 159 | end 160 | end 161 | -------------------------------------------------------------------------------- /spec/pragmatic_segmenter/languages/spanish_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe PragmaticSegmenter::Languages::Spanish, '(es)' do 4 | 5 | context "Golden Rules" do 6 | it "Question mark to end sentence #001" do 7 | ps = PragmaticSegmenter::Segmenter.new(text: "¿Cómo está hoy? Espero que muy bien.", language: "es") 8 | expect(ps.segment).to eq(["¿Cómo está hoy?", "Espero que muy bien."]) 9 | end 10 | 11 | it "Exclamation point to end sentence #002" do 12 | ps = PragmaticSegmenter::Segmenter.new(text: "¡Hola señorita! Espero que muy bien.", language: "es") 13 | expect(ps.segment).to eq(["¡Hola señorita!", "Espero que muy bien."]) 14 | end 15 | 16 | it "Abbreviations #003" do 17 | ps = PragmaticSegmenter::Segmenter.new(text: "Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser.", language: "es") 18 | expect(ps.segment).to eq(["Hola Srta. Ledesma.", "Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."]) 19 | end 20 | 21 | it "Numbers #004" do 22 | ps = PragmaticSegmenter::Segmenter.new(text: "¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año.", language: "es") 23 | expect(ps.segment).to eq(["¡La casa cuesta $170.500.000,00!", "¡Muy costosa!", "Se prevé una disminución del 12.5% para el próximo año."]) 24 | end 25 | 26 | it "Quotations #005" do 27 | ps = PragmaticSegmenter::Segmenter.new(text: "«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles.", language: "es") 28 | expect(ps.segment).to eq(["«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles."]) 29 | end 30 | end 31 | 32 | # Thanks to Alejandro Naser Pastoriza for the Spanish test examples. 33 | describe '#segment' do 34 | it 'correctly segments text #001' do 35 | ps = PragmaticSegmenter::Segmenter.new(text: '«Ninguna mente extraordinaria está exenta de un toque de demencia», dijo Aristóteles. Pablo, ¿adónde vas? 
¡¿Qué viste?!', language: 'es') 36 | expect(ps.segment).to eq(['«Ninguna mente extraordinaria está exenta de un toque de demencia», dijo Aristóteles.', 'Pablo, ¿adónde vas?', '¡¿Qué viste?!']) 37 | end 38 | 39 | it 'correctly segments text #002' do 40 | ps = PragmaticSegmenter::Segmenter.new(text: 'Admón. es administración o me equivoco.', language: 'es') 41 | expect(ps.segment).to eq(['Admón. es administración o me equivoco.']) 42 | end 43 | 44 | it 'correctly segments text #003' do 45 | ps = PragmaticSegmenter::Segmenter.new(text: "• 1. Busca atención prenatal desde el principio \n• 2. Aliméntate bien \n• 3. Presta mucha atención a la higiene de los alimentos \n• 4. Toma suplementos de ácido fólico y come pescado \n• 5. Haz ejercicio regularmente \n• 6. Comienza a hacer ejercicios de Kegel \n• 7. Restringe el consumo de alcohol \n• 8. Disminuye el consumo de cafeína \n• 9. Deja de fumar \n• 10. Descansa", language: 'es') 46 | expect(ps.segment).to eq(["• 1. Busca atención prenatal desde el principio", "• 2. Aliméntate bien", "• 3. Presta mucha atención a la higiene de los alimentos", "• 4. Toma suplementos de ácido fólico y come pescado", "• 5. Haz ejercicio regularmente", "• 6. Comienza a hacer ejercicios de Kegel", "• 7. Restringe el consumo de alcohol", "• 8. Disminuye el consumo de cafeína", "• 9. Deja de fumar", "• 10. Descansa"]) 47 | end 48 | 49 | it 'correctly segments text #004' do 50 | ps = PragmaticSegmenter::Segmenter.new(text: "• 1. Busca atención prenatal desde el principio \n• 2. Aliméntate bien \n• 3. Presta mucha atención a la higiene de los alimentos \n• 4. Toma suplementos de ácido fólico y come pescado \n• 5. Haz ejercicio regularmente \n• 6. Comienza a hacer ejercicios de Kegel \n• 7. Restringe el consumo de alcohol \n• 8. Disminuye el consumo de cafeína \n• 9. Deja de fumar \n• 10. Descansa \n• 11. Hola", language: 'es') 51 | expect(ps.segment).to eq(["• 1. Busca atención prenatal desde el principio", "• 2. Aliméntate bien", "• 3. Presta mucha atención a la higiene de los alimentos", "• 4. Toma suplementos de ácido fólico y come pescado", "• 5. Haz ejercicio regularmente", "• 6. Comienza a hacer ejercicios de Kegel", "• 7. Restringe el consumo de alcohol", "• 8. Disminuye el consumo de cafeína", "• 9. Deja de fumar", "• 10. Descansa", "• 11. Hola"]) 52 | end 53 | 54 | it 'correctly segments text #005' do 55 | ps = PragmaticSegmenter::Segmenter.new(text: "¡Hola Srta. Ledesma! ¿Cómo está hoy? Espero que muy bien.", language: 'es') 56 | expect(ps.segment).to eq(["¡Hola Srta. Ledesma!", "¿Cómo está hoy?", "Espero que muy bien."]) 57 | end 58 | 59 | it 'correctly segments text #006' do 60 | ps = PragmaticSegmenter::Segmenter.new(text: "Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser.", language: 'es') 61 | expect(ps.segment).to eq(["Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."]) 62 | end 63 | 64 | it 'correctly segments text #007' do 65 | ps = PragmaticSegmenter::Segmenter.new(text: "He apuntado una cita para la siguiente fecha: Mar. 23 de Nov. de 2014. Gracias.", language: 'es') 66 | expect(ps.segment).to eq(["He apuntado una cita para la siguiente fecha: Mar. 23 de Nov. de 2014.", "Gracias."]) 67 | end 68 | 69 | it 'correctly segments text #008' do 70 | ps = PragmaticSegmenter::Segmenter.new(text: "Núm. de tel: 351.123.465.4. Envíe mis saludos a la Sra. Rescia.", language: 'es') 71 | expect(ps.segment).to eq(["Núm. de tel: 351.123.465.4.", "Envíe mis saludos a la Sra. 
Rescia."]) 72 | end 73 | 74 | it 'correctly segments text #009' do 75 | ps = PragmaticSegmenter::Segmenter.new(text: "Cero en la escala Celsius o de grados centígrados (0 °C) se define como el equivalente a 273.15 K, con una diferencia de temperatura de 1 °C equivalente a una diferencia de 1 Kelvin. Esto significa que 100 °C, definido como el punto de ebullición del agua, se define como el equivalente a 373.15 K.", language: 'es') 76 | expect(ps.segment).to eq(["Cero en la escala Celsius o de grados centígrados (0 °C) se define como el equivalente a 273.15 K, con una diferencia de temperatura de 1 °C equivalente a una diferencia de 1 Kelvin.", "Esto significa que 100 °C, definido como el punto de ebullición del agua, se define como el equivalente a 373.15 K."]) 77 | end 78 | 79 | it 'correctly segments text #010' do 80 | ps = PragmaticSegmenter::Segmenter.new(text: "Durante la primera misión del Discovery (30 Ago. 1984 15:08.10) tuvo lugar el lanzamiento de dos satélites de comunicación, el nombre de esta misión fue STS-41-D.", language: 'es') 81 | expect(ps.segment).to eq(["Durante la primera misión del Discovery (30 Ago. 1984 15:08.10) tuvo lugar el lanzamiento de dos satélites de comunicación, el nombre de esta misión fue STS-41-D."]) 82 | end 83 | 84 | it 'correctly segments text #011' do 85 | ps = PragmaticSegmenter::Segmenter.new(text: "Frase del gran José Hernández: \"Aquí me pongo a cantar / al compás de la vigüela, / que el hombre que lo desvela / una pena estrordinaria, / como la ave solitaria / con el cantar se consuela. / [...] \".", language: 'es') 86 | expect(ps.segment).to eq(["Frase del gran José Hernández: \"Aquí me pongo a cantar / al compás de la vigüela, / que el hombre que lo desvela / una pena estrordinaria, / como la ave solitaria / con el cantar se consuela. / [...] \"."]) 87 | end 88 | 89 | it 'correctly segments text #012' do 90 | ps = PragmaticSegmenter::Segmenter.new(text: "Citando a Criss Jami «Prefiero ser un artista a ser un líder, irónicamente, un líder tiene que seguir las reglas.», lo cual parece muy acertado.", language: 'es') 91 | expect(ps.segment).to eq(["Citando a Criss Jami «Prefiero ser un artista a ser un líder, irónicamente, un líder tiene que seguir las reglas.», lo cual parece muy acertado."]) 92 | end 93 | 94 | it 'correctly segments text #013' do 95 | ps = PragmaticSegmenter::Segmenter.new(text: "Cuando llegué, le estaba dando ejercicios a los niños, uno de los cuales era \"3 + (14/7).x = 5\". ¿Qué te parece?", language: 'es') 96 | expect(ps.segment).to eq(["Cuando llegué, le estaba dando ejercicios a los niños, uno de los cuales era \"3 + (14/7).x = 5\".", "¿Qué te parece?"]) 97 | end 98 | 99 | it 'correctly segments text #014' do 100 | ps = PragmaticSegmenter::Segmenter.new(text: "Se le pidió a los niños que leyeran los párrf. 5 y 6 del art. 4 de la constitución de los EE. UU..", language: 'es') 101 | expect(ps.segment).to eq(["Se le pidió a los niños que leyeran los párrf. 5 y 6 del art. 4 de la constitución de los EE. UU.."]) 102 | end 103 | 104 | it 'correctly segments text #015' do 105 | ps = PragmaticSegmenter::Segmenter.new(text: "Una de las preguntas realizadas en la evaluación del día Lun. 15 de Mar. fue la siguiente: \"Alumnos, ¿cuál es el resultado de la operación 1.1 + 4/5?\". Disponían de 1 min. para responder esa pregunta.", language: 'es') 106 | expect(ps.segment).to eq(["Una de las preguntas realizadas en la evaluación del día Lun. 15 de Mar. 
fue la siguiente: \"Alumnos, ¿cuál es el resultado de la operación 1.1 + 4/5?\".", "Disponían de 1 min. para responder esa pregunta."]) 107 | end 108 | 109 | it 'correctly segments text #016' do 110 | ps = PragmaticSegmenter::Segmenter.new(text: "La temperatura del motor alcanzó los 120.5°C. Afortunadamente, pudo llegar al final de carrera.", language: 'es') 111 | expect(ps.segment).to eq(["La temperatura del motor alcanzó los 120.5°C.", "Afortunadamente, pudo llegar al final de carrera."]) 112 | end 113 | 114 | it 'correctly segments text #017' do 115 | ps = PragmaticSegmenter::Segmenter.new(text: "El volumen del cuerpo es 3m³. ¿Cuál es la superficie de cada cara del prisma?", language: 'es') 116 | expect(ps.segment).to eq(["El volumen del cuerpo es 3m³.", "¿Cuál es la superficie de cada cara del prisma?"]) 117 | end 118 | 119 | it 'correctly segments text #018' do 120 | ps = PragmaticSegmenter::Segmenter.new(text: "La habitación tiene 20.55m². El living tiene 50.0m².", language: 'es') 121 | expect(ps.segment).to eq(["La habitación tiene 20.55m².", "El living tiene 50.0m²."]) 122 | end 123 | 124 | it 'correctly segments text #019' do 125 | ps = PragmaticSegmenter::Segmenter.new(text: "1°C corresponde a 33.8°F. ¿A cuánto corresponde 35°C?", language: 'es') 126 | expect(ps.segment).to eq(["1°C corresponde a 33.8°F.", "¿A cuánto corresponde 35°C?"]) 127 | end 128 | 129 | it 'correctly segments text #020' do 130 | ps = PragmaticSegmenter::Segmenter.new(text: "Hamilton ganó el último gran premio de Fórmula 1, luego de 1:39:02.619 Hs. de carrera, segundo resultó Massa, a una diferencia de 2.5 segundos. De esta manera se consagró ¡Campeón mundial!", language: 'es') 131 | expect(ps.segment).to eq(["Hamilton ganó el último gran premio de Fórmula 1, luego de 1:39:02.619 Hs. de carrera, segundo resultó Massa, a una diferencia de 2.5 segundos.", "De esta manera se consagró ¡Campeón mundial!"]) 132 | end 133 | 134 | it 'correctly segments text #021' do 135 | ps = PragmaticSegmenter::Segmenter.new(text: "¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año.", language: 'es') 136 | expect(ps.segment).to eq(["¡La casa cuesta $170.500.000,00!", "¡Muy costosa!", "Se prevé una disminución del 12.5% para el próximo año."]) 137 | end 138 | 139 | it 'correctly segments text #022' do 140 | ps = PragmaticSegmenter::Segmenter.new(text: "El corredor No. 103 arrivó 4°.", language: 'es') 141 | expect(ps.segment).to eq(["El corredor No. 103 arrivó 4°."]) 142 | end 143 | 144 | it 'correctly segments text #023' do 145 | ps = PragmaticSegmenter::Segmenter.new(text: "Hoy es 27/04/2014, y es mi cumpleaños. ¿Cuándo es el tuyo?", language: 'es') 146 | expect(ps.segment).to eq(["Hoy es 27/04/2014, y es mi cumpleaños.", "¿Cuándo es el tuyo?"]) 147 | end 148 | 149 | it 'correctly segments text #024' do 150 | ps = PragmaticSegmenter::Segmenter.new(text: "Aquí está la lista de compras para el almuerzo: 1.Helado, 2.Carne, 3.Arroz. ¿Cuánto costará? Quizás $12.5.", language: 'es') 151 | expect(ps.segment).to eq(["Aquí está la lista de compras para el almuerzo: 1.Helado, 2.Carne, 3.Arroz.", "¿Cuánto costará?", "Quizás $12.5."]) 152 | end 153 | 154 | it 'correctly segments text #025' do 155 | ps = PragmaticSegmenter::Segmenter.new(text: "1 + 1 es 2. 2 + 2 es 4. 
El auto es de color rojo.", language: 'es') 156 | expect(ps.segment).to eq(["1 + 1 es 2.", "2 + 2 es 4.", "El auto es de color rojo."]) 157 | end 158 | 159 | it 'correctly segments text #026' do 160 | ps = PragmaticSegmenter::Segmenter.new(text: "La máquina viajaba a 100 km/h. ¿En cuánto tiempo recorrió los 153 Km.?", language: 'es') 161 | expect(ps.segment).to eq(["La máquina viajaba a 100 km/h.", "¿En cuánto tiempo recorrió los 153 Km.?"]) 162 | end 163 | 164 | it 'correctly segments text #027' do 165 | ps = PragmaticSegmenter::Segmenter.new(text: "\n \nCentro de Relaciones Interinstitucionales -CERI \n\nCra. 7 No. 40-53 Piso 10 Tel. (57-1) 3239300 Ext. 1010 Fax: (57-1) 3402973 Bogotá, D.C. - Colombia \n\nhttp://www.udistrital.edu.co - http://ceri.udistrital.edu.co - relinter@udistrital.edu.co \n\n \n\nCERI 0908 \n \nBogotá, D.C. 6 de noviembre de 2014. \n \nSeñores: \nEMBAJADA DE UNITED KINGDOM \n \n", language: 'es') 166 | expect(ps.segment).to eq(["Centro de Relaciones Interinstitucionales -CERI", "Cra. 7 No. 40-53 Piso 10 Tel. (57-1) 3239300 Ext. 1010 Fax: (57-1) 3402973 Bogotá, D.C. - Colombia", "http://www.udistrital.edu.co - http://ceri.udistrital.edu.co - relinter@udistrital.edu.co", "CERI 0908", "Bogotá, D.C. 6 de noviembre de 2014.", "Señores:", "EMBAJADA DE UNITED KINGDOM"]) 167 | end 168 | 169 | it 'correctly segments text #028' do 170 | ps = PragmaticSegmenter::Segmenter.new(text: "N°. 1026.253.553", language: 'es') 171 | expect(ps.segment).to eq(["N°. 1026.253.553"]) 172 | end 173 | 174 | it 'correctly segments text #029' do 175 | ps = PragmaticSegmenter::Segmenter.new(text: "\nA continuación me permito presentar a la Ingeniera LAURA MILENA LEÓN \nSANDOVAL, identificada con el documento N°. 1026.253.553 de Bogotá, \negresada del Programa Ingeniería Industrial en el año 2012, quien se desatacó por \nsu excelencia académica, actualmente cursa el programa de Maestría en \nIngeniería Industrial y se encuentra en un intercambio cultural en Bangalore – \nIndia.", language: 'es', doc_type: 'pdf') 176 | expect(ps.segment).to eq(["A continuación me permito presentar a la Ingeniera LAURA MILENA LEÓN SANDOVAL, identificada con el documento N°. 1026.253.553 de Bogotá, egresada del Programa Ingeniería Industrial en el año 2012, quien se desatacó por su excelencia académica, actualmente cursa el programa de Maestría en Ingeniería Industrial y se encuentra en un intercambio cultural en Bangalore – India."]) 177 | end 178 | 179 | it 'correctly segments text #030' do 180 | ps = PragmaticSegmenter::Segmenter.new(text: "\n__________________________________________________________\nEl Board para Servicios Educativos de Putnam/Northern Westchester según el título IX, Sección 504 del “Rehabilitation Act” del 1973, del Título VII y del Acta “American with Disabilities” no discrimina para la admisión a programas educativos por sexo, creencia, nacionalidad, origen, edad o discapacidad.", language: 'es') 181 | expect(ps.segment).to eq(["El Board para Servicios Educativos de Putnam/Northern Westchester según el título IX, Sección 504 del “Rehabilitation Act” del 1973, del Título VII y del Acta “American with Disabilities” no discrimina para la admisión a programas educativos por sexo, creencia, nacionalidad, origen, edad o discapacidad."]) 182 | end 183 | 184 | it 'correctly segments text #031' do 185 | ps = PragmaticSegmenter::Segmenter.new(text: "Explora oportunidades de carrera en el área de Salud en el Hospital de Northern en Mt. 
Kisco.", language: 'es') 186 | expect(ps.segment).to eq(["Explora oportunidades de carrera en el área de Salud en el Hospital de Northern en Mt. Kisco."]) 187 | end 188 | end 189 | end 190 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/dutch.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Dutch 6 | include Languages::Common 7 | 8 | module Abbreviation 9 | ABBREVIATIONS = Set.new(['a.2d', 'a.a', 'a.a.j.b', 'a.f.t', 'a.g.j.b', 'a.h.v', 'a.h.w', 'a.hosp', 'a.i', 'a.j.b', 'a.j.t', 'a.m', 'a.m.r', 'a.p.m', 'a.p.r', 'a.p.t', 'a.s', 'a.t.d.f', 'a.u.b', 'a.v.a', 'a.w', 'aanbev', 'aanbev.comm', 'aant', 'aanv.st', 'aanw', 'vnw', 'aanw.vnw', 'abd', 'abm', 'abs', 'acc.& fisc', 'acc.act', 'acc.bedr.m', 'acc.bedr.t', "acc.thema's m.", 'acc.thema’s m', 'achterv', 'act.dr', 'act.dr.fam', 'act.fisc', 'act.soc', 'adm.akk', 'adm.besl', 'adm.lex', 'adm.onderr', 'adm.ov', 'adv', 'adv', 'gen', 'adv.bl', 'afd', 'afl', 'aggl.verord', 'agr', 'al', 'alg', 'alg.richts', 'amén', 'ann.dr', 'ann.dr.lg', 'ann.dr.sc.pol', 'ann.ét.eur', 'ann.fac.dr.lg', 'ann.jur.créd', 'ann.jur.créd.règl.coll', 'ann.not', 'ann.parl', 'ann.prat.comm', 'app', 'arb', 'aud', 'arbbl', 'arbh', 'arbit.besl', 'arbrb', 'arr', 'arr.cass', 'arr.r.v.st', 'arr.verbr', 'arrondrb', 'art', 'artw', 'aud', 'b', 'b', 'en w', 'b.&w', 'b.a', 'b.a.s', 'b.b.o', 'b.best.dep', 'b.br.ex', 'b.coll.fr.gem.comm', 'b.coll.vl.gem.comm', 'b.d.cult.r', 'b.d.gem.ex', 'b.d.gem.reg', 'b.dep', 'b.e.b', 'b.f.r', 'b.fr.gem.ex', 'b.fr.gem.reg', 'b.i.h', 'b.inl.j.d', 'b.inl.s.reg', 'b.j', 'b.l', 'b.lid br.ex', 'b.lid d.gem.ex', 'b.lid fr.gem.ex', 'b.lid vl.ex', 'b.lid w.gew.ex', 'b.o.z', 'b.prov.r', 'b.r.h', 'b.s', 'b.sr', 'b.stb', 'b.t.i.r', 'b.t.s.z', 'b.t.w.rev', 'b.v', 'b.ver.coll.gem.gem.comm', 'b.verg.r.b', 'b.versl', 'b.vl.ex', 'b.voorl.reg', 'b.w', 'b.w.gew.ex', 'b.z.d.g', 'b.z.v', 'bab', 'bank fin', 'bank fin.r', 'bedr.org', 'begins', 'beheersov', 'bekendm.comm', 'bel', 'bel.besch', 'bel.w.p', 'beleidsov', 'belg', 'grondw', 'benelux jur', 'ber', 'ber.w', 'besch', 'besl', 'beslagr', 'besluitwet nr', 'bestuurswet', 'bet', 'betr', 'betr', 'vnw', 'bevest', 'bew', 'bijbl', 'ind', 'eig', 'bijbl.n.bijdr', 'bijl', 'bijv', 'bijw', 'bijz.decr', 'bin.b', 'bkh', 'bl', 'blz', 'bm', 'bn', 'bnlx merkw', 'bnlx tek', 'bnlx uitl', 'rh', 'bnw', 'bouwr', 'br drs', 'br.parl', 'bs', 'bt drs', 'btw rev', 'bull', 'bull.adm.pénit', 'bull.ass', 'bull.b.m.m', 'bull.bel', 'bull.best.strafinr', 'bull.bmm', 'bull.c.b.n', 'bull.c.n.c', 'bull.cbn', 'bull.centr.arb', 'bull.cnc', 'bull.contr', 'bull.doc.min.fin', 'bull.f.e.b', 'bull.feb', 'bull.fisc.fin.r', 'bull.i.u.m', 'bull.inf.ass.secr.soc', 'bull.inf.i.e.c', 'bull.inf.i.n.a.m.i', 'bull.inf.i.r.e', 'bull.inf.iec', 'bull.inf.inami', 'bull.inf.ire', 'bull.inst.arb', 'bull.ium', 'bull.jur.imm', 'bull.lég.b', 'bull.off', 'bull.trim.b.dr.comp', 'bull.us', 'bull.v.b.o', 'bull.vbo', 'bv i.o', 'bv', 'bw int.reg', 'bw', 'bxh', 'byz', 'c', 'c.& f', 'c.& f.p', 'c.a', 'c.a.-a', 'c.a.b.g', 'c.c', 'c.c.i', 'c.c.s', 'c.conc.jur', 'c.d.e', 'c.d.p.k', 'c.e', 'c.ex', 'c.f', 'c.h.a', 'c.i.f', 'c.i.f.i.c', 'c.j', 'c.l', 'c.n', 'c.o.d', 'c.p', 'c.pr.civ', 'c.q', 'c.r', 'c.r.a', 'c.s', 'c.s.a', 'c.s.q.n', 'c.v', 'c.v.a', 'c.v.o', 'ca', 'cadeaust', 'cah.const', 'cah.dr.europ', 'cah.dr.immo', 'cah.dr.jud', 'cal', '2d', 'cal', '3e', 'cal', 'rprt', 'cap', 'carg', 'cass', 
'cass', 'verw', 'cert', 'cf', 'ch', 'chron', 'chron.d.s', 'chron.dr.not', 'cie', 'cie', 'verz.schr', 'cir', 'circ', 'circ.z', 'cit', 'cit.loc', 'civ', 'cl.et.b', 'cmt', 'co', 'cognoss.v', 'coll', 'v', 'b', 'colp.w', 'com', 'com', 'cas', 'com.v.min', 'comm', 'comm', 'v', 'comm.bijz.ov', 'comm.erf', 'comm.fin', 'comm.ger', 'comm.handel', 'comm.pers', 'comm.pub', 'comm.straf', 'comm.v', 'comm.v.en v', 'comm.venn', 'comm.verz', 'comm.voor', 'comp', 'compt.w', 'computerr', 'con.m', 'concl', 'concr', 'conf', 'confl.w', 'confl.w.huwbetr', 'cons', 'conv', 'coöp', 'ver', 'corr', 'corr.bl', 'cour de cass', 'cour.fisc', 'cour.immo', 'cridon', 'crim', 'cur', 'cur', 'crt', 'curs', 'd', 'd.-g', 'd.a', 'd.a.v', 'd.b.f', 'd.c', 'd.c.c.r', 'd.d', 'd.d.p', 'd.e.t', 'd.gem.r', 'd.h', 'd.h.z', 'd.i', 'd.i.t', 'd.j', 'd.l.r', 'd.m', 'd.m.v', 'd.o.v', 'd.parl', 'd.w.z', 'dact', 'dat', 'dbesch', 'dbesl', 'de advoc', 'de belg.acc', 'de burg.st', 'de gem', 'de gerechtsd', 'de venn', 'de verz', 'decr', 'decr.d', 'decr.fr', 'decr.vl', 'decr.w', 'def', 'dep.opv', 'dep.rtl', 'derg', 'desp', 'det.mag', 'deurw.regl', 'dez', 'dgl', 'dhr', 'disp', 'diss', 'div', 'div.act', 'div.bel', 'dl', 'dln', 'dnotz', 'doc', 'hist', 'doc.jur.b', 'doc.min.fin', 'doc.parl', 'doctr', 'dpl', 'dpl.besl', 'dr', 'dr.banc.fin', 'dr.circ', 'dr.inform', 'dr.mr', 'dr.pén.entr', 'dr.q.m', 'drs', 'dtp', 'dwz', 'dyn', 'e cont', 'e', 'e.a', 'e.b', 'tek.mod', 'e.c', 'e.c.a', 'e.d', 'e.e', 'e.e.a', 'e.e.g', 'e.g', 'e.g.a', 'e.h.a', 'e.i', 'e.j', 'e.m.a', 'e.n.a.c', 'e.o', 'e.p.c', 'e.r.c', 'e.r.f', 'e.r.h', 'e.r.o', 'e.r.p', 'e.r.v', 'e.s.r.a', 'e.s.t', 'e.v', 'e.v.a', 'e.w', 'e&o.e', 'ec.pol.r', 'echos log', 'econ', 'ed', 'ed(s)', 'eeg verd.v', 'eex san s', 'eff', 'eg rtl', 'eig', 'eig.mag', 'eil', 'elektr', 'enmb', 'entr.et dr', 'enz', 'err', 'et al', 'et seq', 'etc', 'etq', 'eur', 'parl', 'eur.t.s', 'eur.verd.overdracht strafv', 'ev rechtsh', 'ev uitl', 'ev', 'evt', 'ex', 'ex.crim', 'exec', 'f', 'f.a.o', 'f.a.q', 'f.a.s', 'f.i.b', 'f.j.f', 'f.o.b', 'f.o.r', 'f.o.s', 'f.o.t', 'f.r', 'f.supp', 'f.suppl', 'fa', 'facs', 'fare act', 'fasc', 'fg', 'fid.ber', 'fig', 'fin.verh.w', 'fisc', 'fisc', 'tijdschr', 'fisc.act', 'fisc.koer', 'fl', 'form', 'foro', 'it', 'fr', 'fr.cult.r', 'fr.gem.r', 'fr.parl', 'fra', 'ft', 'g', 'g.a', 'g.a.v', 'g.a.w.v', 'g.g.d', 'g.m.t', 'g.o', 'g.omt.e', 'g.p', 'g.s', 'g.v', 'g.w.w', 'geb', 'gebr', 'gebrs', 'gec', 'gec.decr', 'ged', 'ged.st', 'gedipl', 'gedr.st', 'geh', 'gem', 'gem', 'en gew', 'gem', 'en prov', 'gem.gem.comm', 'gem.st', 'gem.stem', 'gem.w', 'gem.wet, gem.wet', 'gemeensch.optr', 'gemeensch.standp', 'gemeensch.strat', 'gemeent', 'gemeent.b', 'gemeent.regl', 'gemeent.verord', 'geol', 'geopp', 'gepubl', 'ger.deurw', 'ger.w', 'gerekw', 'gereq', 'gesch', 'get', 'getr', 'gev.m', 'gev.maatr', 'gew', 'ghert', 'gir.eff.verk', 'gk', 'gr', 'gramm', 'grat.w', 'gron,opm.en leermed', 'grootb.w', 'grs', 'grur ausl', 'grur int', 'grvm', 'grw', 'gst', 'gw', 'h.a', 'h.a.v.o', 'h.b.o', 'h.e.a.o', 'h.e.g.a', 'h.e.geb', 'h.e.gestr', 'h.l', 'h.m', 'h.o', 'h.r', 'h.t.l', 'h.t.m', 'h.w.geb', 'hand', 'handelsn.w', 'handelspr', 'handelsr.w', 'handelsreg.w', 'handv', 'harv.l.rev', 'hc', 'herald', 'hert', 'herz', 'hfdst', 'hfst', 'hgrw', 'hhr', 'hist', 'hooggel', 'hoogl', 'hosp', 'hpw', 'hr', 'hr', 'ms', 'hr.ms', 'hregw', 'hrg', 'hst', 'huis.just', 'huisv.w', 'huurbl', 'hv.vn', 'hw', 'hyp.w', 'i.b.s', 'i.c', 'i.c.m.h', 'i.e', 'i.f', 'i.f.p', 'i.g.v', 'i.h', 'i.h.a', 'i.h.b', 'i.l.pr', 'i.o', 'i.p.o', 'i.p.r', 'i.p.v', 'i.pl.v', 'i.r.d.i', 
'i.s.m', 'i.t.t', 'i.v', 'i.v.m', 'i.v.s', 'i.w.tr', 'i.z', 'ib', 'ibid', 'icip-ing.cons', 'iem', 'ind prop', 'indic.soc', 'indiv', 'inf', 'inf.i.d.a.c', 'inf.idac', 'inf.r.i.z.i.v', 'inf.riziv', 'inf.soc.secr', 'ing', 'ing', 'cons', 'ing.cons', 'inst', 'int', 'int', 'rechtsh', 'strafz', "int'l & comp.l.q.", 'interm', 'intern.fisc.act', 'intern.vervoerr', 'inv', 'inv', 'f', 'inv.w', 'inv.wet', 'invord.w', 'inz', 'ir', 'irspr', 'iwtr', 'j', 'j.-cl', 'j.c.b', 'j.c.e', 'j.c.fl', 'j.c.j', 'j.c.p', 'j.d.e', 'j.d.f', 'j.d.s.c', 'j.dr.jeun', 'j.j.d', 'j.j.p', 'j.j.pol', 'j.l', 'j.l.m.b', 'j.l.o', 'j.ordre pharm', 'j.p.a', 'j.r.s', 'j.t', 'j.t.d.e', 'j.t.dr.eur', 'j.t.o', 'j.t.t', 'jaarl', 'jb.hand', 'jb.kred', 'jb.kred.c.s', 'jb.l.r.b', 'jb.lrb', 'jb.markt', 'jb.mens', 'jb.t.r.d', 'jb.trd', 'jeugdrb', 'jeugdwerkg.w', 'jg', 'jis', 'jl', 'journ.jur', 'journ.prat.dr.fisc.fin', 'journ.proc', 'jrg', 'jur', 'jur.comm.fl', 'jur.dr.soc.b.l.n', 'jur.f.p.e', 'jur.fpe', 'jur.niv', 'jur.trav.brux', 'jura falc', 'jurambt', 'jv.cass', 'jv.h.r.j', 'jv.hrj', 'jw', 'k', 'k', 'en m', 'k.b', 'k.g', 'k.k', 'k.m.b.o', 'k.o.o', 'k.v.k', 'k.v.v.v', 'kadasterw', 'kaderb', 'kador', 'kbo-nr', 'kg', 'kh', 'kiesw', 'kind.bes.v', 'kkr', 'koopv', 'kr', 'krankz.w', 'ksbel', 'kt', 'ktg', 'ktr', 'kvdm', 'kw.r', 'kymr', 'kzr', 'kzw', 'l', 'l.b', 'l.b.o', 'l.bas', 'l.c', 'l.gew', 'l.j', 'l.k', 'l.l', 'l.o', 'l.r.b', 'l.u.v.i', 'l.v.r', 'l.v.w', 'l.w', "l'exp.-compt.b.", 'l’exp.-compt.b', 'landinr.w', 'landscrt', 'larcier cass', 'lat', 'law.ed', 'lett', 'levensverz', 'lgrs', 'lidw', 'limb.rechtsl', 'lit', 'litt', 'liw', 'liwet', 'lk', 'll', 'll.(l.)l.r', 'loonw', 'losbl', 'ltd', 'luchtv', 'luchtv.w', 'm', 'm', 'not', 'm.a.v.o', 'm.a.w', 'm.b', 'm.b.o', 'm.b.r', 'm.b.t', 'm.d.g.o', 'm.e.a.o', 'm.e.r', 'm.h', 'm.h.d', 'm.i.v', 'm.j.t', 'm.k', 'm.m', 'm.m.a', 'm.m.h.h', 'm.m.v', 'm.n', 'm.not.fisc', 'm.nt', 'm.o', 'm.r', 'm.s.a', 'm.u.p', 'm.v.a', 'm.v.h.n', 'm.v.t', 'm.z', 'maatr.teboekgest.luchtv', 'maced', 'mand', 'max', 'mbl.not', 'me', 'med', 'med', 'v.b.o', 'med.b.u.f.r', 'med.bufr', 'med.vbo', 'meerv', 'meetbr.w', 'mém.adm', 'mgr', 'mgrs', 'mhd', 'mi.verantw', 'mil', 'mil.bed', 'mil.ger', 'min', 'min', 'aanbev', 'min', 'circ', 'min', 'fin', 'min.j.omz', 'min.just.circ', 'mitt', 'mnd', 'mod', 'mon', 'monde ass', 'mouv.comm', 'mr', 'ms', 'muz', 'mv', 'mva ii inv', 'mva inv', 'n cont', 'n', 'chr', 'n.a', 'n.a.g', 'n.a.v', 'n.b', 'n.c', 'n.chr', 'n.d', 'n.d.r', 'n.e.a', 'n.g', 'n.h.b.c', 'n.j', 'n.j.b', 'n.j.w', 'n.l', 'n.m', 'n.m.m', 'n.n', 'n.n.b', 'n.n.g', 'n.n.k', 'n.o.m', 'n.o.t.k', 'n.rapp', 'n.tijd.pol', 'n.v', 'n.v.d.r', 'n.v.d.v', 'n.v.o.b', 'n.v.t', 'nat.besch.w', 'nat.omb', 'nat.pers', 'ned.cult.r', 'neg.verkl', 'nhd', 'nieuw arch', 'wisk', 'njcm-bull', 'nl', 'nnd', 'no', 'not.fisc.m', 'not.w', 'not.wet', 'nr', 'nrs', 'nste', 'nt', 'numism', 'o', 'o.a', 'o.b', 'o.c', 'o.g', 'o.g.v', 'o.i', 'o.i.d', 'o.m', 'o.o', 'o.o.d', 'o.o.v', 'o.p', 'o.r', 'o.regl', 'o.s', 'o.t.s', 'o.t.t', 'o.t.t.t', 'o.t.t.z', 'o.tk.t', 'o.v.t', 'o.v.t.t', 'o.v.tk.t', 'o.v.v', 'ob', 'obsv', 'octr', 'octr.gem.regl', 'octr.regl', 'oe', 'oecd mod', 'off.pol', 'ofra', 'ohd', 'omb', 'omnia frat', 'omnil', 'omz', 'on.ww', 'onderr', 'onfrank', 'onteig.w', 'ontw', 'b.w', 'onuitg', 'onz', 'oorl.w', 'op.cit', 'opin.pa', 'opm', 'or', 'ord.br', 'ord.gem', 'ors', 'orth', 'os', 'osm', 'ov', 'ov.w.i', 'ov.w.ii', 'ov.ww', 'overg.w', 'overw', 'ovkst', 'ow kadasterw', 'oz', 'p', 'p.& b', 'p.a', 'p.a.o', 'p.b.o', 'p.e', 'p.g', 'p.j', 'p.m', 'p.m.a', 'p.o', 'p.o.j.t', 
'p.p', 'p.v', 'p.v.s', 'pachtw', 'pag', 'pan', 'pand.b', 'pand.pér', 'parl.gesch', 'parl.gesch', 'inv', 'parl.st', 'part.arb', 'pas', 'pasin', 'pat', 'pb.c', 'pb.l', 'pens', 'pensioenverz', 'per.ber.i.b.r', 'per.ber.ibr', 'pers', 'st', 'pft', 'pg wijz.rv', 'pk', 'pktg', 'pli jur', 'plv', 'po', 'pol', 'pol.off', 'pol.r', 'pol.w', 'politie j', 'postbankw', 'postw', 'pp', 'pr', 'preadv', 'pres', 'prf', 'prft', 'prg', 'prijz.w', 'pro jus', 'proc', 'procesregl', 'prof', 'prot', 'prov', 'prov.b', 'prov.instr.h.m.g', 'prov.regl', 'prov.verord', 'prov.w', 'publ', 'publ.cour eur.d.h', 'publ.eur.court h.r', 'pun', 'pw', 'q.b.d', 'q.e.d', 'q.q', 'q.r', 'r', 'r.a.b.g', 'r.a.c.e', 'r.a.j.b', 'r.b.d.c', 'r.b.d.i', 'r.b.s.s', 'r.c', 'r.c.b', 'r.c.d.c', 'r.c.j.b', 'r.c.s.j', 'r.cass', 'r.d.c', 'r.d.i', 'r.d.i.d.c', 'r.d.j.b', 'r.d.j.p', 'r.d.p.c', 'r.d.s', 'r.d.t.i', 'r.e', 'r.f.s.v.p', 'r.g.a.r', 'r.g.c.f', 'r.g.d.c', 'r.g.f', 'r.g.z', 'r.h.a', 'r.i.c', 'r.i.d.a', 'r.i.e.j', 'r.i.n', 'r.i.s.a', 'r.j.d.a', 'r.j.i', 'r.k', 'r.l', 'r.l.g.b', 'r.med', 'r.med.rechtspr', 'r.n.b', 'r.o', 'r.orde apoth', 'r.ov', 'r.p', 'r.p.d.b', 'r.p.o.t', 'r.p.r.j', 'r.p.s', 'r.r.d', 'r.r.s', 'r.s', 'r.s.v.p', 'r.stvb', 'r.t.d.f', 'r.t.d.h', 'r.t.l', 'r.trim.dr.eur', 'r.v.a', 'r.verkb', 'r.w', 'r.w.d', 'rap.ann.c.a', 'rap.ann.c.c', 'rap.ann.c.e', 'rap.ann.c.s.j', 'rap.ann.ca', 'rap.ann.cass', 'rap.ann.cc', 'rap.ann.ce', 'rap.ann.csj', 'rapp', 'rb', 'rb.kh', 'rb.van kh', 'rdn', 'rdnr', 're.pers', 'rec', 'rec.c.i.j', 'rec.c.j.c.e', 'rec.cij', 'rec.cjce', 'rec.cour eur.d.h', 'rec.gén.enr.not', 'rec.lois decr.arr', 'rechtsk.t', 'rechtspl.zeem', 'rechtspr.arb.br', 'rechtspr.b.f.e', 'rechtspr.bfe', 'rechtspr.soc.r.b.l.n', 'recl.reg', 'rect', 'red', 'reg', 'reg.huiz.bew', 'reg.w', 'registr.w', 'regl', 'regl', 'r.v.k', 'regl.besl', 'regl.onderr', 'regl.r.t', 'rep', 'rep.eur.court h.r', 'rép.fisc', 'rép.not', 'rep.r.j', 'rep.rj', 'req', 'res', 'resp', 'rev', 'rev', 'de dr', 'comp', 'rev', 'trim', 'de dr', 'civ', 'rev', 'trim', 'de dr', 'comm', 'rev.acc.trav', 'rev.adm', 'rev.b.compt', 'rev.b.dr.const', 'rev.b.dr.intern', 'rev.b.séc.soc', 'rev.banc.fin', 'rev.comm', 'rev.cons.prud', 'rev.dr.b', 'rev.dr.commun', 'rev.dr.étr', 'rev.dr.fam', 'rev.dr.intern.comp', 'rev.dr.mil', 'rev.dr.min', 'rev.dr.pén', 'rev.dr.pén.mil', 'rev.dr.rur', 'rev.dr.u.l.b', 'rev.dr.ulb', 'rev.exp', 'rev.faill', 'rev.fisc', 'rev.gd', 'rev.hist.dr', 'rev.i.p.c', 'rev.ipc', 'rev.not.b', 'rev.prat.dr.comm', 'rev.prat.not.b', 'rev.prat.soc', 'rev.rec', 'rev.rw', 'rev.trav', 'rev.trim.d.h', 'rev.trim.dr.fam', 'rev.urb', 'richtl', 'riv.dir.int', 'riv.dir.int."le priv', 'riv.dir.int.priv.proc', 'rk', 'rln', 'roln', 'rom', 'rondz', 'rov', 'rtl', 'rubr', 'ruilv.wet', 'rv.verdr', 'rvkb', 's', 's', 'en s', 's.a', 's.b.n', 's.ct', 's.d', 's.e.c', 's.e.et.o', 's.e.w', 's.exec.rept', 's.hrg', 's.j.b', 's.l', 's.l.e.a', 's.l.n.d', 's.p.a', 's.s', 's.t', 's.t.b', 's.v', 's.v.p', 'samenw', 'sc', 'sch', 'scheidsr.uitspr', 'schepel.besl', 'secr.comm', 'secr.gen', 'sect.soc', 'sess', 'cas', 'sir', 'soc', 'best', 'soc', 'handv', 'soc', 'verz', 'soc.act', 'soc.best', 'soc.kron', 'soc.r', 'soc.sw', 'soc.weg', 'sofi-nr', 'somm', 'somm.ann', 'sp.c.c', 'sr', 'ss', 'st.doc.b.c.n.a.r', 'st.doc.bcnar', 'st.vw', 'stagever', 'stas', 'stat', 'stb', 'stbl', 'stcrt', 'stichting i.v', 'stud.dipl', 'su', 'subs', 'subst', 'succ.w', 'suppl', 'sv', 'sw', 't', 't.a', 't.a.a', 't.a.n', 't.a.p', 't.a.s.n', 't.a.v', 't.a.v.w', 't.aann', 't.acc', 't.agr.r', 't.app', 't.b.b.r', 't.b.h', 't.b.m', 't.b.o', 
't.b.p', 't.b.r', 't.b.s', 't.b.v', 't.bankw', 't.belg.not', 't.desk', 't.e.m', 't.e.p', 't.f.r', 't.fam', 't.fin.r', 't.g.r', 't.g.t', 't.g.v', 't.gem', 't.gez', 't.huur', 't.i.n', 't.in b.z', 't.j.k', 't.l.l', 't.l.v', 't.m', 't.m.r', 't.m.w', 't.mil.r', 't.mil.strafr', 't.not', 't.o', 't.o.r.b', 't.o.v', 't.ontv', 't.orde geneesh', 't.p.r', 't.pol', 't.r', 't.r.d.& i', 't.r.g', 't.r.o.s', 't.r.v', 't.s.r', 't.strafr', 't.t', 't.u', 't.v.c', 't.v.g', 't.v.m.r', 't.v.o', 't.v.v', 't.v.v.d.b', 't.v.w', 't.verz', 't.vred', 't.vreemd', 't.w', 't.w.k', 't.w.v', 't.w.v.r', 't.wrr', 't.z', 't.z.t', 't.z.v', 'taalk', 'tar.burg.z', 'td', 'techn', 'telecomm', 'toel', 'toel.st.v.w', 'toep', 'toep.regl', 'tom', 'top', 'trans.b', 'transp.r', 'trav.com.ét.et lég.not', 'trb', 'trib', 'trib.civ', 'trib.gr.inst', 'ts', 'ts', 'best', 'ts', 'verv', 'turnh.rechtsl', 'tvpol', 'tvpr', 'tvrechtsgesch', 'tw', 'u', 'u.a', 'u.a.r', 'u.a.v', 'u.c', 'u.c.c', 'u.g', 'u.p', 'u.s', 'u.s.d.c', 'uitdr', 'uitl.w', 'uitv.besch.div.b', 'uitv.besl', 'uitv.besl', 'succ.w', 'uitv.besl.bel.rv', 'uitv.besl.l.b', 'uitv.reg', 'inv.w', 'uitv.reg.bel.d', 'uitv.reg.afd.verm', 'uitv.reg.lb', 'uitv.reg.succ.w', 'univ', 'univ.verkl', 'v', 'v', 'chr', 'v.& f', 'v.a', 'v.a.v', 'v.bp prot', 'v.c', 'v.chr', 'v.h', 'v.huw.verm', 'v.i', 'v.i.o', 'v.k.a', 'v.m', 'v.o.f', 'v.o.n', 'v.onderh.verpl', 'v.p', 'v.r', 'v.s.o', 'v.t.t', 'v.t.t.t', 'v.tk.t', 'v.toep.r.vert', 'v.v.b', 'v.v.g', 'v.v.t', 'v.v.t.t', 'v.v.tk.t', 'v.w.b', 'v.z.m', 'vb', 'vb.bo', 'vbb', 'vc', 'vd', 'veldw', 'ver.k', 'ver.verg.gem', 'gem.comm', 'verbr', 'verd', 'verdr', 'verdr.v', 'verdrag benel.i.z', 'tek.mod', 'verenw', 'verg', 'verg.fr.gem', 'comm', 'verkl', 'verkl.herz.gw', 'verl', 'deelw', 'vern', 'verord', 'vers.r', 'versch', 'versl.c.s.w', 'versl.csw', 'vert', 'verw', 'verz', 'verz.w', 'verz.wett.besl', 'verz.wett.decr.besl', 'vgl', 'vid', 'vigiles jb', 'viss.w', 'vl.parl', 'vl.r', 'vl.t.gez', 'vl.w.reg', 'vl.w.succ', 'vlg', 'vn', 'vnl', 'vnw', 'vo', 'vo.bl', 'voegw', 'vol', 'volg', 'volt', 'deelw', 'voorl', 'voorz', 'vord.w', 'vorst.d', 'vr', 'en antw', 'vred', 'vrg', 'vnw', 'vrijgrs', 'vs', 'vt', 'vvsr jb', 'vw', 'vz', 'vzngr', 'vzr', 'w', 'w.a', 'w.b.r', 'w.c.h', 'w.conf.huw', 'w.conf.huwelijksb', 'w.consum.kr', 'w.f.r', 'w.g', 'w.gelijke beh', 'w.gew.r', 'w.ident.pl', 'w.just.doc', 'w.kh', 'w.l.r', 'w.l.v', 'w.mil.straf.spr', 'w.n', 'w.not.ambt', 'w.o', 'w.o.d.huurcomm', 'w.o.d.k', 'w.openb.manif', 'w.parl', 'w.r', 'w.reg', 'w.succ', 'w.u.b', 'w.uitv.pl.verord', 'w.v', 'w.v.k', 'w.v.m.s', 'w.v.r', 'w.v.w', 'w.venn', 'wac', 'wd', 'wet a.b', 'wet bel.rv', 'wet c.a.o', 'wet c.o', 'wet div.bel', 'wet ksbel', 'wet l.v', 'wetb', 'n.v.h', 'wgb', 'winkelt.w', 'wisk', 'wka-verkl', 'wnd', 'won.w', 'woningw', 'woonr.w', 'wrr', 'wrr.ber', 'wrsch', 'ws', 'wsch', 'wsr', 'wtvb', 'ww', 'x.d', 'z cont', 'z.a', 'z.g', 'z.i', 'z.j', 'z.o.z', 'z.p', 'z.s.m', 'zesde richtl', 'zg', 'zgn', 'zn', 'znw', 'zr', 'zr', 'ms', 'zr.ms']).freeze 10 | PREPOSITIVE_ABBREVIATIONS = [].freeze 11 | NUMBER_ABBREVIATIONS = [].freeze 12 | end 13 | 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/pragmatic_segmenter/languages/italian.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module PragmaticSegmenter 4 | module Languages 5 | module Italian 6 | include Languages::Common 7 | 8 | module Abbreviation 9 | ABBREVIATIONS = Set.new(['1°', 'a.c', 'a.c/a', 'a.cam', 
'a.civ', 'a.cor', 'a.d.r', 'a.gov', 'a.mil', 'a.mon', 'a.smv', 'a.v', 'a/a', 'a/c', 'a/i', 'aa', 'aaaa', 'aaal', 'aacst', 'aamct', 'aams', 'aar', 'aato', 'ab', 'abbigl', 'abbrev', 'abc', 'abi', 'abl', 'abm', 'abr', 'abs', 'absp', 'ac', 'acam', 'acb', 'acbi', 'acc', 'accorc', 'accr', 'acd', 'ace', 'acec', 'acep', 'aci', 'acli', 'acp', 'acro', 'acsit', 'actl', 'ad', 'ad.mil', 'ada', 'adap', 'adatt', 'adc', 'add', 'adei', 'adeion', 'adhd', 'adi', 'adisco', 'adj', 'adm', 'adp', 'adr', 'ads', 'adsi', 'adsl', 'adv', 'ae.b', 'aefi', 'aer', 'aerodin', 'aeron', 'afa', 'afc', 'afci', 'affl', 'afi', 'afic', 'afm', 'afp', 'ag', 'agcm', 'agcom', 'age', 'agecs', 'agesci', 'agg', 'agip', 'agis', 'agm', 'ago', 'agr', 'agric', 'agt', 'ai', 'aia', 'aiab', 'aiac', 'aiace', 'aiap', 'aias', 'aiat', 'aib', 'aic', 'aica', 'aicel', 'aici', 'aics', 'aid', 'aida', 'aidaa', 'aidac', 'aidama', 'aidda', 'aidim', 'aido', 'aids', 'aies', 'aif', 'aih', 'aiip', 'aimi', 'aip', 'aipsc', 'airi', 'ais', 'aisa', 'aism', 'aiss', 'aissca', 'aitc', 'aiti', 'aitr', 'aits', 'aka', 'al', 'alai', 'alch', 'alg', 'ali', 'alim', 'all', 'allev', 'allus', 'alp', 'alq', 'alt', 'am', 'ama', 'amaci', 'amag', 'amami', 'amc', 'ammec', 'amn', 'ampas', 'amps', 'an', 'ana', 'anaai', 'anac', 'anaci', 'anad', 'anai', 'anaoo', 'anart', 'anat', 'anat. comp', 'ancci', 'anci', 'ancip', 'ancsa', 'andit', 'anec', 'anee', 'anem', 'anes', 'anffas', 'ani', 'ania', 'anica', 'anie', 'animi', 'anis', 'anisc', 'anm', 'anmfit', 'anmig', 'anmil', 'anmli', 'anms', 'anpa', 'anpas', 'anpci', 'anpe', 'anpi', 'ansi', 'ansv', 'ant', 'anta', 'antifr', 'antlo', 'anton', 'antrop', 'anusca', 'anvi', 'anx', 'ao', 'ap', 'apa', 'apd', 'apea', 'apec', 'apet', 'api', 'apos', 'app', 'app.sc', 'apr', 'aps', 'apt', 'aq', 'ar', 'ar.ind', 'ar.rep', 'arald', 'arame', 'arc', 'arch', 'archeol', 'arci', 'ardsu', 'are', 'arg', 'aritm', 'arpa', 'arpat', 'arred', 'arrt', 'arsia', 'art', 'arti min', 'artig', 'artigl', 'artt', 'as', 'asa', 'asae', 'asc', 'asci', 'ascii', 'ascom', 'ascop', 'asd', 'ase', 'asf', 'asfer', 'asg', 'asic', 'asifa', 'asl', 'asmdc', 'asmi', 'asp', 'aspic', 'aspp', 'assi', 'assic', 'assol', 'asst', 'aster', 'astr', 'astrol', 'astron', 'at', 'ata', 'atb', 'atic', 'atm', 'ats', 'att', 'attrav', 'atv', 'au', 'auc', 'aus', 'auser', 'aut', 'autom', 'av', 'avi', 'avis', 'avo', 'avv', 'avvers', 'awb', 'awdp', 'az', 'azh', 'b.a', 'b2b', 'b2c', 'ba', 'bafta', 'bal', 'ball', 'ban', 'banc', 'bar', 'bart', 'bas', 'bat', 'batt', 'bban', 'bbc', 'bbl', 'bbs', 'bbtc', 'bcc', 'bce', 'bcf', 'bdf', 'bei', 'bep', 'bers', 'bg', 'bi', 'bibl', 'bic', 'bioch', 'biol', 'bl', 'bld', 'bldg', 'blpc', 'bm', 'bmps', 'bmw', 'bn', 'bna', 'bncf', 'bncrm', 'bni', 'bnl', 'bo', 'bot', 'bpl', 'bpm', 'bpn', 'bpr', 'br', 'brd', 'bre', 'bric', 'brig', 'brig.ca', 'brig.gen', 'bros', 'bs', 'bsc', 'bsp', 'bsu', 'bt', 'btc', 'btg', 'btg.l', 'btr', 'bts', 'bu', 'bur', 'bz', 'c.a', 'c.a.p', 'c.c.p', 'c.cost', 'c.d a', 'c.d', 'c.le', 'c.m', 'c.opv', 'c.p', 'c.s', 'c.v', 'c.v.d', 'c/a', 'c/c', 'c/pag', 'ca', 'ca.rep', 'ca.sm', 'ca.sz', 'ca.uf', 'caaf', 'cab', 'cad', 'cae', 'cai', 'cal', 'cam', 'cap', 'capol', 'capt', 'car', 'car.sc', 'carat', 'card', 'cas', 'casaca', 'casd', 'cass.civ', 'cat', 'caus', 'cav', 'cavg', 'cb', 'cbd', 'cbr', 'cbs', 'cc', 'cca', 'ccap', 'ccda', 'ccdp', 'ccee', 'cciaa', 'ccie', 'ccip', 'cciss', 'ccna', 'ccnl', 'ccnp', 'ccpb', 'ccs', 'ccsp', 'cctld', 'cctv', 'ccv', 'cd', 'cda', 'cdma', 'cdo', 'cdpd', 'cdr', 'cds', 'cdw', 'ce', 'ced', 'cee', 'cei', 'cemat', 'cenelec', 'centr', 'cepis', 
'ceps', 'cept', 'cerit', 'cese', 'cesis', 'cesvot', 'cet', 'cf', 'cfa', 'cfr', 'cg', 'cgi', 'cgil', 'cgs', 'ch', 'chf', 'chim', 'chim. ind', 'chir', 'ci', 'ci-europa', 'ciber', 'cicae', 'cid', 'cie', 'cif', 'cifej', 'cig', 'cigs', 'cii', 'cilea', 'cilo', 'cim', 'cime', 'cin', 'cinit', 'cio', 'cipe', 'cirm', 'cisal', 'ciscs', 'cisd', 'cisl', 'cism', 'citol', 'cl', 'class', 'cli', 'cm', 'cmdr', 'cme', 'cmo', 'cmr', 'cms', 'cmyk', 'cm²', 'cm³', 'cn', 'cna', 'cnb', 'cnc', 'cnel', 'cngei', 'cni', 'cnipa', 'cnit', 'cnn', 'cnr', 'cns', 'cnt', 'cnvvf', 'co', 'co.ing', 'co.sa', 'cobas', 'coc', 'cod', 'cod. civ', 'cod. deont. not', 'cod. pen', 'cod. proc. civ', 'cod. proc. pen', 'codec', 'coi', 'col', 'colf', 'coll', 'com', 'comdr', 'comm', 'comp', 'compar', 'compl', 'con', 'conai', 'conc', 'concl', 'condiz', 'confetra', 'confitarma', 'confr', 'cong', 'congeav', 'congiunt', 'coni', 'coniug', 'consec', 'consob', 'contab', 'contr', 'coreco', 'corp', 'corr', 'correl', 'corrisp', 'cosap', 'cospe', 'cost', 'costr', 'cpc', 'cpdel', 'cpe', 'cpi', 'cpl', 'cpt', 'cpu', 'cr', 'cral', 'credem', 'crf', 'cri', 'cric', 'cristall', 'crm', 'cro', 'cron', 'crsm', 'crt', 'cs', 'csa', 'csai', 'csc', 'csm', 'csn', 'css', 'ct', 'ctc', 'cti', 'ctr', 'ctsis', 'cuc', 'cud', 'cun', 'cup', 'cusi', 'cvb', 'cvbs', 'cwt', 'cz', 'd', 'd.c', 'd.i.a', 'dab', 'dac', 'dam', 'dams', 'dat', 'dau', 'db', 'dbms', 'dc', 'dca', 'dccc', 'dda', 'ddp', 'ddr', 'ddt', 'dea', 'decoraz', 'dect', 'dek', 'denom', 'deriv', 'derm', 'determ', 'df', 'dfp', 'dg', 'dga', 'dhcp', 'di', 'dia', 'dial', 'dic', 'dicomac', 'dif', 'difett', 'dig. iv', 'digos', 'dimin', 'dimostr', 'din', 'dipart', 'diplom', 'dir', 'dir. amm', 'dir. can', 'dir. civ', 'dir. d. lav', 'dir. giur', 'dir. internaz', 'dir. it', 'dir. pen', 'dir. priv', 'dir. proces', 'dir. pub', 'dir. rom', 'disus', 'diy', 'dl', 'dlf', 'dm', 'dme', 'dmf', 'dmo', 'dmoz', 'dm²', 'dm³', 'dnr', 'dns', 'doa', 'doc', 'docg', 'dom', 'dop', 'dos', 'dott', 'dpa', 'dpi', 'dpl', 'dpof', 'dps', 'dpt', 'dr', 'dra', 'drm', 'drs', 'dry pt', 'ds', 'dslam', 'dspn', 'dss', 'dtc', 'dtmf', 'dtp', 'dts', 'dv', 'dvb', 'dvb-t', 'dvd', 'dvi', 'dwdm', 'e.g', 'e.p.c', 'ead', 'eafrd', 'ean', 'eap', 'easw', 'eb', 'eban', 'ebr', 'ebri', 'ebtn', 'ecc', 'eccl', 'ecdl', 'ecfa', 'ecff', 'ecg', 'ecm', 'econ', 'econ. az', 'econ. dom', 'econ. pol', 'ecpnm', 'ed', 'ed agg', 'edge', 'edi', 'edil', 'edit', 'ef', 'efa', 'efcb', 'efp', 'efsa', 'efta', 'eg', 'egiz', 'egl', 'egr', 'ei', 'eisa', 'elab', 'elettr', 'elettron', 'ellitt', 'emap', 'emas', 'embr', 'emdr', 'emi', 'emr', 'en', 'enaip', 'enal', 'enaoli', 'enapi', 'encat', 'enclic', 'enea', 'enel', 'eni', 'enigm', 'enit', 'enol', 'enpa', 'enpaf', 'enpals', 'enpi', 'enpmf', 'ens', 'entom', 'epd', 'epigr', 'epirbs', 'epl', 'epo', 'ept', 'erc', 'ercom', 'ermes', 'erp', 'es', 'esa', 'escl', 'esist', 'eso', 'esp', 'estens', 'estr. 
min', 'etacs', 'etf', 'eti', 'etim', 'etn', 'etol', 'eu', 'eufem', 'eufic', 'eula', 'eva®', 'f.a', 'f.b', 'f.m', 'f.p', 'fa', 'fabi', 'fac', 'facl', 'facs', 'fad', 'fai', 'faile', 'failp', 'failpa', 'faisa', 'falcri', 'fam', 'famar', 'fans', 'fao', 'fapav', 'faq', 'farm', 'fasi', 'fasib', 'fatt', 'fbe', 'fbi', 'fc', 'fco', 'fcp', 'fcr', 'fcu', 'fdi', 'fe', 'feaog', 'feaosc', 'feb', 'fedic', 'fema', 'feoga', 'ferr', 'fesco', 'fesr', 'fess', 'fg', 'fi', 'fiaf', 'fiaip', 'fiais', 'fialtel', 'fiap', 'fiapf', 'fiat', 'fiavet', 'fic', 'ficc', 'fice', 'fidal', 'fidam', 'fidapa', 'fieg', 'fifa', 'fifo', 'fig', 'figc', 'figs', 'filat', 'filcams', 'file', 'filol', 'filos', 'fim', 'fima', 'fimmg', 'fin', 'finco', 'fio', 'fioto', 'fipe', 'fipresci', 'fis', 'fisar', 'fisc', 'fisg', 'fisiol', 'fisiopatol', 'fistel', 'fit', 'fita', 'fitav', 'fits', 'fiv', 'fivet', 'fivl', 'flo', 'flpd', 'fluid pt', 'fm', 'fmcg', 'fmi', 'fmth', 'fnas', 'fnomceo', 'fnsi', 'fob', 'fod', 'folcl', 'fon', 'fop', 'fotogr', 'fp', 'fpc', 'fpld', 'fr', 'fra', 'fs', 'fsc', 'fse', 'fsf', 'fsfi', 'fsh', 'ft', 'ftase', 'ftbcc', 'fte', 'ftp', 'fts', 'ft²', 'ft³', 'fuaav', 'fut', 'fv', 'fvg', 'g.fv', 'g.u', 'g.u.el', 'gal', 'gats', 'gatt', 'gb', 'gc', 'gccc', 'gco', 'gcost', 'gd', 'gdd', 'gdf', 'gdi', 'gdo', 'gdp', 'ge', 'gea', 'gel', 'gen', 'geneal', 'geod', 'geofis', 'geogr', 'geogr. antr', 'geogr. fis', 'geol', 'geom', 'gep', 'germ', 'gescal', 'gg', 'ggv', 'gi', 'gia', 'gides', 'gift', 'gio', 'giorn', 'gis', 'gisma', 'gismo', 'giu', 'gm', 'gmdss', 'gme', 'gmo', 'go', 'gov', 'gp', 'gpl', 'gprs', 'gps', 'gr', 'gr.sel.spec', 'gr.sel.tr', 'gr.sqd', 'gra', 'gram', 'grano', 'grd', 'grtn', 'grv', 'gsa', 'gsm', 'gsm-r', 'gsr', 'gtld', 'gu', 'guce', 'gui', 'gus', 'ha', 'haart', 'haccp', 'hba', 'hcg', 'hcrp', 'hd-dvd', 'hdcp', 'hdi', 'hdml', 'hdtv', 'hepa', 'hfpa', 'hg', 'hifi', 'hiperlan', 'hiv', 'hm', 'hmld', 'hon', 'hosp', 'hpv', 'hr', 'hrh', 'hrm', 'hrt', 'html', 'http', 'hvac', 'hz', 'i.e', 'i.g.m', 'iana', 'iasb', 'iasc', 'iass', 'iat', 'iata', 'iatse', 'iau', 'iban', 'ibid', 'ibm', 'icann', 'icao', 'icbi', 'iccu', 'ice', 'icf', 'ici', 'icm', 'icom', 'icon', 'ics', 'icsi', 'icstis', 'ict', 'icta', 'id', 'iden', 'idl', 'idraul', 'iec', 'iedm', 'ieee', 'ietf', 'ifat', 'ifel', 'ifla', 'ifrs', 'ifto', 'ifts', 'ig', 'igm', 'igmp', 'igp', 'iims', 'iipp', 'ilm', 'ilo', 'ilor', 'ils', 'im', 'imaie', 'imap', 'imc', 'imdb', 'imei', 'imi', 'imms', 'imo', 'imp', 'imper', 'imperf', 'impers', 'imq', 'ims', 'imsi', 'in', 'inail', 'inca', 'incb', 'inci', 'ind', 'ind. agr', 'ind. alim', 'ind. cart', 'ind. chim', 'ind. cuoio', 'ind. estratt', 'ind. graf', 'ind. mecc', 'ind. 
tess', 'indecl', 'indef', 'indeterm', 'indire', 'inea', 'inf', 'infea', 'infm', 'inform', 'ing', 'ingl', 'inmarsat', 'inpdai', 'inpdap', 'inpgi', 'inps', 'inr', 'inran', 'ins', 'insp', 'int', 'inter', 'intr', 'invar', 'invim', 'in²', 'in³', 'ioma', 'iosco', 'ip', 'ipab', 'ipasvi', 'ipi', 'ippc', 'ips', 'iptv', 'iq', 'ira', 'irap', 'ircc', 'ircs', 'irda', 'iref', 'ires', 'iron', 'irpef', 'irpeg', 'irpet', 'irreg', 'is', 'isae', 'isbd', 'isbn', 'isc', 'isdn', 'isee', 'isef', 'isfol', 'isg', 'isi', 'isia', 'ism', 'ismea', 'isnart', 'iso', 'isp', 'ispearmi', 'ispel', 'ispescuole', 'ispesl', 'ispo', 'ispro', 'iss', 'issn', 'istat', 'istol', 'isvap', 'it', 'iti', 'itt', 'ittiol', 'itu', 'iud', 'iugr', 'iulm', 'iva', 'iveco', 'ivg', 'ivr', 'ivs', 'iyhp', 'j', 'jal', 'jit', 'jr', 'jv', 'k', 'kb', 'kee', 'kg', 'kkk', 'klm', 'km', 'km/h', 'kmph', 'kmq', 'km²', 'kr', 'kw', 'kwh', 'l', 'l\'ing', 'l.n', 'l\'avv', 'la', 'lag', 'lan', 'lanc', 'larn', 'laser', 'lat', 'lav', 'lav. femm', 'lav. pubbl', 'laz', 'lb', 'lc', 'lcca', 'lcd', 'le', 'led', 'lett', 'lh', 'li', 'liaf', 'lib', 'lic', 'lic.ord', 'lic.strd', 'licd', 'lice', 'lida', 'lidci', 'liff', 'lifo', 'lig', 'liit', 'lila', 'lilt', 'linfa', 'ling', 'lipu', 'lis', 'lisaac', 'lism', 'lit', 'litab', 'lnp', 'lo', 'loc', 'loc. div', 'lolo', 'lom', 'long', 'lp', 'lrm', 'lrms', 'lsi', 'lsu', 'lt', 'ltd', 'lu', 'lug', 'luiss', 'lun', 'lwt', 'lww', 'm.a', 'm.b', 'm.o', 'm/s', 'ma', 'mac', 'macch', 'mag', 'magg.(maj)', 'magg.gen.(maj.gen.)', 'mai', 'maj', 'mar', 'mar.a', 'mar.ca', 'mar.ord', 'marc', 'mat', 'mater', 'max', 'mb', 'mbac', 'mc', 'mcl', 'mcpc', 'mcs', 'md', 'mdf', 'mdp', 'me', 'mec', 'mecc', 'med', 'mediev', 'mef', 'mer', 'merc', 'merid', 'mesa', 'messrs', 'metall', 'meteor', 'metr', 'metrol', 'mg', 'mgc', 'mgm', 'mi', 'mibac', 'mica', 'microb', 'mifed', 'miglio nautico', 'miglio nautico per ora', 'miglio nautico²', 'miglio²', 'mil', 'mile', 'miles/h', 'milesph', 'min', 'miner', 'mips', 'miptv', 'mit', 'mitol', 'miur', 'ml', 'mlle', 'mls', 'mm', 'mme', 'mms', 'mm²', 'mn', 'mnp', 'mo', 'mod', 'mol', 'mons', 'morf', 'mos', 'mpaa', 'mpd', 'mpeg', 'mpi', 'mps', 'mq', 'mr', 'mrs', 'ms', 'msgr', 'mss', 'mt', 'mto', 'murst', 'mus', 'mvds', 'mws', 'm²', 'm³', 'n.a', 'n.b', 'na', 'naa', 'nafta', 'napt', 'nars', 'nasa', 'nat', 'natas', 'nato', 'nb', 'nba', 'nbc', 'ncts', 'nd', 'nda', 'nde', 'ndr', 'ndt', 'ne', 'ned', 'neg', 'neol', 'netpac', 'neur', 'news!', 'ngcc', 'nhmf', 'nlcc', 'nmr', 'no', 'nodo', 'nom', 'nos', 'nov', 'novissdi', 'npi', 'nr', 'nt', 'nta', 'nts', 'ntsc', 'nu', 'nuct', 'numism', 'nwt', 'nyc', 'nz', 'o.m.i', 'oai-pmh', 'oav', 'oc', 'occ', 'occult', 'oci', 'ocr', 'ocse', 'oculist', 'od', 'odg', 'odp', 'oecd', 'oem', 'ofdm', 'oft', 'og', 'ogg', 'ogi', 'ogm', 'ohim', 'oic', 'oics', 'olaf', 'oland', 'ole', 'oled', 'omi', 'oms', 'on', 'ong', 'onig', 'onlus', 'onomat', 'onpi', 'onu', 'op', 'opac', 'opec', 'opord', 'opsosa', 'or', 'ord', 'ord. 
scol', 'ore', 'oref', 'orient', 'ornit', 'orogr', 'orp', 'ort', 'os', 'osa', 'osas', 'osd', 'ot', 'ote', 'ott', 'oz', 'p', 'p.a', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.m', 'p.r', 'p.s', 'p.t', 'p.v', 'pa', 'pac', 'pag./p', 'pagg./pp', 'pai', 'pal', 'paleobot', 'paleogr', 'paleont', 'paleozool', 'paletn', 'pamr', 'pan', 'papir', 'par', 'parapsicol', 'part', 'partic', 'pass', 'pat', 'patol', 'pb', 'pc', 'pci', 'pcm', 'pcmcia', 'pcs', 'pcss', 'pct', 'pd', 'pda', 'pdf', 'pdl', 'pds', 'pe', 'pec', 'ped', 'pedag', 'peg', 'pegg', 'per.ind', 'pers', 'pert', 'pesq', 'pet', 'petr', 'petrogr', 'pfc', 'pg', 'pga', 'pgp', 'pgut', 'ph', 'php', 'pi', 'pics', 'pie', 'pif', 'pii', 'pil', 'pime', 'pin', 'pine', 'pip', 'pir', 'pit', 'pitt', 'piuss', 'pkcs', 'pki', 'pko', 'pl', 'pli', 'plr', 'pm', 'pma', 'pmi', 'pmr', 'pn', 'pnf', 'pnl', 'po', 'poet', 'pof', 'pol', 'pop', 'popitt', 'popol', 'port', 'pos', 'poss', 'post', 'pots', 'pp', 'ppa', 'ppc', 'ppga', 'ppp', 'pps', 'pptt', 'ppv', 'pr', 'pra', 'praa', 'pref', 'preist', 'prep', 'pres', 'pret', 'prg', 'pri', 'priv', 'pro.civ', 'prof', 'pron', 'pronom', 'propr', 'prov', 'prs', 'prtl', 'prusst', 'ps', 'pse', 'psi', 'psicoan', 'psicol', 'pso', 'psp', 'pstn', 'pt', 'ptc', 'pti', 'ptsd', 'ptt', 'pu', 'pug', 'puk', 'put', 'pv', 'pvb', 'pvc', 'pvt', 'pz', 'qb', 'qcs', 'qfd', 'qg', 'qi', 'qlco', 'qlcu', 'qos', 'qualif', 'r-lan', 'r.s', 'ra', 'racc', 'radar', 'radc', 'radiotecn', 'raee', 'raf', 'rag', 'raid', 'ram', 'rar', 'ras', 'rass. avv. stato', 'rc', 'rca', 'rcdp', 'rcs', 'rdc', 'rdco', 'rdf', 'rdi', 'rdp', 'rds', 'rdt', 're', 'rea', 'recipr', 'recl', 'reg', 'region', 'rel', 'rem', 'rep', 'reps', 'res', 'retor', 'rev', 'rfi', 'rfid', 'rg', 'rgb', 'rgc', 'rge', 'rgi', 'rgi bdp', 'rgpt', 'rgt', 'ri', 'riaa', 'riaj', 'riba', 'ric', 'rid', 'rif', 'rifl', 'rina', 'rip', 'ris', 'rit', 'ritts', 'rm', 'rmn', 'rn', 'ro', 'roa', 'roc', 'roi', 'rom', 'roro', 'rov', 'rp', 'rpm', 'rr', 'rrf', 'rs', 'rsc', 'rspp', 'rss', 'rsu', 'rsvp', 'rt', 'rtdpc', 'rtg', 'rtn', 'rtp', 'rttt', 'rvm', 's-dab', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 's.ten', 's.v', 's/m', 'sa', 'sab', 'saca', 'sace', 'sact', 'sad', 'sag', 'sahm', 'sai', 'saisa', 'sam', 'san', 'sanas', 'sape', 'sar', 'sars', 'sart', 'sas', 'sbaf', 'sbas', 'sbn', 'sc', 'sca.sm', 'scherz', 'scien', 'scn', 'scsi', 'scuba', 'scult', 'scut', 'sdds', 'sdiaf', 'sds', 'sdsl', 'se', 'seat', 'sebc', 'sec', 'seca', 'secam', 'secc', 'see', 'seg', 'segg', 'segredifesa', 'sem', 'sempo', 'sen', 'sens', 'seo', 'serg', 'serg.magg.(sgm)', 'serg.magg.ca', 'set', 'sfc', 'sfis', 'sfx', 'sg', 'sga', 'sgc', 'sgg', 'sgml', 'sgt', 'si', 'si@lt', 'sia', 'siae', 'siaic', 'siap', 'sias', 'sic', 'sicav', 'sid', 'sido', 'sie', 'sif', 'sig', 'sig.na', 'sig.ra', 'sige', 'sigg', 'sigill', 'sigo', 'siia', 'simb', 'simbdea', 'simg', 'simo', 'sin', 'sinalv', 'sing', 'sins', 'sinu', 'siocmf', 'siog', 'sioi', 'siommms', 'siot', 'sip', 'sipem', 'sips', 'sirf', 'sirm', 'sis', 'sisde', 'sismi', 'sissa', 'sit', 'siulp', 'siusa', 'sla', 'sldn', 'slm', 'slr', 'sm', 'sma', 'smau', 'smd', 'sme', 'smes', 'smm', 'smpt', 'sms', 'sn', 'snad', 'snai', 'snc', 'sncci', 'sncf', 'sngci', 'snit', 'so', 'soc', 'sociol', 'sogg', 'soho', 'soi', 'sol', 'somipar', 'somm', 'sonar', 'sp', 'spa', 'spe', 'spett', 'spi', 'spm', 'spot', 'spp', 'spreg', 'sq', 'sqd', 'sr', 'srd', 'srl', 'srr', 'ss', 'ssi', 'ssn', 'ssr', 'sss', 'st', 'st. d. arte', 'st. d. dir', 'st. d. filos', 'st. d. 
rel', 'stat', 'stg', 'stp', 'stw', 'su', 'suap', 'suem', 'suff', 'sup', 'superl', 'supt', 'surg', 'surl', 'susm', 'sut', 'suv', 'sv', 'svga', 'swics', 'swift', 'swot', 'sxga', 'sz', 't-dab', 't.sg', 'ta', 'taa', 'tac', 'tacan', 'tacs', 'taeg', 'tai', 'tan', 'tar', 'targa', 'tav', 'tb', 'tbt', 'tci', 'tcp', 'tcp/ip', 'tcsm', 'tdm', 'tdma', 'te', 'tecn', 'tecnol', 'ted', 'tel', 'telecom', 'temp', 'ten.(lt)', 'ten.col.(ltc)', 'ten.gen', 'teol', 'term', 'tesa', 'tese', 'tesol', 'tess', 'tet', 'tetra', 'tfr', 'tft', 'tfts', 'tgv', 'thx', 'tim', 'tipogr', 'tir', 'tit', 'tld', 'tm', 'tmc', 'tn', 'to', 'toefl', 'ton', 'top', 'topog', 'tos', 'tosap', 'tosc', 'tp', 'tpl', 'tr', 'trad', 'tramat', 'trasp', 'ts', 'tso', 'tuir', 'tuld', 'tv', 'twa', 'twain', 'u.ad', 'u.s', 'ucai', 'ucca', 'ucei', 'ucina', 'uclaf', 'ucoi', 'ucoii', 'ucsi', 'ud', 'udc', 'udi', 'udp', 'ue', 'uefa', 'uemri', 'ufo', 'ugc', 'uhci', 'uhf', 'uht', 'uibm', 'uic', 'uicc', 'uiga', 'uil', 'uilps', 'uisp', 'uits', 'uk', 'ul', 'ull', 'uma', 'umb', 'ummc', 'umss', 'umts', 'unac', 'unar', 'unasp', 'uncem', 'unctad', 'undp', 'unefa', 'unep', 'unesco', 'ungh', 'unhcr', 'uni', 'unicef', 'unitec', 'unpredep', 'unsa', 'upa', 'upc', 'urar', 'urban', 'url', 'urp', 'urss', 'usa', 'usb', 'usfi', 'usga', 'usl', 'usp', 'uspi', 'ussr', 'utap', 'v', 'v.brig', 'v.cte', 'v.m', 'v.p', 'v.r', 'v.s', 'va', 'vab', 'vaio', 'val', 'vas', 'vb', 'vbr', 'vc', 'vcc', 'vcr', 'vda', 've', 'ven', 'ves', 'vesa', 'veter', 'vezz', 'vfb', 'vfp', 'vfx', 'vga', 'vhf', 'vhs', 'vi', 'via', 'vip', 'vis', 'vn', 'vo', 'voc', 'voip', 'vol', 'volg', 'voll', 'vor', 'vpdn', 'vpn', 'vr', 'vs', 'vsp', 'vt', 'vtc', 'vts', 'vtt', 'vv', 'vvf', 'wai', 'wais', 'wan', 'wap', 'wasp', 'wc', 'wcdma', 'wcm', 'wga', 'wi-fi', 'wipo', 'wisp', 'wll', 'wml', 'wms', 'worm', 'wp', 'wpan', 'wssn', 'wto', 'wwan', 'wwf', 'www', 'wygiwys', 'xl', 'xml', 'xs', 'xxl', 'xxs', 'yaf', 'yb', 'yci', 'yd', 'yd²', 'yd³', 'ymca', 'zat', 'zb', 'zcs', 'zdf', 'zdg', 'zift', 'zool', 'zoot', 'ztc', 'ztl', '°c', '°f', '°n', '°ra', '°ré', 'µg']).freeze 10 | PREPOSITIVE_ABBREVIATIONS = Set.new(['a.c', 'acc', 'adj', 'adm', 'adv', 'all', 'amn', 'arch', 'asst', 'avv', 'banc', 'bart', 'bcc', 'bldg', 'brig', 'bros', 'c.a', 'c.a.p', 'c.c.p', 'c.m', 'c.p', 'c.p', 'c.s', 'c.v', 'capt', 'cc', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'corr', 'cpl', 'dir', 'dott', 'dott', 'dr', 'dr', 'drs', 'e.p.c', 'ecc', 'egr', 'ens', 'es', 'fatt', 'gen', 'geom', 'gg', 'gov', 'hon', 'hosp', 'hr', 'id', 'ing', 'insp', 'int', "l'avv", "l'ing", 'lett', 'lt', 'maj', 'messrs', 'mlle', 'mm', 'mme', 'mo', 'mons', 'mr', 'mr', 'mrs', 'mrs', 'ms', 'ms', 'msgr', 'n.b', 'ogg', 'on', 'op', 'ord', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.r', 'p.s', 'p.t', 'p.v', 'pfc', 'ph', 'post', 'pp', 'prof', 'psicol', 'pvt', 'racc', 'rag', 'rep', 'reps', 'res', 'rev', 'ric', 'rif', 'rp', 'rsvp', 'rt', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 'seg', 'sen', 'sens', 'sfc', 'sgg', 'sgt', 'sig', 'sigg', 'soc', 'spett', 'sr', 'ss', 'st', 'supt', 'surg', 'tel', 'u.s', 'v.p', 'v.r', 'v.s']).freeze 11 | NUMBER_ABBREVIATIONS = Set.new(['art', 'no', 'nos', 'nr', 'pp']).freeze 12 | end 13 | 14 | class AbbreviationReplacer < AbbreviationReplacer 15 | SENTENCE_STARTERS = [].freeze 16 | end 17 | end 18 | end 19 | end 20 | --------------------------------------------------------------------------------
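The Dutch and Italian modules above both include Languages::Common and define large ABBREVIATIONS sets; Dutch leaves PREPOSITIVE_ABBREVIATIONS and NUMBER_ABBREVIATIONS empty, while Italian populates them and also redefines AbbreviationReplacer with an empty, frozen SENTENCE_STARTERS list, which appears to switch off the sentence-starter heuristic while keeping the shared interface. A minimal usage sketch follows; it is not part of the gem's source. It drives these modules through the public Segmenter API exactly as the language specs above do, assumes the language lookup maps Dutch and Italian to the ISO codes 'nl' and 'it', and uses sample sentences that are illustrative only.

require 'pragmatic_segmenter'

# Dutch ('nl' assumed as the lookup key): 'bijv' is listed in the Dutch
# ABBREVIATIONS set above, so the period after "bijv." should not be treated
# as a sentence boundary, while the one after "afkorting" should.
nl = PragmaticSegmenter::Segmenter.new(
  text: "Dit is bijv. een afkorting. De tweede zin begint hier.",
  language: 'nl'
)
p nl.segment
# should yield something like:
# ["Dit is bijv. een afkorting.", "De tweede zin begint hier."]

# Italian ('it' assumed as the lookup key): 'dott' appears both in
# ABBREVIATIONS and in PREPOSITIVE_ABBREVIATIONS, so "dott. Rossi" should
# stay inside a single segment.
it_seg = PragmaticSegmenter::Segmenter.new(
  text: "Il dott. Rossi arriva domani. Porta i risultati.",
  language: 'it'
)
p it_seg.segment
# should yield something like:
# ["Il dott. Rossi arriva domani.", "Porta i risultati."]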