├── .rspec
├── lib
│   ├── unicode.rb
│   ├── pragmatic_segmenter.rb
│   └── pragmatic_segmenter
│       ├── version.rb
│       ├── types.rb
│       ├── languages
│       │   ├── armenian.rb
│       │   ├── greek.rb
│       │   ├── urdu.rb
│       │   ├── amharic.rb
│       │   ├── burmese.rb
│       │   ├── hindi.rb
│       │   ├── persian.rb
│       │   ├── english.rb
│       │   ├── bulgarian.rb
│       │   ├── french.rb
│       │   ├── russian.rb
│       │   ├── arabic.rb
│       │   ├── common
│       │   │   ├── ellipsis.rb
│       │   │   └── numbers.rb
│       │   ├── chinese.rb
│       │   ├── polish.rb
│       │   ├── japanese.rb
│       │   ├── spanish.rb
│       │   ├── kazakh.rb
│       │   ├── deutsch.rb
│       │   ├── common.rb
│       │   ├── danish.rb
│       │   ├── dutch.rb
│       │   └── italian.rb
│       ├── exclamation_words.rb
│       ├── segmenter.rb
│       ├── languages.rb
│       ├── punctuation_replacer.rb
│       ├── cleaner
│       │   └── rules.rb
│       ├── between_punctuation.rb
│       ├── cleaner.rb
│       ├── processor.rb
│       ├── abbreviation_replacer.rb
│       └── list.rb
├── spec
│   ├── spec_helper.rb
│   ├── pragmatic_segmenter
│   │   ├── languages
│   │   │   ├── polish_spec.rb
│   │   │   ├── burmese_spec.rb
│   │   │   ├── amharic_spec.rb
│   │   │   ├── chinese_spec.rb
│   │   │   ├── urdu_spec.rb
│   │   │   ├── persian_spec.rb
│   │   │   ├── greek_spec.rb
│   │   │   ├── hindi_spec.rb
│   │   │   ├── dutch_spec.rb
│   │   │   ├── bulgarian_spec.rb
│   │   │   ├── french_spec.rb
│   │   │   ├── kazakh_spec.rb
│   │   │   ├── japanese_spec.rb
│   │   │   ├── arabic_spec.rb
│   │   │   ├── italian_spec.rb
│   │   │   ├── russian_spec.rb
│   │   │   ├── armenian_spec.rb
│   │   │   └── spanish_spec.rb
│   │   └── languages_spec.rb
│   ├── pragmatic_segmenter_spec.rb
│   └── performance_spec.rb
├── Rakefile
├── Gemfile
├── .gitignore
├── .travis.yml
├── LICENSE.txt
├── pragmatic_segmenter.gemspec
├── CODE_OF_CONDUCT.md
└── NEWS
/.rspec:
--------------------------------------------------------------------------------
1 | --color
--------------------------------------------------------------------------------
/lib/unicode.rb:
--------------------------------------------------------------------------------
1 | module Unicode
2 | def self.downcase(text)
3 | text.downcase
4 | end
5 | end
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | require 'simplecov'
2 | SimpleCov.start
3 | require 'pragmatic_segmenter'
4 |
5 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter.rb:
--------------------------------------------------------------------------------
1 | require "set"
2 | require "pragmatic_segmenter/version"
3 | require "pragmatic_segmenter/segmenter"
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/version.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | VERSION = "0.3.24"
5 | end
6 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'bundler/gem_tasks'
2 | require 'rspec/core/rake_task'
3 |
4 | RSpec::Core::RakeTask.new(:spec)
5 | task :default => :spec
6 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | group :test do
3 | gem 'simplecov'
4 | gem 'codeclimate-test-reporter'
5 | end
6 | # Specify your gem's dependencies in pragmatic_segmenter.gemspec
7 | gemspec
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.bundle/
2 | /.yardoc
3 | /Gemfile.lock
4 | /_yardoc/
5 | /coverage/
6 | /doc/
7 | /pkg/
8 | /spec/reports/
9 | /tmp/
10 | *.bundle
11 | *.so
12 | *.o
13 | *.a
14 | mkmf.log
15 | .DS_Store
16 | .vscode/launch.json
17 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/types.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | class Rule < Struct.new(:pattern, :replacement)
5 | class << self
6 | def apply(str, *rules)
7 | rules.flatten.each do |rule|
8 | str.gsub!(rule.pattern, rule.replacement)
9 | end
10 | str
11 | end
12 | end
13 | end
14 | end
15 |
--------------------------------------------------------------------------------
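Usage sketch (illustrative, not a repository file): a Rule pairs a pattern with its replacement, and Rule.apply runs a set of rules destructively over a string. The rule name and sample text below are made up for illustration.

    require 'pragmatic_segmenter'

    # Hypothetical rule: swap a three-dot ellipsis for a placeholder character.
    EllipsisToMarker = PragmaticSegmenter::Rule.new(/\.\.\./, 'ƪ')

    str = "Wait... what happened?".dup   # dup because Rule.apply mutates via gsub!
    PragmaticSegmenter::Rule.apply(str, EllipsisToMarker)
    # => "Waitƪ what happened?"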
/spec/pragmatic_segmenter/languages/polish_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Polish, '(pl)' do
4 |
5 | describe '#segment' do
6 | it 'correctly segments text #001' do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "To słowo bałt. jestskrótem.", language: 'pl')
8 | expect(ps.segment).to eq(["To słowo bałt. jestskrótem."])
9 | end
10 | end
11 | end
12 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/armenian.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Armenian
6 | include Languages::Common
7 |
8 | SENTENCE_BOUNDARY_REGEX = /.*?[։՜:]|.*?$/
9 | Punctuations = ['։', '՜', ':'].freeze
10 |
11 | class AbbreviationReplacer < AbbreviationReplacer
12 | SENTENCE_STARTERS = [].freeze
13 | end
14 | end
15 | end
16 | end
17 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/greek.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Greek
6 | include Languages::Common
7 |
8 | SENTENCE_BOUNDARY_REGEX = /.*?[\.;!\?]|.*?$/
9 | Punctuations = ['.', '!', ';', '?'].freeze
10 |
11 | class AbbreviationReplacer < AbbreviationReplacer
12 | SENTENCE_STARTERS = [].freeze
13 | end
14 | end
15 | end
16 | end
17 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/urdu.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Urdu
6 | include Languages::Common
7 |
8 | SENTENCE_BOUNDARY_REGEX = /.*?[۔؟!\?]|.*?$/
9 | Punctuations = ['?', '!', '۔', '؟'].freeze
10 |
11 | class AbbreviationReplacer < AbbreviationReplacer
12 | SENTENCE_STARTERS = [].freeze
13 | end
14 | end
15 | end
16 | end
17 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/amharic.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Amharic
6 | include Languages::Common
7 |
8 | SENTENCE_BOUNDARY_REGEX = /.*?[፧።!\?]|.*?$/
9 | Punctuations = ['።', '፧', '?', '!'].freeze
10 |
11 | class AbbreviationReplacer < AbbreviationReplacer
12 | SENTENCE_STARTERS = [].freeze
13 | end
14 | end
15 | end
16 | end
17 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/burmese.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Burmese
6 | include Languages::Common
7 |
8 | SENTENCE_BOUNDARY_REGEX = /.*?[။၏!\?]|.*?$/
9 | Punctuations = ['။', '၏', '?', '!'].freeze
10 |
11 | class AbbreviationReplacer < AbbreviationReplacer
12 | SENTENCE_STARTERS = [].freeze
13 | end
14 | end
15 | end
16 | end
17 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/hindi.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Hindi
6 | include Languages::Common
7 |
8 | SENTENCE_BOUNDARY_REGEX = /.*?[।\|!\?]|.*?$/
9 | Punctuations = ['।', '|', '.', '!', '?'].freeze
10 |
11 | class AbbreviationReplacer < AbbreviationReplacer
12 | SENTENCE_STARTERS = [].freeze
13 | end
14 | end
15 | end
16 | end
17 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | rvm:
3 | - "2.1.5"
4 | - "2.2.0"
5 | - "2.2.4"
6 | - "2.3.0"
7 | - "2.3.1"
8 | # uncomment this line if your project needs to run something other than `rake`:
9 | # script: bundle exec rspec spec
10 | addons:
11 | code_climate:
12 | repo_token:
13 | secure: "TDtg1SY+50yvYL8nRhf3YG2xtyS4b7wdJddGL7BRvYHkn5jhmGAXRU9F9+IRyPLPlwwd/VX2zxClmU4hr3DAbb7C/JUscNmVUcDeiwlMOIEUIjKXT+f+TFkLLjTsXjivdX7T9oD/pzHUHB5SjqWfWyZKIo2uAiTv6zt4PYvoeUQ="
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/burmese_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Burmese, '(my)' do
4 |
5 | context "Golden Rules" do
6 | it "Sentence ending punctuation #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။", language: 'my')
8 | expect(ps.segment).to eq(["ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။", "၇ွင္ေနေကာင္းလား။"])
9 | end
10 | end
11 |
12 | describe '#segment' do
13 | it 'correctly segments text #001' do
14 | ps = PragmaticSegmenter::Segmenter.new(text: "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။", language: 'my')
15 | expect(ps.segment).to eq(["ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။", "၇ွင္ေနေကာင္းလား။"])
16 | end
17 | end
18 | end
19 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/amharic_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Amharic, '(am)' do
4 |
5 | context "Golden Rules" do
6 | it "Sentence ending punctuation #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።", language: 'am')
8 | expect(ps.segment).to eq(["እንደምን አለህ፧", "መልካም ቀን ይሁንልህ።", "እባክሽ ያልሽዉን ድገሚልኝ።"])
9 | end
10 | end
11 |
12 | describe '#segment' do
13 | it 'correctly segments text #001' do
14 | ps = PragmaticSegmenter::Segmenter.new(text: "እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።", language: 'am')
15 | expect(ps.segment).to eq(["እንደምን አለህ፧", "መልካም ቀን ይሁንልህ።", "እባክሽ ያልሽዉን ድገሚልኝ።"])
16 | end
17 | end
18 | end
19 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/chinese_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Chinese, '(zh)' do
4 |
5 | describe '#segment' do
6 | it 'correctly segments text #001' do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。", language: 'zh')
8 | expect(ps.segment).to eq(["安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。", "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"])
9 | end
10 |
11 | it 'correctly segments text #002' do
12 | ps = PragmaticSegmenter::Segmenter.new(text: "我们明天一起去看《摔跤吧!爸爸》好吗?好!", language: 'zh')
13 | expect(ps.segment).to eq(["我们明天一起去看《摔跤吧!爸爸》好吗?", "好!"])
14 | end
15 | end
16 | end
17 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/urdu_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Urdu, '(ur)' do
4 |
5 | context "Golden Rules" do
6 | it "Sentence ending punctuation #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟", language: 'ur')
8 | expect(ps.segment).to eq(["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"])
9 | end
10 | end
11 |
12 | describe '#segment' do
13 | it 'correctly segments text #001' do
14 | ps = PragmaticSegmenter::Segmenter.new(text: "کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟", language: 'ur')
15 | expect(ps.segment).to eq(["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"])
16 | end
17 | end
18 | end
19 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/persian.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Persian
6 | include Languages::Common
7 |
8 | SENTENCE_BOUNDARY_REGEX = /.*?[:\.!\?؟]|.*?\z|.*?$/
9 | Punctuations = ['?', '!', ':', '.', '؟'].freeze
10 |
11 | ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
12 | ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
13 |
14 | class AbbreviationReplacer < AbbreviationReplacer
15 | SENTENCE_STARTERS = [].freeze
16 |
17 | private
18 |
19 | def scan_for_replacements(txt, am, index, character_array)
20 | txt.gsub!(/(?<=#{am})\./, '∯')
21 | txt
22 | end
23 | end
24 | end
25 | end
26 | end
27 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/exclamation_words.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | # frozen_string_literal: true
3 |
4 | require 'pragmatic_segmenter/punctuation_replacer'
5 |
6 | module PragmaticSegmenter
7 | # This class searches for exclamation points that
8 | # are part of words and not ending punctuation and replaces them.
9 | module ExclamationWords
10 | EXCLAMATION_WORDS = %w[!Xũ !Kung ǃʼOǃKung !Xuun !Kung-Ekoka ǃHu ǃKhung ǃKu ǃung ǃXo ǃXû ǃXung ǃXũ !Xun Yahoo! Y!J Yum!].freeze
11 | REGEXP = Regexp.new(EXCLAMATION_WORDS.map { |string| Regexp.escape(string) }.join('|'))
12 |
13 | def self.apply_rules(text)
14 | PragmaticSegmenter::PunctuationReplacer.new(
15 | matches_array: text.scan(REGEXP),
16 | text: text
17 | ).replace
18 | end
19 | end
20 | end
21 |
--------------------------------------------------------------------------------
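A short usage sketch (illustrative): ExclamationWords.apply_rules masks the '!' inside words such as "Yahoo!" so it is not later treated as sentence-ending punctuation; the text is mutated in place by PunctuationReplacer (shown further down).

    require 'pragmatic_segmenter'
    require 'pragmatic_segmenter/exclamation_words'

    text = "Yahoo! products are used worldwide.".dup
    PragmaticSegmenter::ExclamationWords.apply_rules(text)
    # The '!' inside "Yahoo!" is now a placeholder (e.g. '&ᓴ&'), so it will not
    # be mistaken for a sentence boundary; it is restored to '!' later in the
    # processing pipeline.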
/spec/pragmatic_segmenter/languages/persian_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Persian, '(fa)' do
4 |
5 | context "Golden Rules" do
6 | it "Sentence ending punctuation #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم.", language: 'fa')
8 | expect(ps.segment).to eq(["خوشبختم، آقای رضا.", "شما کجایی هستید؟", "من از تهران هستم."])
9 | end
10 | end
11 |
12 | describe '#segment' do
13 | it 'correctly segments text #001' do
14 | ps = PragmaticSegmenter::Segmenter.new(text: "خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم.", language: 'fa')
15 | expect(ps.segment).to eq(["خوشبختم، آقای رضا.", "شما کجایی هستید؟", "من از تهران هستم."])
16 | end
17 | end
18 | end
19 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/english.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module English
6 | include Languages::Common
7 |
8 | class Cleaner < Cleaner
9 | def clean
10 | super
11 | clean_quotations
12 | end
13 |
14 | private
15 |
16 | def clean_quotations
17 | @text.gsub(/`/, "'")
18 | end
19 |
20 | def abbreviations
21 | [].freeze
22 | end
23 | end
24 |
25 | class AbbreviationReplacer < AbbreviationReplacer
26 | SENTENCE_STARTERS = %w(
27 | A Being Did For He How However I In It Millions More She That The
28 | There They We What When Where Who Why
29 | ).freeze
30 | end
31 | end
32 | end
33 | end
34 |
--------------------------------------------------------------------------------
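For context, a quick English example through the Segmenter (the sample sentence is illustrative): the common abbreviation list keeps "Mr." from ending a sentence, while the capitalized SENTENCE_STARTERS above help confirm real boundaries.

    require 'pragmatic_segmenter'

    PragmaticSegmenter::Segmenter.new(text: "Mr. Smith waved. He left.", language: 'en').segment
    # => ["Mr. Smith waved.", "He left."]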
/spec/pragmatic_segmenter/languages/greek_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Greek, '(el)' do
4 |
5 | context "Golden Rules" do
6 | it "Question mark to end sentence #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "Με συγχωρείτε· πού είναι οι τουαλέτες; Τις Κυριακές δε δούλευε κανένας. το κόστος του σπιτιού ήταν £260.950,00.", language: "el")
8 | expect(ps.segment).to eq(["Με συγχωρείτε· πού είναι οι τουαλέτες;", "Τις Κυριακές δε δούλευε κανένας.", "το κόστος του σπιτιού ήταν £260.950,00."])
9 | end
10 | end
11 |
12 | describe '#segment' do
13 | it 'correctly segments text #001' do
14 | ps = PragmaticSegmenter::Segmenter.new(text: "Με συγχωρείτε· πού είναι οι τουαλέτες; Τις Κυριακές δε δούλευε κανένας. το κόστος του σπιτιού ήταν £260.950,00.", language: 'el')
15 | expect(ps.segment).to eq(["Με συγχωρείτε· πού είναι οι τουαλέτες;", "Τις Κυριακές δε δούλευε κανένας.", "το κόστος του σπιτιού ήταν £260.950,00."])
16 | end
17 | end
18 | end
19 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/hindi_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Hindi, '(hi)' do
4 |
5 | context "Golden Rules" do
6 | it "Full stop #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।", language: "hi")
8 | expect(ps.segment).to eq(["सच्चाई यह है कि इसे कोई नहीं जानता।", "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"])
9 | end
10 | end
11 |
12 | describe '#segment' do
13 | it 'correctly segments text #001' do
14 | ps = PragmaticSegmenter::Segmenter.new(text: "सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।", language: 'hi')
15 | expect(ps.segment).to eq(["सच्चाई यह है कि इसे कोई नहीं जानता।", "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"])
16 | end
17 | end
18 | end
19 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/segmenter.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | # frozen_string_literal: true
3 |
4 | require 'pragmatic_segmenter/languages'
5 |
6 | module PragmaticSegmenter
7 | # This class segments a text into an array of sentences.
8 | class Segmenter
9 | attr_reader :text, :language, :doc_type
10 |
11 | def initialize(text:, language: 'en', doc_type: nil, clean: true)
12 | return unless text
13 | @language = language
14 | @language_module = Languages.get_language_by_code(language)
15 | @doc_type = doc_type
16 |
17 | if clean
18 | @text = cleaner.new(text: text, doc_type: @doc_type, language: @language_module).clean
19 | else
20 | @text = text
21 | end
22 | end
23 |
24 | def segment
25 | return [] unless @text
26 | processor.new(language: @language_module).process(text: @text)
27 | end
28 |
29 | private
30 |
31 | def processor
32 | @language_module::Processor
33 | rescue
34 | Processor
35 | end
36 |
37 | def cleaner
38 | @language_module::Cleaner
39 | rescue
40 | Cleaner
41 | end
42 | end
43 | end
44 |
--------------------------------------------------------------------------------
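The calls below mirror pragmatic_segmenter_spec.rb further down: Segmenter takes the text plus an optional language code, doc_type, and clean flag, and #segment returns an array of sentences.

    require 'pragmatic_segmenter'

    PragmaticSegmenter::Segmenter.new(text: "Hello world. Hello.").segment
    # => ["Hello world.", "Hello."]

    # Disable the cleaning pass to keep the original line break as a boundary:
    PragmaticSegmenter::Segmenter.new(text: "It was a cold \nnight in the city.", language: "en", clean: false).segment
    # => ["It was a cold", "night in the city."]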
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Kevin S. Dias
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/bulgarian.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Bulgarian
6 | include Languages::Common
7 |
8 | module Abbreviation
9 | ABBREVIATIONS = Set.new(["p.s", "акад", "ал", "б.р", "б.ред", "бел.а", "бел.пр", "бр", "бул", "в", "вж", "вкл", "вм", "вр", "г", "ген", "гр", "дж", "дм", "доц", "др", "ем", "заб", "зам", "инж", "к.с", "кв", "кв.м", "кг", "км", "кор", "куб", "куб.м", "л", "лв", "м", "м.г", "мин", "млн", "млрд", "мм", "н.с", "напр", "пл", "полк", "проф", "р", "рис", "с", "св", "сек", "см", "сп", "срв", "ст", "стр", "т", "т.г", "т.е", "т.н", "т.нар", "табл", "тел", "у", "ул", "фиг", "ха", "хил", "ч", "чл", "щ.д"]).freeze
10 | NUMBER_ABBREVIATIONS = [].freeze
11 | PREPOSITIVE_ABBREVIATIONS = [].freeze
12 | end
13 |
14 | class AbbreviationReplacer < AbbreviationReplacer
15 | SENTENCE_STARTERS = [].freeze
16 |
17 | private
18 | def replace_period_of_abbr(txt, abbr)
19 | txt.gsub!(/(?<=\s#{abbr.strip})\.|(?<=^#{abbr.strip})\./, '∯')
20 | txt
21 | end
22 | end
23 | end
24 | end
25 | end
26 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/dutch_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Dutch, '(nl)' do
4 |
5 | context "Golden Rules" do
6 | it "Sentence starting with a number #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen. 81 procent van de schoten was raak.", language: 'nl')
8 | expect(ps.segment).to eq(["Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen.", "81 procent van de schoten was raak."])
9 | end
10 |
11 | it "Sentence starting with an ellipsis #002" do
12 | ps = PragmaticSegmenter::Segmenter.new(text: "81 procent van de schoten was raak. ...en toen barste de hel los.", language: 'nl')
13 | expect(ps.segment).to eq(["81 procent van de schoten was raak.", "...en toen barste de hel los."])
14 | end
15 | end
16 |
17 | describe '#segment' do
18 | it 'correctly segments text #001' do
19 | ps = PragmaticSegmenter::Segmenter.new(text: "Afkorting aanw. vnw.", language: 'nl')
20 | expect(ps.segment).to eq(["Afkorting aanw. vnw."])
21 | end
22 | end
23 | end
24 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | describe PragmaticSegmenter::Languages do
4 | describe '.get_language_by_code' do
5 | context "when language code defined" do
6 | PragmaticSegmenter::Languages::LANGUAGE_CODES.each do |code, lang|
7 | it "returns '#{lang}' for '#{code}'" do
8 | expect(described_class.get_language_by_code(code)).to eql(lang)
9 | end
10 | end
11 | end
12 |
13 | context "when language code not defined" do
14 | it "returns 'PragmaticSegmenter::Languages::Common'" do
15 | expect(described_class.get_language_by_code('xxyyzz')).to eql(PragmaticSegmenter::Languages::Common)
16 | end
17 | end
18 |
19 | context "when language code empty string" do
20 | it "returns 'PragmaticSegmenter::Languages::Common'" do
21 | expect(described_class.get_language_by_code('')).to eql(PragmaticSegmenter::Languages::Common)
22 | end
23 | end
24 |
25 | context "when language code nil" do
26 | it "returns 'PragmaticSegmenter::Languages::Common'" do
27 | expect(described_class.get_language_by_code(nil)).to eql(PragmaticSegmenter::Languages::Common)
28 | end
29 | end
30 | end
31 | end
32 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/french.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module French
6 | include Languages::Common
7 |
8 | module Abbreviation
9 | ABBREVIATIONS = Set.new(['a.c.n', 'a.m', 'al', 'ann', 'apr', 'art', 'auj', 'av', 'b.p', 'boul', 'c.-à-d', 'c.n', 'c.n.s', 'c.p.i', 'c.q.f.d', 'c.s', 'ca', 'cf', 'ch.-l', 'chap', 'co', 'co', 'contr', 'dir', 'e.g', 'e.v', 'env', 'etc', 'ex', 'fasc', 'fig', 'fr', 'fém', 'hab', 'i.e', 'ibid', 'id', 'inf', 'l.d', 'lib', 'll.aa', 'll.aa.ii', 'll.aa.rr', 'll.aa.ss', 'll.ee', 'll.mm', 'll.mm.ii.rr', 'loc.cit', 'ltd', 'ltd', 'masc', 'mm', 'ms', 'n.b', 'n.d', 'n.d.a', 'n.d.l.r', 'n.d.t', 'n.p.a.i', 'n.s', 'n/réf', 'nn.ss', 'p.c.c', 'p.ex', 'p.j', 'p.s', 'pl', 'pp', 'r.-v', 'r.a.s', 'r.i.p', 'r.p', 's.a', 's.a.i', 's.a.r', 's.a.s', 's.e', 's.m', 's.m.i.r', 's.s', 'sec', 'sect', 'sing', 'sq', 'sqq', 'ss', 'suiv', 'sup', 'suppl', 't.s.v.p', 'tél', 'vb', 'vol', 'vs', 'x.o', 'z.i', 'éd']).freeze
10 | PREPOSITIVE_ABBREVIATIONS = [].freeze
11 | NUMBER_ABBREVIATIONS = [].freeze
12 | end
13 |
14 | class AbbreviationReplacer < AbbreviationReplacer
15 | SENTENCE_STARTERS = [].freeze
16 | end
17 | end
18 | end
19 | end
20 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/russian.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Russian
6 | include Languages::Common
7 |
8 | module Abbreviation
9 | ABBREVIATIONS = Set.new(["y", "y.e", "а", "авт", "адм.-терр", "акад", "в", "вв", "вкз", "вост.-европ", "г", "гг", "гос", "гр", "д", "деп", "дисс", "дол", "долл", "ежедн", "ж", "жен", "з", "зап", "зап.-европ", "заруб", "и", "ин", "иностр", "инст", "к", "канд", "кв", "кг", "куб", "л", "л.h", "л.н", "м", "мин", "моск", "муж", "н", "нед", "о", "п", "пгт", "пер", "пп", "пр", "просп", "проф", "р", "руб", "с", "сек", "см", "спб", "стр", "т", "тел", "тов", "тт", "тыс", "у", "у.е", "ул", "ф", "ч"]).freeze
10 | PREPOSITIVE_ABBREVIATIONS = [].freeze
11 | NUMBER_ABBREVIATIONS = [].freeze
12 | end
13 |
14 | class AbbreviationReplacer < AbbreviationReplacer
15 | SENTENCE_STARTERS = [].freeze
16 |
17 | private
18 |
19 | def replace_period_of_abbr(txt, abbr)
20 | txt.gsub!(/(?<=\s#{abbr.strip})\./, '∯')
21 | txt.gsub!(/(?<=\A#{abbr.strip})\./, '∯')
22 | txt.gsub!(/(?<=^#{abbr.strip})\./, '∯')
23 | txt
24 | end
25 | end
26 | end
27 | end
28 | end
29 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/arabic.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Arabic
6 | include Languages::Common
7 |
8 | Punctuations = ['?', '!', ':', '.', '؟', '،'].freeze
9 | SENTENCE_BOUNDARY_REGEX = /.*?[:\.!\?؟،]|.*?\z|.*?$/
10 |
11 | module Abbreviation
12 | ABBREVIATIONS = Set.new(['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د']).freeze
13 | PREPOSITIVE_ABBREVIATIONS = [].freeze
14 | NUMBER_ABBREVIATIONS = [].freeze
15 | end
16 |
17 | # Rubular: http://rubular.com/r/RX5HpdDIyv
18 | ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
19 |
20 | # Rubular: http://rubular.com/r/kPRgApNHUg
21 | ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
22 |
23 | class AbbreviationReplacer < AbbreviationReplacer
24 | SENTENCE_STARTERS = [].freeze
25 | private
26 |
27 | def scan_for_replacements(txt, am, index, character_array)
28 | txt.gsub!(/(?<=#{am})\./, '∯')
29 | txt
30 | end
31 | end
32 | end
33 | end
34 | end
35 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/common/ellipsis.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | # frozen_string_literal: true
3 |
4 | module PragmaticSegmenter
5 | module Languages
6 | module Common
7 | # This class searches for ellipses within a string and
8 | # replaces the periods.
9 |
10 | # http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
11 | # http://www.thepunctuationguide.com/ellipses.html
12 |
13 | module EllipsisRules
14 | # Rubular: http://rubular.com/r/i60hCK81fz
15 | ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')
16 |
17 | # Rubular: http://rubular.com/r/Hdqpd90owl
18 | FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')
19 |
20 | # Rubular: http://rubular.com/r/YBG1dIHTRu
21 | ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')
22 |
23 | # Rubular: http://rubular.com/r/2VvZ8wRbd8
24 | FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')
25 |
26 | OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')
27 |
28 | All = [
29 | ThreeSpaceRule,
30 | FourSpaceRule,
31 | FourConsecutiveRule,
32 | ThreeConsecutiveRule,
33 | OtherThreePeriodRule
34 | ]
35 | end
36 | end
37 | end
38 | end
39 |
--------------------------------------------------------------------------------
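A small sketch of these rules in isolation (the wrapper method below is hypothetical; in the gem the Processor applies EllipsisRules::All during segmentation):

    require 'pragmatic_segmenter'

    # Hypothetical helper: apply every ellipsis rule to a copy of the text.
    def mark_ellipses(text)
      PragmaticSegmenter::Rule.apply(text.dup, PragmaticSegmenter::Languages::Common::EllipsisRules::All)
    end

    mark_ellipses("He paused... Then he spoke.")
    # => "He paused☏. Then he spoke."
    # ThreeConsecutiveRule fired because "..." is followed by whitespace and a
    # capital letter, leaving a period that can still close the sentence.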
/lib/pragmatic_segmenter/languages/chinese.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Chinese
6 | include Languages::Common
7 |
8 | class AbbreviationReplacer < AbbreviationReplacer
9 | SENTENCE_STARTERS = [].freeze
10 | end
11 |
12 | class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
13 | BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = /《(?>[^》\\]+|\\{2}|\\.)*》/
14 | BETWEEN_L_BRACKET_REGEX = /「(?>[^」\\]+|\\{2}|\\.)*」/
15 | private
16 |
17 | def sub_punctuation_between_quotes_and_parens(txt)
18 | super
19 | sub_punctuation_between_double_angled_quotation_marks(txt)
20 | sub_punctuation_between_l_bracket(txt)
21 | end
22 |
23 | def sub_punctuation_between_double_angled_quotation_marks(txt)
24 | PunctuationReplacer.new(
25 | matches_array: txt.scan(BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX),
26 | text: txt
27 | ).replace
28 | end
29 |
30 | def sub_punctuation_between_l_bracket(txt)
31 | PunctuationReplacer.new(
32 | matches_array: txt.scan(BETWEEN_L_BRACKET_REGEX),
33 | text: txt
34 | ).replace
35 | end
36 | end
37 | end
38 | end
39 | end
40 |
--------------------------------------------------------------------------------
/pragmatic_segmenter.gemspec:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | lib = File.expand_path('../lib', __FILE__)
3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4 | require 'pragmatic_segmenter/version'
5 |
6 | Gem::Specification.new do |spec|
7 | spec.name = "pragmatic_segmenter"
8 | spec.version = PragmaticSegmenter::VERSION
9 | spec.authors = ["Kevin S. Dias"]
10 | spec.email = ["diasks2@gmail.com"]
11 | spec.summary = %q{A rule-based sentence boundary detection gem that works out-of-the-box across many languages}
12 | spec.description = %q{Pragmatic Segmenter is a sentence segmentation tool for Ruby. It allows you to split a text into an array of sentences. This gem provides 2 main benefits over other segmentation gems - 1) It works well even with ill-formatted text 2) It works for multiple languages }
13 | spec.homepage = "https://github.com/diasks2/pragmatic_segmenter"
14 | spec.license = "MIT"
15 |
16 | spec.files = `git ls-files -z`.split("\x0")
17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19 | spec.require_paths = ["lib"]
20 |
21 | spec.add_development_dependency "bundler", ">= 1.7"
22 | spec.add_development_dependency "rake", ">= 12.3.3"
23 | spec.add_development_dependency "rspec"
24 | spec.add_development_dependency "stackprof"
25 | end
26 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/polish.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Polish
6 | include Languages::Common
7 |
8 | module Abbreviation
9 | ABBREVIATIONS = Set.new(['ags', 'alb', 'ang', 'aor', 'awest', 'bałt', 'bojkow', 'bret', 'brus', 'bsł', 'bułg', 'c.b.d.o', 'c.b.d.u', 'celt', 'chorw', 'cs', 'czakaw', 'czerw', 'czes', 'dłuż', 'dniem', 'dor', 'dubrow', 'duń', 'ekaw', 'fiń', 'franc', 'gal', 'germ', 'głuż', 'gniem', 'goc', 'gr', 'grudz', 'hebr', 'het', 'hol', 'I cont', 'ie', 'ikaw', 'irań', 'irl', 'islandz', 'itd', 'itd.', 'itp', 'jekaw', 'kajkaw', 'kasz', 'kirg', 'kwiec', 'łac', 'lip', 'listop', 'lit', 'łot', 'lp', 'maced', 'mar', 'młpol', 'moraw', 'n.e', 'nb.', 'ngr', 'niem', 'nord', 'norw', 'np', 'np.', 'ok.', 'orm', 'oset', 'osk', 'p.n', 'p.n.e', 'p.o', 'pazdz', 'pers', 'pie', 'pod red.', 'podhal', 'pol', 'połab', 'port', 'prekm', 'pskow', 'psł', 'R cont', 'rez', 'rom', 'rozdz.', 'rum', 'rus', 'rys.', 'sas', 'sch', 'scs', 'serb', 'sierp', 'śl', 'sła', 'słe', 'słi', 'słow', 'sp. z o.o', 'śrdniem', 'śrgniem', 'śrirl', 'stbułg', 'stind', 'stpol', 'stpr', 'str.', 'strus', 'stwniem', 'stycz', 'sztokaw', 'szwedz', 't.', 'tj.', 'tłum.', 'toch', 'tur', 'tzn', 'ukr', 'ul', 'umbr', 'wed', 'węg', 'wlkpol', 'włos', 'wrzes', 'wyd.', 'zakarp']).freeze
10 | PREPOSITIVE_ABBREVIATIONS = [].freeze
11 | NUMBER_ABBREVIATIONS = [].freeze
12 | end
13 |
14 | class AbbreviationReplacer < AbbreviationReplacer
15 | SENTENCE_STARTERS = [].freeze
16 | end
17 | end
18 | end
19 | end
20 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/japanese.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Japanese
6 | include Languages::Common
7 |
8 | class Cleaner < PragmaticSegmenter::Cleaner
9 | # Rubular: http://rubular.com/r/N4kPuJgle7
10 | NewLineInMiddleOfWordRule = Rule.new(/(?<=の)\n(?=\S)/, '')
11 |
12 | def clean
13 | super
14 | remove_newline_in_middle_of_word
15 | end
16 |
17 | private
18 |
19 | def remove_newline_in_middle_of_word
20 | Rule.apply @text, NewLineInMiddleOfWordRule
21 | end
22 | end
23 |
24 | class AbbreviationReplacer < AbbreviationReplacer
25 | SENTENCE_STARTERS = [].freeze
26 | end
27 |
28 | class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
29 | # Rubular: http://rubular.com/r/GnjOmry5Z2
30 | BETWEEN_QUOTE_JA_REGEX = /\u{300c}(?>[^\u{300c}\u{300d}\\]+|\\{2}|\\.)*\u{300d}/
31 |
32 | # Rubular: http://rubular.com/r/EjHcZn5ZSG
33 | BETWEEN_PARENS_JA_REGEX = /\u{ff08}(?>[^\u{ff08}\u{ff09}\\]+|\\{2}|\\.)*\u{ff09}/
34 | private
35 |
36 | def sub_punctuation_between_quotes_and_parens(txt)
37 | super
38 | sub_punctuation_between_parens_ja(txt)
39 | sub_punctuation_between_quotes_ja(txt)
40 | end
41 |
42 | def sub_punctuation_between_quotes_ja(txt)
43 | PunctuationReplacer.new(
44 | matches_array: txt.scan(BETWEEN_QUOTE_JA_REGEX),
45 | text: txt
46 | ).replace
47 | end
48 |
49 | def sub_punctuation_between_parens_ja(txt)
50 | PunctuationReplacer.new(
51 | matches_array: txt.scan(BETWEEN_PARENS_JA_REGEX),
52 | text: txt
53 | ).replace
54 | end
55 | end
56 | end
57 | end
58 | end
59 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Code of Conduct
2 |
3 | As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.
4 |
5 | We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality.
6 |
7 | Examples of unacceptable behavior by participants include:
8 |
9 | * The use of sexualized language or imagery
10 | * Personal attacks
11 | * Trolling or insulting/derogatory comments
12 | * Public or private harassment
13 | * Publishing other's private information, such as physical or electronic addresses, without explicit permission
14 | * Other unethical or unprofessional conduct
15 |
16 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team.
17 |
18 | This code of conduct applies both within project spaces and in public spaces when an individual is representing the project or its community.
19 |
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers.
21 |
22 | This Code of Conduct is adapted from the Contributor Covenant, version 1.2.0, available from http://contributor-covenant.org/version/1/2/0/
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'pragmatic_segmenter/types'
4 | require 'pragmatic_segmenter/processor'
5 | require 'pragmatic_segmenter/cleaner'
6 |
7 | require 'pragmatic_segmenter/languages/common'
8 |
9 | require 'pragmatic_segmenter/languages/english'
10 | require 'pragmatic_segmenter/languages/deutsch'
11 | require 'pragmatic_segmenter/languages/hindi'
12 | require 'pragmatic_segmenter/languages/persian'
13 | require 'pragmatic_segmenter/languages/amharic'
14 | require 'pragmatic_segmenter/languages/arabic'
15 | require 'pragmatic_segmenter/languages/greek'
16 | require 'pragmatic_segmenter/languages/armenian'
17 | require 'pragmatic_segmenter/languages/burmese'
18 | require 'pragmatic_segmenter/languages/urdu'
19 | require 'pragmatic_segmenter/languages/french'
20 | require 'pragmatic_segmenter/languages/italian'
21 | require 'pragmatic_segmenter/languages/spanish'
22 | require 'pragmatic_segmenter/languages/russian'
23 | require 'pragmatic_segmenter/languages/japanese'
24 | require 'pragmatic_segmenter/languages/dutch'
25 | require 'pragmatic_segmenter/languages/polish'
26 | require 'pragmatic_segmenter/languages/chinese'
27 | require 'pragmatic_segmenter/languages/bulgarian'
28 | require 'pragmatic_segmenter/languages/danish'
29 | require 'pragmatic_segmenter/languages/kazakh'
30 |
31 | module PragmaticSegmenter
32 | module Languages
33 | LANGUAGE_CODES = {
34 | 'en' => English,
35 | 'bg' => Bulgarian,
36 | 'de' => Deutsch,
37 | 'es' => Spanish,
38 | 'fr' => French,
39 | 'it' => Italian,
40 | 'ja' => Japanese,
41 | 'el' => Greek,
42 | 'ru' => Russian,
43 | 'ar' => Arabic,
44 | 'am' => Amharic,
45 | 'hi' => Hindi,
46 | 'hy' => Armenian,
47 | 'fa' => Persian,
48 | 'my' => Burmese,
49 | 'ur' => Urdu,
50 | 'nl' => Dutch,
51 | 'pl' => Polish,
52 | 'zh' => Chinese,
53 | 'da' => Danish,
54 | 'kk' => Kazakh
55 | }
56 |
57 | def self.get_language_by_code(code)
58 | LANGUAGE_CODES[code] || Common
59 | end
60 | end
61 | end
62 |
--------------------------------------------------------------------------------
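Lookup behavior matches languages_spec.rb above: a known ISO 639-1 code returns its language module, and anything else (unknown, empty, or nil) falls back to Common.

    require 'pragmatic_segmenter'

    PragmaticSegmenter::Languages.get_language_by_code('ja')
    # => PragmaticSegmenter::Languages::Japanese

    PragmaticSegmenter::Languages.get_language_by_code('xxyyzz')
    # => PragmaticSegmenter::Languages::Common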
/lib/pragmatic_segmenter/punctuation_replacer.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | # frozen_string_literal: true
3 |
4 | module PragmaticSegmenter
5 | # This class replaces punctuation that is typically a sentence boundary
6 | # but in this case is not a sentence boundary.
7 | class PunctuationReplacer
8 | module Rules
9 | module EscapeRegexReservedCharacters
10 | LeftParen = Rule.new('(', '\\(')
11 | RightParen = Rule.new(')', '\\)')
12 | LeftBracket = Rule.new('[', '\\[')
13 | RightBracket = Rule.new(']', '\\]')
14 | Dash = Rule.new('-', '\\-')
15 |
16 | All = [ LeftParen, RightParen,
17 | LeftBracket, RightBracket, Dash ]
18 | end
19 |
20 | module SubEscapedRegexReservedCharacters
21 | SubLeftParen = Rule.new('\\(', '(')
22 | SubRightParen = Rule.new('\\)', ')')
23 | SubLeftBracket = Rule.new('\\[', '[')
24 | SubRightBracket = Rule.new('\\]', ']')
25 | SubDash = Rule.new('\\-', '-')
26 |
27 | All = [ SubLeftParen, SubRightParen,
28 | SubLeftBracket, SubRightBracket, SubDash ]
29 | end
30 |
31 | end
32 |
33 | attr_reader :matches_array, :text, :match_type
34 | def initialize(text:, matches_array:, match_type: nil)
35 | @text = text
36 | @matches_array = matches_array
37 | @match_type = match_type
38 | end
39 |
40 | def replace
41 | replace_punctuation(matches_array)
42 | end
43 |
44 | private
45 |
46 | def replace_punctuation(array)
47 | return if !array || array.empty?
48 | Rule.apply(@text, Rules::EscapeRegexReservedCharacters::All)
49 | array.each do |a|
50 | Rule.apply(a, Rules::EscapeRegexReservedCharacters::All)
51 | sub = sub_characters(a, '.', '∯')
52 | sub_1 = sub_characters(sub, '。', '&ᓰ&')
53 | sub_2 = sub_characters(sub_1, '.', '&ᓱ&')
54 | sub_3 = sub_characters(sub_2, '!', '&ᓳ&')
55 | sub_4 = sub_characters(sub_3, '!', '&ᓴ&')
56 | sub_5 = sub_characters(sub_4, '?', '&ᓷ&')
57 | sub_6 = sub_characters(sub_5, '?', '&ᓸ&')
58 | unless match_type.eql?('single')
59 | sub_7 = sub_characters(sub_6, "'", '&⎋&')
60 | end
61 | end
62 | Rule.apply(@text, Rules::SubEscapedRegexReservedCharacters::All)
63 | end
64 |
65 | def sub_characters(string, char_a, char_b)
66 | sub = string.gsub(char_a, char_b)
67 | @text.gsub!(/#{Regexp.escape(string)}/, sub)
68 | sub
69 | end
70 | end
71 | end
72 |
--------------------------------------------------------------------------------
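An isolated sketch of PunctuationReplacer (the quoted-span regex here is illustrative; in the gem the matches come from rules such as the BetweenPunctuation classes): punctuation inside each matched span is swapped for placeholder characters so it is not treated as a sentence boundary.

    require 'pragmatic_segmenter'
    require 'pragmatic_segmenter/punctuation_replacer'

    text = 'She said "Stop! Please." and left.'.dup
    spans = text.scan(/"[^"]*"/)   # illustrative way to collect quoted spans

    PragmaticSegmenter::PunctuationReplacer.new(matches_array: spans, text: text).replace
    # text is mutated in place; the '!' and '.' inside the quotes become
    # placeholders (e.g. '&ᓴ&' and '∯') while the final period is untouched.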
/spec/pragmatic_segmenter/languages/bulgarian_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Bulgarian, '(bg)' do
4 |
5 | describe '#segment' do
6 |
7 | it 'correctly segments text #001' do
8 | ps = PragmaticSegmenter::Segmenter.new(text: "В първата половина на ноември т.г. ще бъде свикан Консултативният съвет за национална сигурност, обяви държавният глава.", language: 'bg')
9 | expect(ps.segment).to eq(["В първата половина на ноември т.г. ще бъде свикан Консултативният съвет за национална сигурност, обяви държавният глава."])
10 | end
11 |
12 | it 'correctly segments text #002' do
13 | ps = PragmaticSegmenter::Segmenter.new(text: "Компютърът е устройство с общо предназначение, което може да бъде програмирано да извършва набор от аритметични и/или логически операции. Възможността поредицата такива операции да бъде променяна позволява компютърът да се използва за решаването на теоретично всяка изчислителна/логическа задача. Обикновено целта на тези операции е обработката на въведена информация (данни), представена в цифров (дигитален) вид, резултатът от които може да се изведе в най-общо казано използваема форма.", language: 'bg')
14 | expect(ps.segment).to eq(["Компютърът е устройство с общо предназначение, което може да бъде програмирано да извършва набор от аритметични и/или логически операции.", "Възможността поредицата такива операции да бъде променяна позволява компютърът да се използва за решаването на теоретично всяка изчислителна/логическа задача.", "Обикновено целта на тези операции е обработката на въведена информация (данни), представена в цифров (дигитален) вид, резултатът от които може да се изведе в най-общо казано използваема форма."])
15 | end
16 |
17 | it 'correctly segments text #003' do
18 | ps = PragmaticSegmenter::Segmenter.new(text: "Пл. \"20 Април\"", language: 'bg')
19 | expect(ps.segment).to eq(["Пл. \"20 Април\""])
20 | end
21 |
22 | it 'correctly segments text #004' do
23 | ps = PragmaticSegmenter::Segmenter.new(text: "Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат. Стойностни, вкл. български и руски", language: 'bg')
24 | expect(ps.segment).to eq(["Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.", "Стойностни, вкл. български и руски"])
25 | end
26 | end
27 | end
28 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Segmenter do
4 |
5 | describe '#segment' do
6 | it 'handles nil' do
7 | ps = PragmaticSegmenter::Segmenter.new(text: nil)
8 | expect(ps.segment).to eq([])
9 | end
10 |
11 | it 'handles no language' do
12 | ps = PragmaticSegmenter::Segmenter.new(text: 'Hello world. Hello.')
13 | expect(ps.segment).to eq(["Hello world.", "Hello."])
14 | end
15 |
16 | it 'handles empty strings' do
17 | ps = PragmaticSegmenter::Segmenter.new(text: "\n")
18 | expect(ps.segment).to eq([])
19 | end
20 |
21 | it 'handles empty strings' do
22 | ps = PragmaticSegmenter::Segmenter.new(text: "")
23 | expect(ps.segment).to eq([])
24 | end
25 |
26 | it 'handles empty strings' do
27 | ps = PragmaticSegmenter::Segmenter.new(text: '')
28 | expect(ps.segment).to eq([])
29 | end
30 |
31 | it 'has an option to not use the cleaner' do
32 | ps = PragmaticSegmenter::Segmenter.new(text: "It was a cold \nnight in the city.", language: "en", clean: false)
33 | expect(ps.segment).to eq(["It was a cold", "night in the city."])
34 | end
35 |
36 | it 'does not mutate the input string' do
37 | text = "It was a cold \nnight in the city."
38 | PragmaticSegmenter::Segmenter.new(text: text, language: "en").segment
39 | expect(text).to eq("It was a cold \nnight in the city.")
40 | end
41 |
42 | describe '#clean' do
43 | it 'cleans the text #001' do
44 | ps = PragmaticSegmenter::Cleaner.new(text: "It was a cold \nnight in the city.", language: "en")
45 | expect(ps.clean).to eq("It was a cold night in the city.")
46 | end
47 |
48 | it 'cleans the text #002' do
49 | text = 'injections made by the Shareholder through the years. 7 (max.) 3. Specifications/4.Design and function The operating instructions are part of the product and must be kept in the immediate vicinity of the instrument and readily accessible to skilled "'
50 | ps = PragmaticSegmenter::Cleaner.new(text: text)
51 | expect(ps.clean).to eq("injections made by the Shareholder through the years. 7 (max.) 3. Specifications/4.Design and function The operating instructions are part of the product and must be kept in the immediate vicinity of the instrument and readily accessible to skilled \"")
52 | end
53 |
54 | it 'does not mutate the input string (cleaner)' do
55 | text = "It was a cold \nnight in the city."
56 | PragmaticSegmenter::Cleaner.new(text: text, language: "en").clean
57 | expect(text).to eq("It was a cold \nnight in the city.")
58 | end
59 | end
60 | end
61 | end
62 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/french_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::French, '(fr)' do
4 |
5 | describe '#segment' do
6 | it 'correctly segments text #001' do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "Après avoir été l'un des acteurs du projet génome humain, le Genoscope met aujourd'hui le cap vers la génomique environnementale. L'exploitation des données de séquences, prolongée par l'identification expérimentale des fonctions biologiques, notamment dans le domaine de la biocatalyse, ouvrent des perspectives de développements en biotechnologie industrielle.", language: 'fr')
8 | expect(ps.segment).to eq(["Après avoir été l'un des acteurs du projet génome humain, le Genoscope met aujourd'hui le cap vers la génomique environnementale.", "L'exploitation des données de séquences, prolongée par l'identification expérimentale des fonctions biologiques, notamment dans le domaine de la biocatalyse, ouvrent des perspectives de développements en biotechnologie industrielle."])
9 | end
10 |
11 | it 'correctly segments text #002' do
12 | ps = PragmaticSegmenter::Segmenter.new(text: "\"Airbus livrera comme prévu 30 appareils 380 cette année avec en ligne de mire l'objectif d'équilibre financier du programme en 2015\", a-t-il ajouté.", language: 'fr')
13 | expect(ps.segment).to eq(["\"Airbus livrera comme prévu 30 appareils 380 cette année avec en ligne de mire l'objectif d'équilibre financier du programme en 2015\", a-t-il ajouté."])
14 | end
15 |
16 | it 'correctly segments text #003' do
17 | ps = PragmaticSegmenter::Segmenter.new(text: "À 11 heures ce matin, la direction ne décomptait que douze grévistes en tout sur la France : ce sont ceux du site de Saran (Loiret), dont l’effectif est de 809 salariés, dont la moitié d’intérimaires. Elle assure que ce mouvement « n’aura aucun impact sur les livraisons ».", language: 'fr')
18 | expect(ps.segment).to eq(["À 11 heures ce matin, la direction ne décomptait que douze grévistes en tout sur la France : ce sont ceux du site de Saran (Loiret), dont l’effectif est de 809 salariés, dont la moitié d’intérimaires.", "Elle assure que ce mouvement « n’aura aucun impact sur les livraisons »."])
19 | end
20 |
21 | it 'correctly segments text #004' do
22 | ps = PragmaticSegmenter::Segmenter.new(text: "Ce modèle permet d’afficher le texte « LL.AA.II.RR. » pour l’abréviation de « Leurs Altesses impériales et royales » avec son infobulle.", language: 'fr')
23 | expect(ps.segment).to eq(["Ce modèle permet d’afficher le texte « LL.AA.II.RR. » pour l’abréviation de « Leurs Altesses impériales et royales » avec son infobulle."])
24 | end
25 |
26 | it 'correctly segments text #005' do
27 | ps = PragmaticSegmenter::Segmenter.new(text: "Les derniers ouvrages de Intercept Ltd. sont ici.", language: 'fr')
28 | expect(ps.segment).to eq(["Les derniers ouvrages de Intercept Ltd. sont ici."])
29 | end
30 | end
31 | end
32 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/spanish.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Spanish
6 | include Languages::Common
7 |
8 | module Abbreviation
9 | ABBREVIATIONS = Set.new(['a.c', 'a/c', 'abr', 'adj', 'admón', 'afmo', 'ago', 'almte', 'ap', 'apdo', 'arq', 'art', 'atte', 'av', 'avda', 'bco', 'bibl', 'bs. as', 'c', 'c.f', 'c.g', 'c/c', 'c/u', 'cap', 'cc.aa', 'cdad', 'cm', 'co', 'cra', 'cta', 'cv', 'd.e.p', 'da', 'dcha', 'dcho', 'dep', 'dic', 'dicc', 'dir', 'dn', 'doc', 'dom', 'dpto', 'dr', 'dra', 'dto', 'ee', 'ej', 'en', 'entlo', 'esq', 'etc', 'excmo', 'ext', 'f.c', 'fca', 'fdo', 'febr', 'ff. aa', 'ff.cc', 'fig', 'fil', 'fra', 'g.p', 'g/p', 'gob', 'gr', 'gral', 'grs', 'hnos', 'hs', 'igl', 'iltre', 'imp', 'impr', 'impto', 'incl', 'ing', 'inst', 'izdo', 'izq', 'izqdo', 'j.c', 'jue', 'jul', 'jun', 'kg', 'km', 'lcdo', 'ldo', 'let', 'lic', 'ltd', 'lun', 'mar', 'may', 'mg', 'min', 'mié', 'mm', 'máx', 'mín', 'mt', 'n. del t', 'n.b', 'no', 'nov', 'ntra. sra', 'núm', 'oct', 'p', 'p.a', 'p.d', 'p.ej', 'p.v.p', 'párrf', 'ppal', 'prev', 'prof', 'prov', 'ptas', 'pts', 'pza', 'pág', 'págs', 'párr', 'q.e.g.e', 'q.e.p.d', 'q.e.s.m', 'reg', 'rep', 'rr. hh', 'rte', 's', 's. a', 's.a.r', 's.e', 's.l', 's.r.c', 's.r.l', 's.s.s', 's/n', 'sdad', 'seg', 'sept', 'sig', 'sr', 'sra', 'sres', 'srta', 'sta', 'sto', 'sáb', 't.v.e', 'tamb', 'tel', 'tfno', 'ud', 'uu', 'uds', 'univ', 'v.b', 'v.e', 'vd', 'vds', 'vid', 'vie', 'vol', 'vs', 'vto', 'a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']).freeze
10 | PREPOSITIVE_ABBREVIATIONS = Set.new(['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']).freeze
11 | NUMBER_ABBREVIATIONS = Set.new(['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']).freeze
12 | end
13 |
14 | class AbbreviationReplacer < AbbreviationReplacer
15 | SENTENCE_STARTERS = [].freeze
16 | end
17 | end
18 | end
19 | end
20 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/cleaner/rules.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | # This is an opinionated class that removes errant newlines,
5 | # xhtml, inline formatting, etc.
6 | class Cleaner
7 | module Rules
8 | # Rubular: http://rubular.com/r/V57WnM9Zut
9 | NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
10 |
11 | # Rubular: http://rubular.com/r/dMxp5MixFS
12 | DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")
13 |
14 | # Rubular: http://rubular.com/r/H6HOJeA8bq
15 | DoubleNewLineRule = Rule.new(/\n\n/, "\r")
16 |
17 | # Rubular: http://rubular.com/r/FseyMiiYFT
18 | NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')
19 |
20 |
21 | ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
22 |
23 | EscapedNewLineRule = Rule.new(/\\n/, "\n")
24 | EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
25 |
26 | TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
27 |
28 | TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")
29 |
30 |
31 |
32 |
33 | # Rubular: http://rubular.com/r/bAJrhyLNeZ
 34 | InlineFormattingRule = Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')
 69 | module HTML
 70 | HTMLTagRule = Rule.new(/<\/?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)\/?>/, '')
 71 |
72 | # Rubular: http://rubular.com/r/XZVqMPJhea
 73 | EscapedHTMLTagRule = Rule.new(/&lt;\/?[^gt;]*gt;/, '')
74 |
75 | All = [HTMLTagRule, EscapedHTMLTagRule]
76 | end
77 |
78 | module PDF
79 | # Rubular: http://rubular.com/r/UZAVcwqck8
80 | NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
81 |
82 | # Rubular: http://rubular.com/r/eaNwGavmdo
83 | NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')
84 | end
85 |
86 | end
87 | end
88 | end
89 |
--------------------------------------------------------------------------------
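The rules above are plain pattern/replacement pairs that the Cleaner (further below) feeds to Rule.apply. A minimal sketch of one rule in isolation, assuming Rule.apply is the in-place substitution helper that types.rb (not shown in this listing) defines under PragmaticSegmenter, and that the constants are reachable as PragmaticSegmenter::Cleaner::Rules:

    require 'pragmatic_segmenter'

    # DoubleNewLineRule rewrites a blank-line paragraph break as a carriage
    # return, which later stages treat as a hard segment boundary.
    rules = PragmaticSegmenter::Cleaner::Rules
    text  = "First paragraph.\n\nSecond paragraph.".dup
    PragmaticSegmenter::Rule.apply(text, rules::DoubleNewLineRule)
    # text should now be "First paragraph.\rSecond paragraph."
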
/lib/pragmatic_segmenter/languages/kazakh.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Kazakh
6 | include Languages::Common
7 |
8 | MULTI_PERIOD_ABBREVIATION_REGEX = /\b\p{Cyrillic}(?:\.\s?\p{Cyrillic})+[.]|\b[a-z](?:\.[a-z])+[.]/i
9 |
10 | module Abbreviation
11 | ABBREVIATIONS = Set.new(['afp', 'anp', 'atp', 'bae', 'bg', 'bp', 'cam', 'cctv', 'cd', 'cez', 'cgi', 'cnpc', 'farc', 'fbi', 'eiti', 'epo', 'er', 'gp', 'gps', 'has', 'hiv', 'hrh', 'http', 'icu', 'idf', 'imd', 'ime', 'icu', 'idf', 'ip', 'iso', 'kaz', 'kpo', 'kpa', 'kz', 'kz', 'mri', 'nasa', 'nba', 'nbc', 'nds', 'ohl', 'omlt', 'ppm', 'pda', 'pkk', 'psm', 'psp', 'raf', 'rss', 'rtl', 'sas', 'sme', 'sms', 'tnt', 'udf', 'uefa', 'usb', 'utc', 'x', 'zdf', 'әқбк', 'әқбк', 'аақ', 'авг.', 'aбб', 'аек', 'ак', 'ақ', 'акцион.', 'акср', 'ақш', 'англ', 'аөсшк', 'апр', 'м.', 'а.', 'р.', 'ғ.', 'апр.', 'аум.', 'ацат', 'әч', 'т. б.', 'б. з. б.', 'б. з. б.', 'б. з. д.', 'б. з. д.', 'биікт.', 'б. т.', 'биол.', 'биохим', 'бө', 'б. э. д.', 'бта', 'бұұ', 'вич', 'всоонл', 'геогр.', 'геол.', 'гленкор', 'гэс', 'қк', 'км', 'г', 'млн', 'млрд', 'т', 'ғ. с.', 'ғ.', 'қ.', 'ғ.', 'дек.', 'днқ', 'дсұ', 'еақк', 'еқыұ', 'ембімұнайгаз', 'ео', 'еуразэқ', 'еуроодақ', 'еұу', 'ж.', 'ж.', 'жж.', 'жоо', 'жіө', 'жсдп', 'жшс', 'іім', 'инта', 'исаф', 'камаз', 'кгб', 'кеу', 'кг', 'км²', 'км²', 'км³', 'км³', 'кимеп', 'кср', 'ксро', 'кокп', 'кхдр', 'қазатомпром', 'қазкср', 'қазұу', 'қазмұнайгаз', 'қазпошта', 'қазтаг', 'қазұу', 'қкп', 'қмдб', 'қр', 'қхр', 'лат.', 'м²', 'м²', 'м³', 'м³', 'магатэ', 'май.', 'максам', 'мб', 'мвт', 'мемл', 'м', 'мсоп', 'мтк', 'мыс.', 'наса', 'нато', 'нквд', 'нояб.', 'обл.', 'огпу', 'окт.', 'оңт.', 'опек', 'оеб', 'өзенмұнайгаз', 'өф', 'пәк', 'пед.', 'ркфср', 'рнқ', 'рсфср', 'рф', 'свс', 'сву', 'сду', 'сес', 'сент.', 'см', 'снпс', 'солт.', 'солт.', 'сооно', 'ссро', 'сср', 'ссср', 'ссс', 'сэс', 'дк', 'т. б.', 'т', 'тв', 'тереңд.', 'тех.', 'тжқ', 'тмд', 'төм.', 'трлн', 'тр', 'т.', 'и.', 'м.', 'с.', 'ш.', 'т.', 'т. с. с.', 'тэц', 'уаз', 'уефа', 'еқыұ', 'ұқк', 'ұқшұ', 'февр.', 'фққ', 'фсб', 'хим.', 'хқко', 'шұар', 'шыұ', 'экон.', 'экспо', 'цтп', 'цас', 'янв.', 'dvd', 'жкт', 'ққс', 'км', 'ацат', 'юнеско', 'ббс', 'mgm', 'жск', 'зоо', 'бсн', 'өұқ', 'оар', 'боак', 'эөкк', 'хтқо', 'әөк', 'жэк', 'хдо', 'спбму', 'аф', 'сбд', 'амт', 'гсдп', 'гсбп', 'эыдұ', 'нұсжп', 'шыұ', 'жтсх', 'хдп', 'эқк', 'фкққ', 'пиқ', 'өгк', 'мбф', 'маж', 'кота', 'тж', 'ук', 'обб', 'сбл', 'жхл', 'кмс', 'бмтрк', 'жққ', 'бхооо', 'мқо', 'ржмб', 'гулаг', 'жко', 'еэы', 'еаэы', 'кхдр', 'рфкп', 'рлдп', 'хвқ', 'мр', 'мт', 'кту', 'ртж', 'тим', 'мемдум', 'ксро', 'т.с.с', 'с.ш.', 'ш.б.', 'б.б.', 'руб', 'мин', 'акад.', 'ғ.', 'мм', 'мм.']).freeze
12 | PREPOSITIVE_ABBREVIATIONS = [].freeze
13 | NUMBER_ABBREVIATIONS = [].freeze
14 | end
15 |
16 | class Processor < PragmaticSegmenter::Processor
17 | private
18 |
19 | # Rubular: http://rubular.com/r/WRWy56Z5zp
20 | QuestionMarkFollowedByDashLowercaseRule = Rule.new(/(?<=\p{Ll})\?(?=\s*[-—]\s*\p{Ll})/, '&ᓷ&')
21 | # Rubular: http://rubular.com/r/lixxP7puSa
22 | ExclamationMarkFollowedByDashLowercaseRule = Rule.new(/(?<=\p{Ll})!(?=\s*[-—]\s*\p{Ll})/, '&ᓴ&')
23 |
24 | def between_punctuation(txt)
25 | super(txt)
26 | Rule.apply(txt, QuestionMarkFollowedByDashLowercaseRule, ExclamationMarkFollowedByDashLowercaseRule)
27 | end
28 | end
29 |
30 | class AbbreviationReplacer < AbbreviationReplacer
31 | SENTENCE_STARTERS = [].freeze
32 |
33 | SingleUpperCaseCyrillicLetterAtStartOfLineRule = Rule.new(/(?<=^[А-ЯЁ])\.(?=\s)/, '∯')
34 | SingleUpperCaseCyrillicLetterRule = Rule.new(/(?<=\s[А-ЯЁ])\.(?=\s)/, '∯')
35 |
36 | def replace
37 | super
38 | Rule.apply(@text, SingleUpperCaseCyrillicLetterAtStartOfLineRule, SingleUpperCaseCyrillicLetterRule)
39 | end
40 | end
41 | end
42 | end
43 | end
44 |
45 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/common/numbers.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | # frozen_string_literal: true
3 |
4 | module PragmaticSegmenter
5 | module Languages
6 | module Common
7 | module Numbers
8 | # Rubular: http://rubular.com/r/oNyxBOqbyy
9 | PeriodBeforeNumberRule = Rule.new(/\.(?=\d)/, '∯')
10 |
11 | # Rubular: http://rubular.com/r/EMk5MpiUzt
12 | NumberAfterPeriodBeforeLetterRule = Rule.new(/(?<=\d)\.(?=\S)/, '∯')
13 |
14 | # Rubular: http://rubular.com/r/rf4l1HjtjG
15 | NewLineNumberPeriodSpaceLetterRule = Rule.new(/(?<=\r\d)\.(?=(\s\S)|\))/, '∯')
16 |
17 | # Rubular: http://rubular.com/r/HPa4sdc6b9
18 | StartLineNumberPeriodRule = Rule.new(/(?<=^\d)\.(?=(\s\S)|\))/, '∯')
19 |
20 | # Rubular: http://rubular.com/r/NuvWnKleFl
21 | StartLineTwoDigitNumberPeriodRule = Rule.new(/(?<=^\d\d)\.(?=(\s\S)|\))/, '∯')
22 |
23 | All = [
24 | PeriodBeforeNumberRule,
25 | NumberAfterPeriodBeforeLetterRule,
26 | NewLineNumberPeriodSpaceLetterRule,
27 | StartLineNumberPeriodRule,
28 | StartLineTwoDigitNumberPeriodRule
29 | ]
30 | end
31 |
32 |
33 | SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
34 |
35 | # Rubular: http://rubular.com/r/NqCqv372Ix
36 | QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
37 |
38 | # Rubular: http://rubular.com/r/6flGnUMEVl
39 | PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
40 |
41 | # Rubular: http://rubular.com/r/TYzr4qOW1Q
42 | BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
43 |
44 | # Rubular: http://rubular.com/r/JMjlZHAT4g
45 | SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
46 |
47 | # Rubular: http://rubular.com/r/mQ8Es9bxtk
48 | CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
49 |
50 | NUMBERED_REFERENCE_REGEX = /(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)?\b\d{1,3}\])+|((\d{1,3}\s?){0,3}\d{1,3}))(\s)(?=[A-Z])/
51 |
52 | # Rubular: http://rubular.com/r/yqa4Rit8EY
53 | PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
54 |
55 | # Rubular: http://rubular.com/r/NEv265G2X2
56 | KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
57 |
58 | # Rubular: http://rubular.com/r/xDkpFZ0EgH
59 | MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
60 |
61 | module AmPmRules
62 | # Rubular: http://rubular.com/r/Vnx3m4Spc8
63 | UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
64 |
65 | # Rubular: http://rubular.com/r/AJMCotJVbW
66 | UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
67 |
68 | # Rubular: http://rubular.com/r/13q7SnOhgA
69 | LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
70 |
71 | # Rubular: http://rubular.com/r/DgUDq4mLz5
72 | LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
73 |
74 | All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
75 | end
76 |
77 | # This module searches for periods following single letter abbreviations
78 | # (e.g. initials) and replaces the periods.
79 | module SingleLetterAbbreviationRules
80 | # Rubular: http://rubular.com/r/e3H6kwnr6H
81 | SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=,?\s)/, '∯')
82 |
83 | # Rubular: http://rubular.com/r/gitvf0YWH4
84 | SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=,?\s)/, '∯')
85 |
86 | All = [
87 | SingleUpperCaseLetterAtStartOfLineRule,
88 | SingleUpperCaseLetterRule
89 | ]
90 | end
91 | end
92 | end
93 | end
94 |
--------------------------------------------------------------------------------
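The single-letter rules above, together with the number handling elsewhere in the pipeline, are what keep initials and page references from being read as sentence boundaries. A small end-to-end check through the public Segmenter API; the expected output is hedged and follows the gem's English golden rules rather than a spec shown in this listing:

    require 'pragmatic_segmenter'

    # "E." is protected by SingleUpperCaseLetterRule; "p. 55" is handled by
    # the number-abbreviation logic in AbbreviationReplacer further below.
    ps = PragmaticSegmenter::Segmenter.new(
      text: "My name is Jonas E. Smith. Please turn to p. 55.",
      language: 'en'
    )
    ps.segment
    # expected: ["My name is Jonas E. Smith.", "Please turn to p. 55."]
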
/lib/pragmatic_segmenter/between_punctuation.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | # frozen_string_literal: true
3 |
4 | module PragmaticSegmenter
5 | # This class searches for punctuation between quotes or parentheses
6 | # and replaces it
7 | class BetweenPunctuation
8 | # Rubular: http://rubular.com/r/2YFrKWQUYi
9 | BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/
10 |
11 | BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = /(?<=\s)‘(?:[^’]|’[a-zA-Z])*’/
12 |
13 | # Rubular: http://rubular.com/r/3Pw1QlXOjd
14 | BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/
15 |
16 | # Rubular: http://rubular.com/r/x6s4PZK8jc
17 | BETWEEN_QUOTE_ARROW_REGEX = /«(?>[^»\\]+|\\{2}|\\.)*»/
18 |
19 | # Rubular: http://rubular.com/r/JbAIpKdlSq
20 | BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/
21 |
22 | # Rubular: http://rubular.com/r/WX4AvnZvlX
23 | BETWEEN_SQUARE_BRACKETS_REGEX = /\[(?>[^\]\\]+|\\{2}|\\.)*\]/
24 |
25 | # Rubular: http://rubular.com/r/6tTityPflI
26 | BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/
27 |
28 | # Rubular: http://rubular.com/r/mXf8cW025o
29 | WORD_WITH_LEADING_APOSTROPHE = /(?<=\s)'(?:[^']|'[a-zA-Z])*'\S/
30 |
31 | # Rubular: http://rubular.com/r/jTtDKfjxzr
32 | BETWEEN_EM_DASHES_REGEX = /\-\-(?>[^\-\-])*\-\-/
33 |
34 | attr_reader :text
35 | def initialize(text:)
36 | @text = text
37 | end
38 |
39 | def replace
40 | sub_punctuation_between_quotes_and_parens(text)
41 | end
42 |
43 | private
44 |
45 | def sub_punctuation_between_quotes_and_parens(txt)
46 | sub_punctuation_between_single_quotes(txt)
47 | sub_punctuation_between_single_quote_slanted(txt)
48 | sub_punctuation_between_double_quotes(txt)
49 | sub_punctuation_between_square_brackets(txt)
50 | sub_punctuation_between_parens(txt)
51 | sub_punctuation_between_quotes_arrow(txt)
52 | sub_punctuation_between_em_dashes(txt)
53 | sub_punctuation_between_quotes_slanted(txt)
54 | end
55 |
56 | def sub_punctuation_between_parens(txt)
57 | PragmaticSegmenter::PunctuationReplacer.new(
58 | matches_array: txt.scan(BETWEEN_PARENS_REGEX),
59 | text: txt
60 | ).replace
61 | end
62 |
63 | def sub_punctuation_between_square_brackets(txt)
64 | PragmaticSegmenter::PunctuationReplacer.new(
65 | matches_array: txt.scan(BETWEEN_SQUARE_BRACKETS_REGEX),
66 | text: txt
67 | ).replace
68 | end
69 |
70 | def sub_punctuation_between_single_quotes(txt)
71 | unless !(txt !~ WORD_WITH_LEADING_APOSTROPHE) && txt !~ /'\s/
72 | PragmaticSegmenter::PunctuationReplacer.new(
73 | matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),
74 | text: txt,
75 | match_type: 'single'
76 | ).replace
77 | end
78 | end
79 |
80 | def sub_punctuation_between_single_quote_slanted(txt)
81 | PragmaticSegmenter::PunctuationReplacer.new(
82 | matches_array: txt.scan(BETWEEN_SINGLE_QUOTE_SLANTED_REGEX),
83 | text: txt
84 | ).replace
85 | end
86 |
87 | def sub_punctuation_between_double_quotes(txt)
88 | PragmaticSegmenter::PunctuationReplacer.new(
89 | matches_array: btwn_dbl_quote(txt),
90 | text: txt
91 | ).replace
92 | end
93 |
94 | def btwn_dbl_quote(txt)
95 | txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
96 | end
97 |
98 | def sub_punctuation_between_quotes_arrow(txt)
99 | PragmaticSegmenter::PunctuationReplacer.new(
100 | matches_array: txt.scan(BETWEEN_QUOTE_ARROW_REGEX),
101 | text: txt
102 | ).replace
103 | end
104 |
105 | def sub_punctuation_between_em_dashes(txt)
106 | PragmaticSegmenter::PunctuationReplacer.new(
107 | matches_array: txt.scan(BETWEEN_EM_DASHES_REGEX),
108 | text: txt
109 | ).replace
110 | end
111 |
112 | def sub_punctuation_between_quotes_slanted(txt)
113 | PragmaticSegmenter::PunctuationReplacer.new(
114 | matches_array: txt.scan(BETWEEN_QUOTE_SLANTED_REGEX),
115 | text: txt
116 | ).replace
117 | end
118 | end
119 | end
120 |
--------------------------------------------------------------------------------
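BetweenPunctuation never splits anything itself; it hands each quoted or bracketed match to PunctuationReplacer, which masks the punctuation inside the match with placeholder symbols (the same symbols SubSymbolsRules later converts back). A hedged sketch of that masking, assuming PunctuationReplacer mutates the text it is given and uses '&ᓷ&' as the placeholder for a masked question mark:

    require 'pragmatic_segmenter'

    # The question mark inside the double quotes is masked so it cannot be
    # mistaken for a sentence boundary while splitting.
    text = 'He asked "Did it work?" twice.'.dup
    PragmaticSegmenter::BetweenPunctuation.new(text: text).replace
    # text should now contain '"Did it work&ᓷ&"' in place of the original quote.
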
/lib/pragmatic_segmenter/cleaner.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | # frozen_string_literal: true
3 |
4 | require_relative 'cleaner/rules'
5 |
6 | module PragmaticSegmenter
7 | # This is an opinionated class that removes errant newlines,
8 | # xhtml, inline formatting, etc.
9 | class Cleaner
10 | include Rules
11 |
12 | attr_reader :text, :doc_type
13 | def initialize(text:, doc_type: nil, language: Languages::Common)
14 | @text = text.dup
15 | @doc_type = doc_type
16 | @language = language
17 | end
18 |
19 | # Clean text of unwanted formatting
20 | #
21 | # Example:
22 | # >> text = "This is a sentence\ncut off in the middle because pdf."
23 | # >> PragmaticSegmenter::Cleaner.new(text: text).clean
24 | # => "This is a sentence cut off in the middle because pdf."
25 | #
26 | # Arguments:
27 | # text: (String) *required
28 | # language: (String) *optional
29 | # (two character ISO 639-1 code e.g. 'en')
30 | # doc_type: (String) *optional
31 | # (e.g. 'pdf')
32 |
33 | def clean
34 | return unless text
35 | remove_all_newlines
36 | replace_double_newlines
37 | replace_newlines
38 | replace_escaped_newlines
39 |
40 | Rule.apply(@text, HTML::All)
41 |
42 | replace_punctuation_in_brackets
43 | Rule.apply(@text, InlineFormattingRule)
44 | clean_quotations
45 | clean_table_of_contents
46 | check_for_no_space_in_between_sentences
47 | clean_consecutive_characters
48 | end
49 |
50 | private
51 |
52 | def abbreviations
53 | @language::Abbreviation::ABBREVIATIONS
54 | end
55 |
56 | def check_for_no_space_in_between_sentences
57 | words = @text.split(' ')
58 | words.each do |word|
59 | search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule)
60 | search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule)
61 | end
62 | @text
63 | end
64 |
65 | def replace_punctuation_in_brackets
66 | @text.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
67 | @text.gsub!(/#{Regexp.escape(match)}/, match.dup.gsub!(/\?/, '&ᓷ&')) if match.include?('?')
68 | end
69 | end
70 |
71 | def search_for_connected_sentences(word, txt, regex, rule)
72 | if word =~ regex
73 | unless URL_EMAIL_KEYWORDS.any? { |web| word =~ /#{web}/ }
74 | unless abbreviations.any? { |abbr| word =~ /#{abbr}/i }
75 | new_word = Rule.apply(word.dup, rule)
76 | txt.gsub!(/#{Regexp.escape(word)}/, new_word)
77 | end
78 | end
79 | end
80 | end
81 |
82 | def remove_all_newlines
83 | remove_newline_in_middle_of_sentence
84 | remove_newline_in_middle_of_word
85 | end
86 |
87 | def remove_newline_in_middle_of_sentence
88 | @text.gsub!(/(?:[^\.])*/) do |match|
89 | match.gsub(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '')
90 | end
91 | @text
92 | end
93 |
94 | def remove_newline_in_middle_of_word
95 | Rule.apply @text, NewLineInMiddleOfWordRule
96 | end
97 |
98 | def replace_escaped_newlines
99 | Rule.apply @text, EscapedNewLineRule, EscapedCarriageReturnRule,
100 | TypoEscapedNewLineRule, TypoEscapedCarriageReturnRule
101 | end
102 |
103 | def replace_double_newlines
104 | Rule.apply @text, DoubleNewLineWithSpaceRule, DoubleNewLineRule
105 | end
106 |
107 | def replace_newlines
108 | if doc_type.eql?('pdf')
109 | remove_pdf_line_breaks
110 | else
111 | Rule.apply @text, NewLineFollowedByPeriodRule,
112 | ReplaceNewlineWithCarriageReturnRule
113 | end
114 | end
115 |
116 | def remove_pdf_line_breaks
117 | Rule.apply @text, NewLineFollowedByBulletRule,
118 |
119 | PDF::NewLineInMiddleOfSentenceRule,
120 | PDF::NewLineInMiddleOfSentenceNoSpacesRule
121 | end
122 |
123 | def clean_quotations
124 | Rule.apply @text, QuotationsFirstRule, QuotationsSecondRule
125 | end
126 |
127 | def clean_table_of_contents
128 | Rule.apply @text, TableOfContentsRule, ConsecutivePeriodsRule,
129 | ConsecutiveForwardSlashRule
130 | end
131 |
132 | def clean_consecutive_characters
133 | Rule.apply @text, ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
134 | end
135 | end
136 | end
137 |
--------------------------------------------------------------------------------
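The doc comment above shows the basic call; the doc_type argument is what switches in the PDF-specific newline rules. A hedged usage sketch whose expected output mirrors the example in that doc comment:

    require 'pragmatic_segmenter'

    # With doc_type: 'pdf', NewLineInMiddleOfSentenceNoSpacesRule turns the
    # stray line break into a space instead of a segment boundary.
    text = "This is a sentence\ncut off in the middle because pdf."
    PragmaticSegmenter::Cleaner.new(text: text, doc_type: 'pdf').clean
    # expected: "This is a sentence cut off in the middle because pdf."
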
/lib/pragmatic_segmenter/processor.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | # frozen_string_literal: true
3 |
4 | require 'pragmatic_segmenter/punctuation_replacer'
5 | require 'pragmatic_segmenter/between_punctuation'
6 |
7 |
8 | require 'pragmatic_segmenter/list'
9 | require 'pragmatic_segmenter/abbreviation_replacer'
10 | require 'pragmatic_segmenter/exclamation_words'
11 |
12 | module PragmaticSegmenter
13 | # This class processes the text into segments.
14 | class Processor
15 |
16 | attr_reader :text
17 | def initialize(language: Languages::Common)
18 | @language = language
19 | end
20 |
21 | def process(text:)
22 | @text = List.new(text: text).add_line_break
23 | replace_abbreviations
24 | replace_numbers
25 | replace_continuous_punctuation
26 | replace_periods_before_numeric_references
27 | Rule.apply(@text, @language::Abbreviations::WithMultiplePeriodsAndEmailRule)
28 | Rule.apply(@text, @language::GeoLocationRule)
29 | Rule.apply(@text, @language::FileFormatRule)
30 | split_into_segments
31 | end
32 |
33 | private
34 |
35 | def split_into_segments
36 | check_for_parens_between_quotes(@text).split("\r")
37 | .map! { |segment| Rule.apply(segment, @language::SingleNewLineRule, @language::EllipsisRules::All) }
38 | .map { |segment| check_for_punctuation(segment) }.flatten
39 | .map! { |segment| Rule.apply(segment, @language::SubSymbolsRules::All) }
40 | .map { |segment| post_process_segments(segment) }
41 | .flatten.compact.delete_if(&:empty?)
42 | .map! { |segment| Rule.apply(segment, @language::SubSingleQuoteRule) }
43 | end
44 |
45 | def post_process_segments(txt)
46 | return txt if txt.length < 2 && txt =~ /\A[a-zA-Z]*\Z/
47 | return if consecutive_underscore?(txt) || txt.length < 2
48 | Rule.apply(
49 | txt,
50 | @language::ReinsertEllipsisRules::All,
51 | @language::ExtraWhiteSpaceRule
52 | )
53 |
54 | if txt =~ @language::QUOTATION_AT_END_OF_SENTENCE_REGEX
55 | txt.split(@language::SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
56 | else
57 | txt.tr("\n", '').strip
58 | end
59 | end
60 |
61 | def check_for_parens_between_quotes(txt)
62 | return txt unless txt =~ @language::PARENS_BETWEEN_DOUBLE_QUOTES_REGEX
63 | txt.gsub!(@language::PARENS_BETWEEN_DOUBLE_QUOTES_REGEX) do |match|
64 | match.gsub(/\s(?=\()/, "\r").gsub(/(?<=\))\s/, "\r")
65 | end
66 | end
67 |
68 | def replace_continuous_punctuation
69 | @text.gsub!(@language::CONTINUOUS_PUNCTUATION_REGEX) do |match|
70 | match.gsub(/!/, '&ᓴ&').gsub(/\?/, '&ᓷ&')
71 | end
72 | end
73 |
74 | def replace_periods_before_numeric_references
75 | @text.gsub!(@language::NUMBERED_REFERENCE_REGEX, "∯\\2\r\\7")
76 | end
77 |
78 | def consecutive_underscore?(txt)
79 | # Rubular: http://rubular.com/r/fTF2Ff3WBL
80 | txt.gsub(/_{3,}/, '').length.eql?(0)
81 | end
82 |
83 | def check_for_punctuation(txt)
84 | if @language::Punctuations.any? { |p| txt.include?(p) }
85 | process_text(txt)
86 | else
87 | txt
88 | end
89 | end
90 |
91 | def process_text(txt)
92 | txt << 'ȸ' unless @language::Punctuations.any? { |p| txt[-1].include?(p) }
93 | ExclamationWords.apply_rules(txt)
94 | between_punctuation(txt)
95 | txt = Rule.apply(
96 | txt,
97 | @language::DoublePunctuationRules::All,
98 | @language::QuestionMarkInQuotationRule,
99 | @language::ExclamationPointRules::All
100 | )
101 | txt = List.new(text: txt).replace_parens
102 | sentence_boundary_punctuation(txt)
103 | end
104 |
105 | def replace_numbers
106 | Rule.apply @text, @language::Numbers::All
107 | end
108 |
109 | def abbreviations_replacer
110 | if defined? @language::AbbreviationReplacer
111 | @language::AbbreviationReplacer
112 | else
113 | AbbreviationReplacer
114 | end
115 | end
116 |
117 | def replace_abbreviations
118 | @text = abbreviations_replacer.new(text: @text, language: @language).replace
119 | end
120 |
121 | def between_punctuation_processor
122 | if defined? @language::BetweenPunctuation
123 | @language::BetweenPunctuation
124 | else
125 | BetweenPunctuation
126 | end
127 | end
128 |
129 | def between_punctuation(txt)
130 | between_punctuation_processor.new(text: txt).replace
131 | end
132 |
133 | def sentence_boundary_punctuation(txt)
134 | txt = Rule.apply txt, @language::ReplaceColonBetweenNumbersRule if defined? @language::ReplaceColonBetweenNumbersRule
135 | txt = Rule.apply txt, @language::ReplaceNonSentenceBoundaryCommaRule if defined? @language::ReplaceNonSentenceBoundaryCommaRule
136 |
137 | txt.scan(@language::SENTENCE_BOUNDARY_REGEX)
138 | end
139 | end
140 | end
141 |
--------------------------------------------------------------------------------
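Processor is normally driven by the Segmenter after cleaning, but it can be exercised directly; note that language: takes a language module, not the two-letter code the Segmenter accepts. A hedged sketch with an expected, not asserted, result:

    require 'pragmatic_segmenter'

    # process returns the array of segments for already-cleaned text.
    processor = PragmaticSegmenter::Processor.new(
      language: PragmaticSegmenter::Languages::Common
    )
    processor.process(text: "Hello world. How are you? I am fine!")
    # expected: ["Hello world.", "How are you?", "I am fine!"]
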
/lib/pragmatic_segmenter/languages/deutsch.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Deutsch
6 | include Languages::Common
7 |
8 | module Abbreviation
9 | ABBREVIATIONS = Set.new(['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt', 'univ.-prof', 'o.univ.-prof', 'ao.univ.prof', 'ass.prof', 'hon.prof', 'univ.-doz', 'univ.ass', 'stud.ass', 'projektass', 'ass', 'di', 'dipl.-ing', 'mag']).freeze
10 | NUMBER_ABBREVIATIONS = Set.new(['art', 'ca', 'no', 'nos', 'nr', 'pp']).freeze
11 | PREPOSITIVE_ABBREVIATIONS = [].freeze
12 | end
13 |
14 | # Rubular: http://rubular.com/r/OdcXBsub0w
15 | BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/
16 |
17 | # Rubular: http://rubular.com/r/2UskIupGgP
18 | SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/
19 |
20 | # Rubular: http://rubular.com/r/TkZomF9tTM
21 | BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
22 |
23 |
24 | module Numbers
25 | # Rubular: http://rubular.com/r/hZxoyQwKT1
26 | NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
27 |
28 | # Rubular: http://rubular.com/r/ityNMwdghj
29 | NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
30 |
31 | All = [
32 | Common::Numbers::All,
33 | NumberPeriodSpaceRule,
34 | NegativeNumberPeriodSpaceRule
35 | ]
36 | end
37 |
38 | MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember'].freeze
39 |
40 | # Rubular: http://rubular.com/r/B4X33QKIL8
41 | SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')
42 |
43 | # Rubular: http://rubular.com/r/iUNSkCuso0
44 | SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
45 |
46 | class Processor < PragmaticSegmenter::Processor
47 | private
48 |
49 | def replace_numbers
50 | Rule.apply @text, Numbers::All
51 |
52 | replace_period_in_deutsch_dates
53 | end
54 |
55 | def replace_period_in_deutsch_dates
56 | MONTHS.each do |month|
57 | # Rubular: http://rubular.com/r/zlqgj7G5dA
58 | @text.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
59 | end
60 | end
61 | end
62 |
63 | class AbbreviationReplacer < AbbreviationReplacer
64 |
65 | SENTENCE_STARTERS = %w(
66 | Am Auch Auf Bei Da Das Der Die Ein Eine Es Für Heute Ich Im In
67 | Ist Jetzt Mein Mit Nach So Und Warum Was Wenn Wer Wie Wir
68 | ).freeze
69 |
70 | def replace
71 | @text = Rule.apply(
72 | text,
73 | @language::PossessiveAbbreviationRule,
74 | @language::SingleLetterAbbreviationRules::All,
75 | SingleLowerCaseLetterRule,
76 | SingleLowerCaseLetterAtStartOfLineRule)
77 |
78 | @text = search_for_abbreviations_in_string(@text)
79 | @text = replace_multi_period_abbreviations(@text)
80 | Rule.apply(@text, Languages::Common::AmPmRules::All)
81 | replace_abbreviation_as_sentence_boundary(@text)
82 | end
83 |
84 | private
85 |
86 | def scan_for_replacements(txt, am, index, character_array)
87 | txt.gsub!(/(?<=#{am})\.(?=\s)/, '∯')
88 | txt
89 | end
90 | end
91 |
92 | class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
93 | private
94 |
95 | def btwn_dbl_quote(txt)
96 | if txt.include?('„')
97 | btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_DE_REGEX)
98 | txt.scan(SPLIT_DOUBLE_QUOTES_DE_REGEX).each do |q|
99 | btwn_dbl_quote << q
100 | end
101 | elsif txt.include?(',,')
102 | btwn_dbl_quote = txt.scan(BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX)
103 | end
104 | btwn_dbl_quote
105 | end
106 | end
107 | end
108 | end
109 | end
110 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/abbreviation_replacer.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | # frozen_string_literal: true
3 |
4 | require 'unicode'
5 |
6 | module PragmaticSegmenter
7 | # This class searches for periods within an abbreviation and
8 | # replaces the periods.
9 | class AbbreviationReplacer
10 |
11 | attr_reader :text
12 | def initialize(text:, language: )
13 | @text = text.dup
14 | @language = language
15 | end
16 |
17 | def replace
18 | Rule.apply(@text,
19 | @language::PossessiveAbbreviationRule,
20 | @language::KommanditgesellschaftRule,
21 | @language::SingleLetterAbbreviationRules::All)
22 |
23 | @text = search_for_abbreviations_in_string(@text)
24 | @text = replace_multi_period_abbreviations(@text)
25 | Rule.apply(@text, @language::AmPmRules::All)
26 | replace_abbreviation_as_sentence_boundary(@text)
27 | end
28 |
29 | private
30 |
31 | def search_for_abbreviations_in_string(txt)
32 | original = txt.dup
33 | downcased = Unicode::downcase(txt)
34 | @language::Abbreviation::ABBREVIATIONS.each do |abbreviation|
35 | stripped = abbreviation.strip
36 | next unless downcased.include?(stripped)
37 | abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(stripped)}/i)
38 | next if abbrev_match.empty?
39 | next_word_start = /(?<=#{Regexp.escape(stripped)} ).{1}/
40 | character_array = @text.scan(next_word_start)
41 | abbrev_match.each_with_index do |am, index|
42 | txt = scan_for_replacements(txt, am, index, character_array)
43 | end
44 | end
45 | txt
46 | end
47 |
48 | def scan_for_replacements(txt, am, index, character_array)
49 | character = character_array[index]
50 | prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS
51 | number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS
52 | upper = /[[:upper:]]/.match(character.to_s)
53 | if upper.nil? || prepositive.include?(Unicode::downcase(am.strip))
54 | if prepositive.include?(Unicode::downcase(am.strip))
55 | txt = replace_prepositive_abbr(txt, am)
56 | elsif number_abbr.include?(Unicode::downcase(am.strip))
57 | txt = replace_pre_number_abbr(txt, am)
58 | else
59 | txt = replace_period_of_abbr(txt, am)
60 | end
61 | end
62 | txt
63 | end
64 |
65 | def replace_abbreviation_as_sentence_boundary(txt)
66 | # As we are being conservative and keeping ambiguous
67 | # sentence boundaries as one sentence instead of
68 | # splitting into two, we can split at words that
69 | # we know for certain never follow these abbreviations.
70 | # Some might say that the set of words that follow an
71 | # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
72 | # the set of words that could start a sentence and
73 | # never follow U.S. However, we are being conservative
74 | # and not splitting by default, so we need to look for places
75 | # where we definitely can split. Obviously SENTENCE_STARTERS
76 | # will never cover all cases, but as the gem is named
77 | # 'Pragmatic Segmenter' we need to be pragmatic
78 | # and try to cover the words that most often start a
79 | # sentence but could never follow one of the abbreviations below.
80 |
81 | # Rubular: http://rubular.com/r/PkBQ3PVBS8
82 | @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
83 | escaped = Regexp.escape(word)
84 | regex = /(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s#{escaped}\s)/
85 | txt.gsub!(regex, '\1.')
86 | end
87 | txt
88 | end
89 |
90 | def replace_multi_period_abbreviations(txt)
91 | mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
92 | return txt if mpa.empty?
93 | mpa.each do |r|
94 | txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
95 | end
96 | txt
97 | end
98 |
99 | def replace_pre_number_abbr(txt, abbr)
100 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
101 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
102 | txt
103 | end
104 |
105 | def replace_prepositive_abbr(txt, abbr)
106 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
107 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
108 | txt
109 | end
110 |
111 | def replace_period_of_abbr(txt, abbr)
112 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
113 | txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
114 | txt
115 | end
116 |
117 | def replace_possessive_abbreviations(txt)
118 | txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
119 | txt
120 | end
121 | end
122 | end
123 |
--------------------------------------------------------------------------------
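The SENTENCE_STARTERS rationale above is easiest to see with a pair of inputs through the Segmenter: "How" is in the English sentence-starter list, so a split after "U.S." is allowed, while "Government" is not, so the second text stays whole. Expected outputs are hedged; they follow the gem's English golden rules:

    require 'pragmatic_segmenter'

    PragmaticSegmenter::Segmenter.new(
      text: "I live in the U.S. How about you?", language: 'en'
    ).segment
    # expected: ["I live in the U.S.", "How about you?"]

    PragmaticSegmenter::Segmenter.new(
      text: "I work for the U.S. Government in Virginia.", language: 'en'
    ).segment
    # expected: ["I work for the U.S. Government in Virginia."]
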
/spec/pragmatic_segmenter/languages/kazakh_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Kazakh, "(kk)" do
4 |
5 | context "Golden Rules" do
6 | it "Simple period to end sentence #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "Мұхитқа тікелей шыға алмайтын мемлекеттердің ішінде Қазақстан - ең үлкені.", language: "kk")
8 | expect(ps.segment).to eq(["Мұхитқа тікелей шыға алмайтын мемлекеттердің ішінде Қазақстан - ең үлкені."])
9 | end
10 |
11 | it "Question mark to end sentence #002" do
12 | ps = PragmaticSegmenter::Segmenter.new(text: "Оқушылар үйі, Достық даңғылы, Абай даналығы, ауыл шаруашылығы – кім? не?", language: "kk")
13 | expect(ps.segment).to eq(["Оқушылар үйі, Достық даңғылы, Абай даналығы, ауыл шаруашылығы – кім?", "не?"])
14 | end
15 |
16 | it "Parenthetical inside sentence #003" do
17 | ps = PragmaticSegmenter::Segmenter.new(text: "Әр түрлі өлшемнің атауы болып табылатын м (метр), см (сантиметр), кг (киллограмм), т (тонна), га (гектар), ц (центнер), т. б. (тағы басқа), тәрізді белгілер де қысқарған сөздер болып табылады.", language: "kk")
18 | expect(ps.segment).to eq(["Әр түрлі өлшемнің атауы болып табылатын м (метр), см (сантиметр), кг (киллограмм), т (тонна), га (гектар), ц (центнер), т. б. (тағы басқа), тәрізді белгілер де қысқарған сөздер болып табылады."])
19 | end
20 |
21 | it "Two letter abbreviation to end sentence #004" do
22 | ps = PragmaticSegmenter::Segmenter.new(text: "Мысалы: обкомға (облыстық комитетке) барды, ауаткомда (аудандық атқару комитетінде) болды, педучилищеге (педагогтік училищеге) түсті, медпункттің (медициналық пункттің) алдында т. б.", language: "kk")
23 | expect(ps.segment).to eq(["Мысалы: обкомға (облыстық комитетке) барды, ауаткомда (аудандық атқару комитетінде) болды, педучилищеге (педагогтік училищеге) түсті, медпункттің (медициналық пункттің) алдында т. б."])
24 | end
25 |
26 | it "Number as non sentence boundary #005" do
27 | ps = PragmaticSegmenter::Segmenter.new(text: "Елдің жалпы ішкі өнімі ЖІӨ (номинал) = $225.619 млрд (2014)", language: "kk")
28 | expect(ps.segment).to eq(["Елдің жалпы ішкі өнімі ЖІӨ (номинал) = $225.619 млрд (2014)"])
29 | end
30 |
31 | it "No whitespace between sentence boundary #006" do
32 | ps = PragmaticSegmenter::Segmenter.new(text: "Ресейдiң әлеуметтiк-экономикалық жағдайы.XVIII ғасырдың бiрiншi ширегiнде Ресейге тән нәрсе.", language: "kk")
33 | expect(ps.segment).to eq(["Ресейдiң әлеуметтiк-экономикалық жағдайы.", "XVIII ғасырдың бiрiншi ширегiнде Ресейге тән нәрсе."])
34 | end
35 |
36 | it "Dates within sentence #007" do
37 | ps = PragmaticSegmenter::Segmenter.new(text: "(«Егемен Қазақстан», 7 қыркүйек 2012 жыл. №590-591); Бұл туралы кеше санпедқадағалау комитетінің облыыстық департаменті хабарлады. («Айқын», 23 сəуір 2010 жыл. № 70).", language: "kk")
38 | expect(ps.segment).to eq(["(«Егемен Қазақстан», 7 қыркүйек 2012 жыл. №590-591); Бұл туралы кеше санпедқадағалау комитетінің облыыстық департаменті хабарлады.", "(«Айқын», 23 сəуір 2010 жыл. № 70)."])
39 | end
40 |
41 | it "Multi period abbreviation within sentence #008" do
42 | ps = PragmaticSegmenter::Segmenter.new(text: "Иран революциясы (1905 — 11) және азаматтық қозғалыс (1918 — 21) кезінде А. Фарахани, М. Кермани, М. Т. Бехар, т.б. ақындар демократиялық идеяның жыршысы болды.", language: "kk")
43 | expect(ps.segment).to eq(["Иран революциясы (1905 — 11) және азаматтық қозғалыс (1918 — 21) кезінде А. Фарахани, М. Кермани, М. Т. Бехар, т.б. ақындар демократиялық идеяның жыршысы болды."])
44 | end
45 |
46 | it "Web addresses #009" do
47 | ps = PragmaticSegmenter::Segmenter.new(text: "Владимир Федосеев: Аттар магиясы енді жоқ http://www.vremya.ru/2003/179/10/80980.html", language: "kk")
48 | expect(ps.segment).to eq(["Владимир Федосеев: Аттар магиясы енді жоқ http://www.vremya.ru/2003/179/10/80980.html"])
49 | end
50 |
51 | it "Question mark not at end of sentence #010" do
52 | ps = PragmaticSegmenter::Segmenter.new(text: "Бірақ оның енді не керегі бар? — деді.", language: "kk")
53 | expect(ps.segment).to eq(["Бірақ оның енді не керегі бар? — деді."])
54 | end
55 |
56 | it "Exclamation mark not at end of sentence #011" do
57 | ps = PragmaticSegmenter::Segmenter.new(text: "Сондықтан шапаныма жегізіп отырғаным! - деп, жауап береді.", language: "kk")
58 | expect(ps.segment).to eq(["Сондықтан шапаныма жегізіп отырғаным! - деп, жауап береді."])
59 | end
60 | end
61 |
62 | describe '#segment' do
63 | it 'correctly segments text #001' do
64 | ps = PragmaticSegmenter::Segmenter.new(text: "Б.з.б. 6 – 3 ғасырларда конфуцийшілдік, моизм, легизм мектептерінің қалыптасуы нәтижесінде Қытай философиясы пайда болды.", language: 'kk')
65 | expect(ps.segment).to eq(["Б.з.б. 6 – 3 ғасырларда конфуцийшілдік, моизм, легизм мектептерінің қалыптасуы нәтижесінде Қытай философиясы пайда болды."])
66 | end
67 |
68 | it 'correctly segments text #002' do
69 | ps = PragmaticSegmenter::Segmenter.new(text: "'Та марбута' тек сөз соңында екі түрде жазылады:", language: "kk")
70 | expect(ps.segment).to eq(["'Та марбута' тек сөз соңында екі түрде жазылады:"])
71 | end
72 | end
73 | end
74 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/japanese_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Japanese, "(ja)" do
4 |
5 | context "Golden Rules" do
6 | it "Simple period to end sentence #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "これはペンです。それはマーカーです。", language: "ja")
8 | expect(ps.segment).to eq(["これはペンです。", "それはマーカーです。"])
9 | end
10 |
11 | it "Question mark to end sentence #002" do
12 | ps = PragmaticSegmenter::Segmenter.new(text: "それは何ですか?ペンですか?", language: "ja")
13 | expect(ps.segment).to eq(["それは何ですか?", "ペンですか?"])
14 | end
15 |
16 | it "Exclamation point to end sentence #003" do
17 | ps = PragmaticSegmenter::Segmenter.new(text: "良かったね!すごい!", language: "ja")
18 | expect(ps.segment).to eq(["良かったね!", "すごい!"])
19 | end
20 |
21 | it "Quotation #004" do
22 | ps = PragmaticSegmenter::Segmenter.new(text: "自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。2%台後半を目指すとする方向で最終調整に入りました。", language: "ja")
23 | expect(ps.segment).to eq(["自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。", "2%台後半を目指すとする方向で最終調整に入りました。"])
24 | end
25 |
26 | it "Errant newlines in the middle of sentences #005" do
27 | ps = PragmaticSegmenter::Segmenter.new(text: "これは父の\n家です。", language: "ja")
28 | expect(ps.segment).to eq(["これは父の家です。"])
29 | end
30 | end
31 |
32 | describe '#segment' do
33 | it 'correctly segments text #001' do
34 | ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です。 \nこれは山です(これは山です。これは山です)。これは山です、これは山です、これは山です、これは山です(これは山です。これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です。 \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です。 \n※1 これは山です。 \n2.)これは山です、これは山です、これは山です、これは山です。 \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です。 \n4.)これは山です、これは山です(これは山です、これは山です、これは山です。これは山です)これは山です、これは山です(これは山です、これは山です)。 \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です。 \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja')
35 | expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です。", "これは山です(これは山です。これは山です)。", "これは山です、これは山です、これは山です、これは山です(これは山です。これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です。", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です。", "※1 これは山です。", "2.)これは山です、これは山です、これは山です、これは山です。", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です。", "4.)これは山です、これは山です(これは山です、これは山です、これは山です。これは山です)これは山です、これは山です(これは山です、これは山です)。", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です。", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"])
36 | end
37 |
38 | it 'correctly segments text #002' do
39 | ps = PragmaticSegmenter::Segmenter.new(text: "フフーの\n主たる債務", language: 'ja')
40 | expect(ps.segment).to eq(["フフーの主たる債務"])
41 | end
42 |
43 | it 'correctly segments text #003' do
44 | ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です. \nこれは山です(これは山です.これは山です).これは山です、これは山です、これは山です、これは山です(これは山です.これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です. \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です. \n※1 これは山です. \n2.)これは山です、これは山です、これは山です、これは山です. \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です. \n4.)これは山です、これは山です(これは山です、これは山です、これは山です.これは山です)これは山です、これは山です(これは山です、これは山です). \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です. \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja')
45 | expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です.", "これは山です(これは山です.これは山です).", "これは山です、これは山です、これは山です、これは山です(これは山です.これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です.", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です.", "※1 これは山です.", "2.)これは山です、これは山です、これは山です、これは山です.", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です.", "4.)これは山です、これは山です(これは山です、これは山です、これは山です.これは山です)これは山です、これは山です(これは山です、これは山です).", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です.", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"])
46 | end
47 |
48 | it 'correctly segments text #004' do
49 | ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です! \nこれは山です(これは山です!これは山です)!これは山です、これは山です、これは山です、これは山です(これは山です!これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です! \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です! \n※1 これは山です! \n2.)これは山です、これは山です、これは山です、これは山です! \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です! \n4.)これは山です、これは山です(これは山です、これは山です、これは山です!これは山です)これは山です、これは山です(これは山です、これは山です)! \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です! \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja')
50 | expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です!", "これは山です(これは山です!これは山です)!", "これは山です、これは山です、これは山です、これは山です(これは山です!これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です!", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です!", "※1 これは山です!", "2.)これは山です、これは山です、これは山です、これは山です!", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です!", "4.)これは山です、これは山です(これは山です、これは山です、これは山です!これは山です)これは山です、これは山です(これは山です、これは山です)!", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です!", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"])
51 | end
52 | end
53 | end
54 |
--------------------------------------------------------------------------------
/spec/performance_spec.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | require 'benchmark'
3 | require 'spec_helper'
4 | require 'stackprof'
5 |
6 | RSpec.describe PragmaticSegmenter::Segmenter do
7 |
8 | # Speed benchmarks tests
9 |
10 | # it 'is fast' do
11 | # string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 100
12 | # benchmark do
13 | # PragmaticSegmenter::Segmenter.new(text: string, language: 'en').segment
14 | # end
15 | # data = StackProf.run(mode: :cpu, interval: 1000) do
16 | # string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 10
17 | # PragmaticSegmenter::Segmenter.new(text: string, language: 'en').segment
18 | # end
19 | # puts StackProf::Report.new(data).print_text
20 | # end
21 |
22 | end
23 |
24 | def benchmark(&block)
25 | block.call
26 | time = Benchmark.realtime { block.call }
27 | puts "RUNTIME: #{time}"
28 | end
29 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | 0.3.24 (2024-08-12):
2 |
3 | * Bug Fix: Catastrophic backtracking in regular expression for numerical references
4 | * Improvement: Remove unicode dependency
5 |
6 | 0.3.23 (2021-05-03):
7 |
8 | * Improvement: Refactor for Ruby 3.0 compatibility
9 |
10 | 0.3.22 (2018-09-23):
11 |
12 | * Improvement: Initial support for Kazakh
13 |
14 | 0.3.21 (2018-08-30):
15 |
16 | * Improvement: Add support for file formats
17 | * Improvement: Add support for numeric references at the end of a sentence (i.e. Wikipedia references)
18 |
19 | 0.3.20 (2018-08-28):
20 |
21 | * Improvement: Handle slanted single quotation as a single quote
22 | * Bug Fix: The text contains a single character abbreviation as part of a list
23 | * Bug Fix: Chinese book quotes
24 | * Improvement: Add viz as abbreviation
25 |
26 | 0.3.19 (2018-07-19):
27 |
28 | * Bug Fix: A parenthetical following an abbreviation is now included as part of the same segment. Example: "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)." is now treated as one segment.
29 |
30 | 0.3.18 (2018-03-27):
31 |
32 | * Improvement: Performance optimizations
33 |
34 | 0.3.17 (2017-12-07):
35 |
36 | * Bug Fix: Regex for parsing HTML
37 |
38 | 0.3.16 (2017-11-13):
39 |
40 | * Improvement: Support for Danish
41 |
42 | 0.3.15 (2017-06-28):
43 |
44 | * Improvement: Handle em dashes that appear in the middle of a sentence and include a sentence ending punctuation mark
45 |
46 | 0.3.14 (2017-06-28):
47 |
48 | * Improvement: Add English abbreviation Rs. to denote the Indian currency
49 |
50 | 0.3.13 (2017-01-17):
51 |
52 | * Bug Fix: Unexpected sentence break between abbreviation and hyphen
53 |
54 | 0.3.12 (2016-12-12):
55 |
56 | * Bug Fix: Issue with words with leading apostrophes
57 |
58 | 0.3.11 (2016-11-08):
59 |
60 | * Improvement: Update German abbreviation list
61 | * Bug Fix: Refactor 'remove_newline_in_middle_of_sentence' method
62 |
63 | 0.3.10 (2016-07-01):
64 |
65 | * Bug Fix: Change load order of dependencies
66 |
67 | 0.3.9 (2016-06-16):
68 |
69 | * Improvement: Remove `guard-rspec` development dependency
70 |
71 | 0.3.8 (2016-03-03):
72 |
73 | * Bug Fix: Fix bug that cleaned away single letter segments
74 |
75 | 0.3.7 (2016-01-12):
76 |
77 | * Improvement: Add `unicode` gem and use it for downcasing to better handle cyrillic languages
78 |
79 | 0.3.6 (2016-01-05):
80 |
81 | * Improvement: Refactor SENTENCE_STARTERS to each individual language and add SENTENCE_STARTERS for German
82 |
83 | 0.3.5 (2016-01-04):
84 |
85 | * Performance: Reduce GC by replacing #gsub with #gsub! where possible
86 |
87 | 0.3.4 (2015-12-22):
88 |
89 | * Improvement: Large refactor
90 |
91 | 0.3.3 (2015-05-27):
92 |
93 | * Bug Fix: Fix cleaner bug
94 |
95 | 0.3.2 (2015-05-27):
96 |
97 | * Improvement: Add English abbreviations
98 |
99 | 0.3.1 (2015-03-02):
100 |
101 | * Bug Fix: Fix undefined method 'gsub!' for nil:NilClass issue
102 |
103 | 0.3.0 (2015-02-04):
104 |
105 | * Improvement: Add support for square brackets
106 | * Improvement: Add support for continuous exclamation points or question marks or combinations of both
107 | * Bug Fix: Fix Roman numeral support
108 | * Improvement: Add English abbreviations
109 |
110 | 0.2.0 (2015-01-26):
111 |
112 | * Improvement: Add Dutch Golden Rules and abbreviations
113 | * Improvement: Update README with additional tools
114 | * Improvement: Update segmentation test scores in README with results of new Golden Rule tests
115 | * Improvement: Add Polish abbreviations
116 |
117 | 0.1.8 (2015-01-22):
118 |
119 | * Bug Fix: Fix bug in splitting new sentence after single quotes
120 |
121 | 0.1.7 (2015-01-22):
122 |
123 | * Improvement: Add Alice in Wonderland specs
124 | * Bug Fix: Fix parenthesis between double quotations bug
125 | * Bug Fix: Fix split after quotation ending in dash bug
126 |
127 | 0.1.6 (2015-01-16):
128 |
129 | * Bug Fix: Fix bug in numbered list finder (ignore longer digits)
130 |
131 | 0.1.5 (2015-01-13):
132 |
133 | * Bug Fix: Fix comma at end of quotation bug
134 |
135 | 0.1.4 (2015-01-13):
136 |
137 | * Bug Fix: Fix missing abbreviations
138 |
139 | 0.1.3 (2015-01-13):
140 |
141 | * Improvement: Improve punctuation in bracket replacement
142 |
143 | 0.1.2 (2015-01-13):
144 |
145 | * Bug Fix: Fix missing abbreviations
146 | * Improvement: Add footnote rule to `cleaner.rb`
147 |
148 | 0.1.1 (2015-01-12):
149 |
150 | * Bug Fix: Fix handling of German dates
151 |
152 | 0.1.0 (2015-01-12):
153 |
154 | * Improvement: Add Kommanditgesellschaft Rule
155 |
156 | 0.0.9 (2015-01-12):
157 |
158 | * Improvement: Improve handling of alphabetical and roman numeral lists
159 |
160 | 0.0.8 (2015-01-12):
161 |
162 | * Bug Fix: Fix error in `list.rb`
163 |
164 | 0.0.7 (2015-01-12):
165 |
166 | * Improvement: Add change log to README
167 | * Improvement: Add passing spec for new end of sentence abbreviation (EN)
168 | * Improvement: Add roman numeral list support
169 |
170 | 0.0.6 (2015-01-11):
171 |
172 | * Improvement: Add rule for escaped newlines that include a space between the slash and character
173 | * Improvement: Add Golden Rule #52 and code to make it pass
174 |
175 | 0.0.5 (2015-01-10):
176 |
177 | * Improvement: Make symbol substitution safer
178 | * Improvement: Refactor `process.rb`
179 | * Improvement: Update cleaner with escaped newline rules
180 |
181 | 0.0.4 (2015-01-10):
182 |
183 | * Improvement: Add `ConsecutiveForwardSlashRule` to cleaner
184 | * Improvement: Refactor `segmenter.rb` and `process.rb`
185 |
186 | 0.0.3 (2015-01-07):
187 |
188 | * Improvement: Add travis.yml
189 | * Improvement: Add Code Climate
190 | * Improvement: Update README
191 |
192 | 0.0.2 (2015-01-07):
193 |
194 | * Improvement: Major design refactor
195 |
196 | 0.0.1 (2015-01-07):
197 |
198 | * Initial Release
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/common.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require_relative 'common/numbers'
4 | require_relative 'common/ellipsis'
5 |
6 | module PragmaticSegmenter
7 | module Languages
8 | module Common
9 | # This constant holds the sentence-ending punctuation marks.
10 | Punctuations = ['。', '.', '.', '!', '!', '?', '?'].freeze
11 |
12 | # Defines the abbreviations for each language (if available)
13 | module Abbreviation
14 | ABBREVIATIONS = Set.new(['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']).freeze
15 | PREPOSITIVE_ABBREVIATIONS = Set.new(['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']).freeze
16 | NUMBER_ABBREVIATIONS = Set.new(['art', 'ext', 'no', 'nos', 'p', 'pp']).freeze
17 | end
18 |
19 | module Abbreviations
20 | # Rubular: http://rubular.com/r/EUbZCNfgei
21 | WithMultiplePeriodsAndEmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
22 | end
23 |
24 | # Rubular: http://rubular.com/r/G2opjedIm9
25 | GeoLocationRule = Rule.new(/(?<=[a-zA-Z]°)\.(?=\s*\d+)/, '∯')
26 |
27 | FileFormatRule = Rule.new(/(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)/, '∯')
28 |
29 | SingleNewLineRule = Rule.new(/\n/, 'ȹ')
30 |
31 | module DoublePunctuationRules
32 | FirstRule = Rule.new(/\?!/, '☉')
33 | SecondRule = Rule.new(/!\?/, '☈')
34 | ThirdRule = Rule.new(/\?\?/, '☇')
35 | ForthRule = Rule.new(/!!/, '☄')
36 |
37 | All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
38 | end
39 |
40 |
41 | # Rubular: http://rubular.com/r/aXPUGm6fQh
42 | QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
43 |
44 |
45 | module ExclamationPointRules
46 | # Rubular: http://rubular.com/r/XS1XXFRfM2
47 | InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
48 |
49 | # Rubular: http://rubular.com/r/sl57YI8LkA
50 | BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
51 |
52 | # Rubular: http://rubular.com/r/f9zTjmkIPb
53 | MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
54 |
55 | All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
56 | end
57 |
58 | module SubSymbolsRules
59 | Period = Rule.new(/∯/, '.')
60 | ArabicComma = Rule.new(/♬/, '،')
61 | SemiColon = Rule.new(/♭/, ':')
62 | FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
63 | SpecialPeriod = Rule.new(/&ᓱ&/, '.')
64 | FullWidthExclamation = Rule.new(/&ᓳ&/, '!')
65 | ExclamationPoint = Rule.new(/&ᓴ&/, '!')
66 | QuestionMark = Rule.new(/&ᓷ&/, '?')
67 | FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?')
68 | MixedDoubleQE = Rule.new(/☉/, '?!')
69 | MixedDoubleQQ = Rule.new(/☇/, '??')
70 | MixedDoubleEQ = Rule.new(/☈/, '!?')
71 | MixedDoubleEE = Rule.new(/☄/, '!!')
72 | LeftParens = Rule.new(/&✂&/, '(')
73 | RightParens = Rule.new(/&⌬&/, ')')
74 |         TemporaryEndingPunctuation = Rule.new('ȸ', '')
75 | Newline = Rule.new(/ȹ/, "\n")
76 |
77 | All = [ Period, ArabicComma,
78 | SemiColon, FullWidthPeriod,
79 | SpecialPeriod, FullWidthExclamation,
80 | ExclamationPoint, QuestionMark,
81 | FullWidthQuestionMark, MixedDoubleQE,
82 | MixedDoubleQQ, MixedDoubleEQ,
83 | MixedDoubleEE, LeftParens,
84 |                 RightParens, TemporaryEndingPunctuation,
85 | Newline ]
86 | end
87 |
88 |
89 | module ReinsertEllipsisRules
90 | SubThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
91 | SubThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
92 | SubFourSpacePeriod = Rule.new(/♝/, '. . . .')
93 | SubTwoConsecutivePeriod = Rule.new(/☏/, '..')
94 | SubOnePeriod = Rule.new(/∮/, '.')
95 |
96 | All = [ SubThreeConsecutivePeriod, SubThreeSpacePeriod,
97 | SubFourSpacePeriod, SubTwoConsecutivePeriod,
98 | SubOnePeriod ]
99 | end
100 |
101 | ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
102 |
103 | SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
104 |
105 | class AbbreviationReplacer < AbbreviationReplacer
106 | SENTENCE_STARTERS = %w(
107 | A Being Did For He How However I In It Millions More She That The
108 | There They We What When Where Who Why
109 | ).freeze
110 | end
111 |
112 | end
113 | end
114 | end
115 |
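116 | # A minimal usage sketch of the rule modules above (illustrative only). It
117 | # assumes PragmaticSegmenter::Rule.apply(text, *rules) applies each rule's
118 | # pattern/replacement in order, as it is used in list.rb; the masking symbols
119 | # are the ones defined in DoublePunctuationRules and SubSymbolsRules.
120 | #
121 | #   common = PragmaticSegmenter::Languages::Common
122 | #   text   = "Wait?! Are you sure??".dup
123 | #   text   = PragmaticSegmenter::Rule.apply(text, *common::DoublePunctuationRules::All)
124 | #   # "?!" is now masked as ☉ and "??" as ☇, so they survive segmentation intact
125 | #   text   = PragmaticSegmenter::Rule.apply(text, *common::SubSymbolsRules::All)
126 | #   # the masked symbols are substituted back to the original punctuation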
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/arabic_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Arabic, '(ar)' do
4 |
5 | context "Golden Rules" do
6 | it "Regular punctuation #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "سؤال وجواب: ماذا حدث بعد الانتخابات الايرانية؟ طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن. يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب.", language: "ar")
8 | expect(ps.segment).to eq(["سؤال وجواب:", "ماذا حدث بعد الانتخابات الايرانية؟", "طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن.", "يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب."])
9 | end
10 |
11 | it "Abbreviations #002" do
12 | ps = PragmaticSegmenter::Segmenter.new(text: "وقال د. ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى. وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير.", language: "ar")
13 | expect(ps.segment).to eq(["وقال د. ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى.", "وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير."])
14 | end
15 |
16 | it "Numbers and Dates #003" do
17 | ps = PragmaticSegmenter::Segmenter.new(text: "ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12/08/2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار. ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية.", language: "ar")
18 | expect(ps.segment).to eq(["ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12/08/2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار.", "ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية."])
19 | end
20 |
21 | it "Time #004" do
22 | ps = PragmaticSegmenter::Segmenter.new(text: "الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز: رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه. العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي.", language: "ar")
23 | expect(ps.segment).to eq(["الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز:", "رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه.", "العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي."])
24 | end
25 |
26 | it "Comma #005" do
27 | ps = PragmaticSegmenter::Segmenter.new(text: "عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب، زرعها عملاء الموساد كما تقول مصادر إسرائيلية، وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية، وبدأت مراسم الحداد عليه", language: "ar")
28 | expect(ps.segment).to eq(["عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب،", "زرعها عملاء الموساد كما تقول مصادر إسرائيلية،", "وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية،", "وبدأت مراسم الحداد عليه"])
29 | end
30 | end
31 |
32 | # Thanks to Mahmoud Holmez for the Arabic test examples.
33 | describe '#segment' do
34 | it 'correctly segments text #001' do
35 | ps = PragmaticSegmenter::Segmenter.new(text: "سؤال وجواب: ماذا حدث بعد الانتخابات الايرانية؟ طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن. يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب.", language: 'ar')
36 | expect(ps.segment).to eq(["سؤال وجواب:", "ماذا حدث بعد الانتخابات الايرانية؟", "طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن.", "يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب."])
37 | end
38 |
39 | it 'correctly segments text #002' do
40 | ps = PragmaticSegmenter::Segmenter.new(text: "وقال د. ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى. وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير.", language: 'ar')
41 | expect(ps.segment).to eq(["وقال د. ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى.", "وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير."])
42 | end
43 |
44 | it 'correctly segments text #003' do
45 | ps = PragmaticSegmenter::Segmenter.new(text: "ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12/08/2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار. ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية.", language: 'ar')
46 | expect(ps.segment).to eq(["ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12/08/2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار.", "ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية."])
47 | end
48 |
49 | it 'correctly segments text #004' do
50 | ps = PragmaticSegmenter::Segmenter.new(text: "الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز: رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه. العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي.", language: 'ar')
51 | expect(ps.segment).to eq(["الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز:", "رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه.", "العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي."])
52 | end
53 |
54 | it 'correctly segments text #005' do
55 | ps = PragmaticSegmenter::Segmenter.new(text: "عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب، زرعها عملاء الموساد كما تقول مصادر إسرائيلية، وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية، وبدأت مراسم الحداد عليه", language: 'ar')
56 | expect(ps.segment).to eq(["عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب،", "زرعها عملاء الموساد كما تقول مصادر إسرائيلية،", "وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية،", "وبدأت مراسم الحداد عليه"])
57 | end
58 | end
59 | end
60 |
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/languages/danish.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Danish
6 | include Languages::Common
7 |
8 | class Cleaner < Cleaner
9 | def clean
10 | super
11 | clean_quotations
12 | end
13 |
14 | private
15 |
16 | def clean_quotations
17 | @text.gsub(/`/, "'")
18 | end
19 |
20 | def abbreviations
21 | [].freeze
22 | end
23 | end
24 |
25 | module Abbreviation
26 | ABBREVIATIONS = Set.new(['adm', 'adr', 'afd', 'afs', 'al', 'alm', 'alm', 'ang', 'ank', 'anm', 'ann', 'ansvh', 'apr', 'arr', 'ass', 'att', 'aud', 'aug', 'aut', 'bd', 'bdt', 'bet', 'bhk', 'bio', 'biol', 'bk', 'bl.a', 'bot', 'br', 'bto', 'ca', 'cal', 'cirk', 'cit', 'co', 'cpr-nr', 'cvr-nr', 'd.d', 'd.e', 'd.m', 'd.s', 'd.s.s', 'd.y', 'd.å', 'd.æ', 'da', 'dav', 'dec', 'def', 'del', 'dep', 'diam', 'din', 'dir', 'disp', 'distr', 'do', 'dobb', 'dr', 'ds', 'dvs', 'e.b', 'e.kr', 'e.l', 'e.o', 'e.v.t', 'eftf', 'eftm', 'egl', 'eks', 'eksam', 'ekskl', 'eksp', 'ekspl', 'el', 'emer', 'endv', 'eng', 'enk', 'etc', 'eur', 'evt', 'exam', 'f', 'f', 'f.eks', 'f.kr', 'f.m', 'f.n', 'f.o', 'f.o.m', 'f.s.v', 'f.t', 'f.v.t', 'f.å', 'fa', 'fakt', 'feb', 'fec', 'ff', 'fg', 'fg', 'fhv', 'fig', 'fl', 'flg', 'fm', 'fm', 'fmd', 'forb', 'foreg', 'foren', 'forf', 'forh', 'fork', 'form', 'forr', 'fors', 'forsk', 'forts', 'fp', 'fr', 'frk', 'fuldm', 'fuldm', 'fung', 'fung', 'fys', 'fær', 'g', 'g.d', 'g.m', 'gd', 'gdr', 'gg', 'gh', 'gl', 'gn', 'gns', 'gr', 'grdl', 'gross', 'h.a', 'h.c', 'hdl', 'henh', 'henv', 'hf', 'hft', 'hhv', 'hort', 'hosp', 'hpl', 'hr', 'hrs', 'hum', 'i', 'i.e', 'ib', 'ibid', 'if', 'ifm', 'ill', 'indb', 'indreg', 'ing', 'inkl', 'insp', 'instr', 'isl', 'istf', 'jan', 'jf', 'jfr', 'jnr', 'jr', 'jul', 'jun', 'jur', 'jvf', 'kal', 'kap', 'kat', 'kbh', 'kem', 'kgl', 'kin', 'kl', 'kld', 'km/t', 'knsp', 'komm', 'kons', 'korr', 'kp', 'kr', 'kr', 'kst', 'kt', 'ktr', 'kv', 'kvt', 'l', 'l.c', 'lab', 'lat', 'lb', 'lb.', 'lb.nr', 'lejl', 'lgd', 'lic', 'lign', 'lin', 'ling.merc', 'litt', 'lok', 'lrs', 'ltr', 'lø', 'm', 'm.a.o', 'm.fl.st', 'm.m', 'm/', 'ma', 'mag', 'maks', 'mar', 'mat', 'matr.nr', 'md', 'mdl', 'mdr', 'mdtl', 'med', 'medd', 'medflg', 'medl', 'merc', 'mezz', 'mf', 'mfl', 'mgl', 'mhp', 'mht', 'mi', 'mia', 'mio', 'ml', 'mods', 'modsv', 'modt', 'mr', 'mrk', 'mrs', 'ms', 'mul', 'mv', 'mvh', 'n', 'n.br', 'n.f', 'nat', 'ned', 'nedenn', 'nedenst', 'nederl', 'nkr', 'nl', 'no', 'nord', 'nov', 'nr', 'nr', 'nto', 'nuv', 'o', 'o.a', 'o.fl.st', 'o.g', 'o.h', 'o.m.a', 'obj', 'obl', 'obs', 'odont', 'oecon', 'off', 'ofl', 'okt', 'omg', 'omr', 'omtr', 'on', 'op.cit', 'opg', 'opl', 'opr', 'org', 'orig', 'osfr', 'osv', 'ovenn', 'ovenst', 'overs', 'ovf', 'oz', 'p', 'p.a', 'p.b.v', 'p.c', 'p.m.v', 'p.p', 'p.s', 'p.t', 'p.v.a', 'p.v.c', 'par', 'partc', 'pass', 'pct', 'pd', 'pens', 'perf', 'pers', 'pg', 'pga', 'pgl', 'ph', 'ph.d', 'pharm', 'phil', 'pinx', 'pk', 'pkt', 'pl', 'pluskv', 'polit', 'polyt', 'port', 'pos', 'pp', 'pr', 'prc', 'priv', 'prod', 'prof', 'pron', 'præd', 'præf', 'præp', 'præs', 'præt', 'psych', 'pt', 'pæd', 'q.e.d', 'rad', 'red', 'ref', 'reg', 'regn', 'rel', 'rep', 'repr', 'rest', 'rk', 'russ', 's', 's.br', 's.d', 's.e', 's.f', 's.m.b.a', 's.u', 's.å', 's/', 'sa', 'sb', 'sc', 'scient', 'sek', 'sek', 'sekr', 'sem', 'sen', 'sep', 'sept', 'sg', 'sign', 'sj', 'skr', 'skt', 'slutn', 'sml', 'smp', 'sms', 'smst', 'soc', 'soc', 'sort', 'sp', 'spec', 'spm', 'spr', 'spsk', 'st', 'stk', 'str', 'stud', 'subj', 'subst', 'suff', 'sup', 'suppl', 'sv', 'såk', 'sædv', 'sø', 't', 't.h', 't.o.m', 't.v', 'tab', 'td', 'tdl', 'tdr', 'techn', 'tekn', 'temp', 'th', 'ti', 'tidl', 'tilf', 'tilh', 'till', 'tilsv', 'tjg', 'tlf', 'tlgr', 'to', 'tr', 'trp', 'tv', 'ty', 'u', 'u.p', 'u.st', 'u.å', 'uafh', 'ubf', 'ubøj', 'udb', 'udbet', 'udd', 'udg', 'uds', 'ugtl', 'ulin', 'ult', 'undt', 'univ', 'v.f', 'var', 'vb', 'vbsb', 'vedk', 'vedl', 'vedr', 'vejl', 'vh', 'vol', 'vs', 'vsa', 'vær', 'zool', 'årg', 'årh', 'årl', 'ø.f', 'øv', 
'øvr']).freeze
27 | NUMBER_ABBREVIATIONS = Set.new(['nr', 's']).freeze
28 | PREPOSITIVE_ABBREVIATIONS = Set.new(['adm', 'skt', 'dr', 'hr', 'fru', 'st']).freeze
29 | end
30 |
31 |       # This handles the case where a dot is used to denote an ordinal (5. Juni)
32 | module Numbers
33 | # Rubular: http://rubular.com/r/hZxoyQwKT1
34 | NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
35 |
36 | # Rubular: http://rubular.com/r/ityNMwdghj
37 | NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
38 |
39 | All = [
40 | Common::Numbers::All,
41 | NumberPeriodSpaceRule,
42 | NegativeNumberPeriodSpaceRule
43 | ]
44 | end
45 |
46 | MONTHS = ['Januar', 'Februar', 'Marts', 'April', 'Maj', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'December'].freeze
47 |
48 | class AbbreviationReplacer < AbbreviationReplacer
49 | SENTENCE_STARTERS = %w(
50 | At De Dem Den Der Det Du En Et For Få Gjorde Han Hun Hvad Hvem Hvilke
51 | Hvor Hvordan Hvorfor Hvorledes Hvornår I Jeg Mange Vi Være
52 | ).freeze
53 |
54 | def replace_abbreviation_as_sentence_boundary(txt)
55 | # As we are being conservative and keeping ambiguous
56 | # sentence boundaries as one sentence instead of
57 | # splitting into two, we can split at words that
58 | # we know for certain never follow these abbreviations.
59 | # Some might say that the set of words that follow an
60 | # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
61 | # the set of words that could start a sentence and
62 | # never follow U.S. However, we are being conservative
63 | # and not splitting by default, so we need to look for places
64 | # where we definitely can split. Obviously SENTENCE_STARTERS
65 | # will never cover all cases, but as the gem is named
66 | # 'Pragmatic Segmenter' we need to be pragmatic
67 | # and try to cover the words that most often start a
68 | # sentence but could never follow one of the abbreviations below.
69 |
70 | @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
71 | escaped = Regexp.escape(word)
72 | txt.gsub!(/U∯S∯\s#{escaped}\s/, "U∯S\.\s#{escaped}\s")
73 | txt.gsub!(/U\.S∯\s#{escaped}\s/, "U\.S\.\s#{escaped}\s")
74 | txt.gsub!(/U∯K∯\s#{escaped}\s/, "U∯K\.\s#{escaped}\s")
75 | txt.gsub!(/U\.K∯\s#{escaped}\s/, "U\.K\.\s#{escaped}\s")
76 | txt.gsub!(/E∯U∯\s#{escaped}\s/, "E∯U\.\s#{escaped}\s")
77 | txt.gsub!(/E\.U∯\s#{escaped}\s/, "E\.U\.\s#{escaped}\s")
78 | txt.gsub!(/U∯S∯A∯\s#{escaped}\s/, "U∯S∯A\.\s#{escaped}\s")
79 | txt.gsub!(/U\.S\.A∯\s#{escaped}\s/, "U\.S\.A\.\s#{escaped}\s")
80 | txt.gsub!(/I∯\s#{escaped}\s/, "I\.\s#{escaped}\s")
81 |             txt.gsub!(/s\.u∯\s#{escaped}\s/, "s\.u\.\s#{escaped}\s")
82 |             txt.gsub!(/S\.U∯\s#{escaped}\s/, "S\.U\.\s#{escaped}\s")
83 | end
84 | txt
85 | end
86 | end
87 | end
88 | end
89 | end
90 |
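91 | # A minimal usage sketch (illustrative only), following the Segmenter API used
92 | # in the spec files and assuming 'da' is the code registered for Danish in
93 | # languages.rb. The ordinal rule above is what keeps the period in "5. Juni"
94 | # from being read as a sentence boundary, while SENTENCE_STARTERS still allows
95 | # a split when a known sentence-starting word follows one of the abbreviations.
96 | #
97 | #   ps = PragmaticSegmenter::Segmenter.new(
98 | #     text: "Mødet er den 5. Juni. Vi ses der.", language: 'da')
99 | #   ps.segment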
--------------------------------------------------------------------------------
/lib/pragmatic_segmenter/list.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | # frozen_string_literal: true
3 |
4 | module PragmaticSegmenter
5 | # This class searches for a list within a string and adds
6 | # newlines before each list item.
7 | class List
8 |     ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx)
9 | LATIN_NUMERALS = ('a'..'z').to_a
10 |
11 | # Rubular: http://rubular.com/r/XcpaJKH0sz
12 | ALPHABETICAL_LIST_WITH_PERIODS =
13 | /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
14 |
15 | # Rubular: http://rubular.com/r/Gu5rQapywf
16 | ALPHABETICAL_LIST_WITH_PARENS =
17 | /(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
18 |
19 | SubstituteListPeriodRule = Rule.new(/♨/, '∯')
20 | ListMarkerRule = Rule.new(/☝/, '')
21 |
22 | # Rubular: http://rubular.com/r/Wv4qLdoPx7
23 | SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d{1,2}♨)/, "\r")
24 |
25 | # Rubular: http://rubular.com/r/AizHXC6HxK
26 | SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}♨)/, "\r")
27 |
28 | # Rubular: http://rubular.com/r/GE5q6yID2j
29 | SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}☝)/, "\r")
30 |
31 | NUMBERED_LIST_REGEX_1 =
32 |       /\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=\s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))/
33 | NUMBERED_LIST_REGEX_2 =
34 | /(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))/
35 | NUMBERED_LIST_PARENS_REGEX = /\d{1,2}(?=\)\s)/
36 |
37 | # Rubular: http://rubular.com/r/NsNFSqrNvJ
38 | EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
39 | /\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
40 |
41 | # Rubular: http://rubular.com/r/wMpnVedEIb
42 | ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
43 | /(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\./i
44 |
45 | # Rubular: http://rubular.com/r/GcnmQt4a3I
46 | ROMAN_NUMERALS_IN_PARENTHESES =
47 | /\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])/
48 |
49 | attr_reader :text
50 | def initialize(text:)
51 | @text = text.dup
52 | end
53 |
54 | def add_line_break
55 | format_alphabetical_lists
56 | format_roman_numeral_lists
57 | format_numbered_list_with_periods
58 | format_numbered_list_with_parens
59 | end
60 |
61 | def replace_parens
62 | text.gsub!(ROMAN_NUMERALS_IN_PARENTHESES, '&✂&\1&⌬&'.freeze)
63 | text
64 | end
65 |
66 | private
67 |
68 | def format_numbered_list_with_parens
69 | replace_parens_in_numbered_list
70 | add_line_breaks_for_numbered_list_with_parens
71 | Rule.apply(@text, ListMarkerRule)
72 | end
73 |
74 | def format_numbered_list_with_periods
75 | replace_periods_in_numbered_list
76 | add_line_breaks_for_numbered_list_with_periods
77 | Rule.apply(@text, SubstituteListPeriodRule)
78 | end
79 |
80 | def format_alphabetical_lists
81 | add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
82 | add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
83 | end
84 |
85 | def format_roman_numeral_lists
86 | add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: true)
87 | add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: true)
88 | end
89 |
90 | def replace_periods_in_numbered_list
91 | scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', strip: true)
92 | end
93 |
94 | def add_line_breaks_for_numbered_list_with_periods
95 | if @text.include?('♨') && @text !~ /♨.+\n.+♨|♨.+\r.+♨/ && @text !~ /for\s\d{1,2}♨\s[a-z]/
96 | Rule.apply(@text, SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
97 | end
98 | end
99 |
100 | def replace_parens_in_numbered_list
101 | scan_lists(
102 | NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
103 | scan_lists(NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
104 | end
105 |
106 | def add_line_breaks_for_numbered_list_with_parens
107 | if @text.include?('☝') && @text !~ /☝.+\n.+☝|☝.+\r.+☝/
108 | Rule.apply(@text, SpaceBetweenListItemsThirdRule)
109 | end
110 | end
111 |
112 | def scan_lists(regex1, regex2, replacement, strip: false)
113 | list_array = @text.scan(regex1).map(&:to_i)
114 | list_array.each_with_index do |a, i|
115 | next unless (a + 1).eql?(list_array[i + 1]) ||
116 | (a - 1).eql?(list_array[i - 1]) ||
117 | (a.eql?(0) && list_array[i - 1].eql?(9)) ||
118 | (a.eql?(9) && list_array[i + 1].eql?(0))
119 | substitute_found_list_items(regex2, a, strip, replacement)
120 | end
121 | end
122 |
123 | def substitute_found_list_items(regex, a, strip, replacement)
124 | @text.gsub!(regex).with_index do |m|
125 | if a.to_s.eql?(strip ? m.strip.chop : m)
126 | "#{Regexp.escape(a.to_s)}" + replacement
127 | else
128 | "#{m}"
129 | end
130 | end
131 | end
132 |
133 | def add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
134 | iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, roman_numeral: roman_numeral)
135 | end
136 |
137 | def add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
138 | iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS,
139 | parens: true,
140 | roman_numeral: roman_numeral)
141 | end
142 |
143 | def replace_alphabet_list(a)
144 | @text.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
145 | a.eql?(m.chomp('.')) ? "\r#{Regexp.escape(a.to_s)}∯" : "#{m}"
146 | end
147 | end
148 |
149 | def replace_alphabet_list_parens(a)
150 | @text.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
151 | if m.include?('(')
152 |           a.eql?(Unicode::downcase(m.dup).gsub!(/\(/, '')) ? "\r&✂&#{Regexp.escape(m.gsub!(/\(/, ''))}" : "#{m}"
153 | else
154 | a.eql?(Unicode::downcase(m.dup)) ? "\r#{Regexp.escape(m)}" : "#{m}"
155 | end
156 | end
157 | end
158 |
159 | def replace_correct_alphabet_list(a, parens)
160 | if parens
161 | replace_alphabet_list_parens(a)
162 | else
163 | replace_alphabet_list(a)
164 | end
165 | end
166 |
167 | def last_array_item_replacement(a, i, alphabet, list_array, parens)
168 | return if alphabet & list_array == [] ||
169 | !alphabet.include?(list_array[i - 1]) ||
170 | !alphabet.include?(a)
171 | return if (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
172 | replace_correct_alphabet_list(a, parens)
173 | end
174 |
175 | def other_items_replacement(a, i, alphabet, list_array, parens)
176 | return if alphabet & list_array == [] ||
177 | !alphabet.include?(list_array[i - 1]) ||
178 | !alphabet.include?(a) ||
179 | !alphabet.include?(list_array[i + 1])
180 | return if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 &&
181 | (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
182 | replace_correct_alphabet_list(a, parens)
183 | end
184 |
185 | def iterate_alphabet_array(regex, parens: false, roman_numeral: false)
186 | list_array = @text.scan(regex).map { |s| Unicode::downcase(s) }
187 | if roman_numeral
188 | alphabet = ROMAN_NUMERALS
189 | else
190 | alphabet = LATIN_NUMERALS
191 | end
192 | list_array.delete_if { |item| !alphabet.any? { |a| a.include?(item) } }
193 | list_array.each_with_index do |a, i|
194 | if i.eql?(list_array.length - 1)
195 | last_array_item_replacement(a, i, alphabet, list_array, parens)
196 | else
197 | other_items_replacement(a, i, alphabet, list_array, parens)
198 | end
199 | end
200 | end
201 | end
202 | end
203 |
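204 | # A minimal usage sketch (illustrative only). add_line_break rewrites the text
205 | # held by the instance so that each detected list item starts on its own line:
206 | # a "\r" is inserted before consecutive numbered items and their periods are
207 | # masked with ∯ so they are not later mistaken for sentence boundaries.
208 | #
209 | #   list = PragmaticSegmenter::List.new(text: "Buy: 1. apples 2. oranges 3. pears")
210 | #   list.add_line_break
211 | #   list.text   # the rewritten text, one list item per line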
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/italian_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Italian, "(it)" do
4 |
5 | context "Golden Rules" do
6 | it "Abbreviations #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "Salve Sig.ra Mengoni! Come sta oggi?", language: "it")
8 | expect(ps.segment).to eq(["Salve Sig.ra Mengoni!", "Come sta oggi?"])
9 | end
10 |
11 | it "Quotations #002" do
12 | ps = PragmaticSegmenter::Segmenter.new(text: "Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.».", language: "it")
13 | expect(ps.segment).to eq(["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.»."])
14 | end
15 |
16 | it "Numbers #003" do
17 | ps = PragmaticSegmenter::Segmenter.new(text: "La casa costa 170.500.000,00€!", language: "it")
18 | expect(ps.segment).to eq(["La casa costa 170.500.000,00€!"])
19 | end
20 | end
21 |
22 | # Thanks to Davide Fornelli for the Italian test examples.
23 | describe '#segment' do
24 |
25 | it 'correctly segments text #001' do
26 | ps = PragmaticSegmenter::Segmenter.new(text: "Salve Sig.ra Mengoni! Come sta oggi?", language: 'it')
27 | expect(ps.segment).to eq(["Salve Sig.ra Mengoni!", "Come sta oggi?"])
28 | end
29 |
30 | it 'correctly segments text #002' do
31 | ps = PragmaticSegmenter::Segmenter.new(text: "Buongiorno! Sono l'Ing. Mengozzi. È presente l'Avv. Cassioni?", language: 'it')
32 | expect(ps.segment).to eq(["Buongiorno!", "Sono l'Ing. Mengozzi.", "È presente l'Avv. Cassioni?"])
33 | end
34 |
35 | it 'correctly segments text #003' do
36 | ps = PragmaticSegmenter::Segmenter.new(text: "Mi fissi un appuntamento per mar. 23 Nov.. Grazie.", language: 'it')
37 | expect(ps.segment).to eq(["Mi fissi un appuntamento per mar. 23 Nov..", "Grazie."])
38 | end
39 |
40 | it 'correctly segments text #004' do
41 | ps = PragmaticSegmenter::Segmenter.new(text: "Ecco il mio tel.:01234567. Mi saluti la Sig.na Manelli. Arrivederci.", language: 'it')
42 | expect(ps.segment).to eq(["Ecco il mio tel.:01234567.", "Mi saluti la Sig.na Manelli.", "Arrivederci."])
43 | end
44 |
45 | it 'correctly segments text #005' do
46 | ps = PragmaticSegmenter::Segmenter.new(text: "La centrale meteor. si è guastata. Gli idraul. son dovuti andare a sistemarla.", language: 'it')
47 | expect(ps.segment).to eq(["La centrale meteor. si è guastata.", "Gli idraul. son dovuti andare a sistemarla."])
48 | end
49 |
50 | it 'correctly segments text #006' do
51 | ps = PragmaticSegmenter::Segmenter.new(text: "Hanno creato un algoritmo allo st. d. arte. Si ringrazia lo psicol. Serenti.", language: 'it')
52 | expect(ps.segment).to eq(["Hanno creato un algoritmo allo st. d. arte.", "Si ringrazia lo psicol. Serenti."])
53 | end
54 |
55 | it 'correctly segments text #007' do
56 | ps = PragmaticSegmenter::Segmenter.new(text: "Chiamate il V.Cte. delle F.P., adesso!", language: 'it')
57 | expect(ps.segment).to eq(["Chiamate il V.Cte. delle F.P., adesso!"])
58 | end
59 |
60 | it 'correctly segments text #008' do
61 | ps = PragmaticSegmenter::Segmenter.new(text: "Giancarlo ha sostenuto l'esame di econ. az..", language: 'it')
62 | expect(ps.segment).to eq(["Giancarlo ha sostenuto l'esame di econ. az.."])
63 | end
64 |
65 | it 'correctly segments text #009' do
66 | ps = PragmaticSegmenter::Segmenter.new(text: "Stava viaggiando a 90 km/h verso la provincia di TR quando il Dott. Mesini ha sentito un rumore e si fermò!", language: 'it')
67 | expect(ps.segment).to eq(["Stava viaggiando a 90 km/h verso la provincia di TR quando il Dott. Mesini ha sentito un rumore e si fermò!"])
68 | end
69 |
70 | it 'correctly segments text #010' do
71 | ps = PragmaticSegmenter::Segmenter.new(text: "Egregio Dir. Amm., le faccio sapere che l'ascensore non funziona.", language: 'it')
72 | expect(ps.segment).to eq(["Egregio Dir. Amm., le faccio sapere che l'ascensore non funziona."])
73 | end
74 |
75 | it 'correctly segments text #011' do
76 | ps = PragmaticSegmenter::Segmenter.new(text: "Stava mangiando e/o dormendo.", language: 'it')
77 | expect(ps.segment).to eq(["Stava mangiando e/o dormendo."])
78 | end
79 |
80 | it 'correctly segments text #012' do
81 | ps = PragmaticSegmenter::Segmenter.new(text: "Ricordatevi che dom 25 Set. sarà il compleanno di Maria; dovremo darle un regalo.", language: 'it')
82 | expect(ps.segment).to eq(["Ricordatevi che dom 25 Set. sarà il compleanno di Maria; dovremo darle un regalo."])
83 | end
84 |
85 | it 'correctly segments text #013' do
86 | ps = PragmaticSegmenter::Segmenter.new(text: "La politica è quella della austerità; quindi verranno fatti tagli agli sprechi.", language: 'it')
87 | expect(ps.segment).to eq(["La politica è quella della austerità; quindi verranno fatti tagli agli sprechi."])
88 | end
89 |
90 | it 'correctly segments text #014' do
91 | ps = PragmaticSegmenter::Segmenter.new(text: "Nel tribunale, l'Avv. Fabrizi ha urlato \"Io, l'illustrissimo Fabrizi, vi si oppone!\".", language: 'it')
92 | expect(ps.segment).to eq(["Nel tribunale, l'Avv. Fabrizi ha urlato \"Io, l'illustrissimo Fabrizi, vi si oppone!\"."])
93 | end
94 |
95 | it 'correctly segments text #015' do
96 | ps = PragmaticSegmenter::Segmenter.new(text: "Le parti fisiche di un computer (ad es. RAM, CPU, tastiera, mouse, etc.) sono definiti HW.", language: 'it')
97 | expect(ps.segment).to eq(["Le parti fisiche di un computer (ad es. RAM, CPU, tastiera, mouse, etc.) sono definiti HW."])
98 | end
99 |
100 | it 'correctly segments text #016' do
101 | ps = PragmaticSegmenter::Segmenter.new(text: "La parola 'casa' è sinonimo di abitazione.", language: 'it')
102 | expect(ps.segment).to eq(["La parola 'casa' è sinonimo di abitazione."])
103 | end
104 |
105 | it 'correctly segments text #017' do
106 | ps = PragmaticSegmenter::Segmenter.new(text: "La \"Mulino Bianco\" fa alimentari pre-confezionati.", language: 'it')
107 | expect(ps.segment).to eq(["La \"Mulino Bianco\" fa alimentari pre-confezionati."])
108 | end
109 |
110 | it 'correctly segments text #018' do
111 | ps = PragmaticSegmenter::Segmenter.new(text: "\"Ei fu. Siccome immobile / dato il mortal sospiro / stette la spoglia immemore / orba di tanto spiro / [...]\" (Manzoni).", language: 'it')
112 | expect(ps.segment).to eq(["\"Ei fu. Siccome immobile / dato il mortal sospiro / stette la spoglia immemore / orba di tanto spiro / [...]\" (Manzoni)."])
113 | end
114 |
115 | it 'correctly segments text #019' do
116 | ps = PragmaticSegmenter::Segmenter.new(text: "Una lettera si può iniziare in questo modo «Il/la sottoscritto/a ... nato/a a ...».", language: 'it')
117 | expect(ps.segment).to eq(["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a ... nato/a a ...»."])
118 | end
119 |
120 | it 'correctly segments text #020' do
121 | ps = PragmaticSegmenter::Segmenter.new(text: "Per casa, in uno degli esercizi per i bambini c'era \"3 + (14/7) = 5\"", language: 'it')
122 | expect(ps.segment).to eq(["Per casa, in uno degli esercizi per i bambini c'era \"3 + (14/7) = 5\""])
123 | end
124 |
125 | it 'correctly segments text #021' do
126 | ps = PragmaticSegmenter::Segmenter.new(text: "Ai bambini è stato chiesto di fare \"4:2*2\"", language: 'it')
127 | expect(ps.segment).to eq(["Ai bambini è stato chiesto di fare \"4:2*2\""])
128 | end
129 |
130 | it 'correctly segments text #022' do
131 | ps = PragmaticSegmenter::Segmenter.new(text: "La maestra esclamò: \"Bambini, quanto fa '2/3 + 4/3?'\".", language: 'it')
132 | expect(ps.segment).to eq(["La maestra esclamò: \"Bambini, quanto fa \'2/3 + 4/3?\'\"."])
133 | end
134 |
135 | it 'correctly segments text #023' do
136 | ps = PragmaticSegmenter::Segmenter.new(text: "Il motore misurava 120°C.", language: 'it')
137 | expect(ps.segment).to eq(["Il motore misurava 120°C."])
138 | end
139 |
140 | it 'correctly segments text #024' do
141 | ps = PragmaticSegmenter::Segmenter.new(text: "Il volume era di 3m³.", language: 'it')
142 | expect(ps.segment).to eq(["Il volume era di 3m³."])
143 | end
144 |
145 | it 'correctly segments text #025' do
146 | ps = PragmaticSegmenter::Segmenter.new(text: "La stanza misurava 20m².", language: 'it')
147 | expect(ps.segment).to eq(["La stanza misurava 20m²."])
148 | end
149 |
150 | it 'correctly segments text #026' do
151 | ps = PragmaticSegmenter::Segmenter.new(text: "1°C corrisponde a 33.8°F.", language: 'it')
152 | expect(ps.segment).to eq(["1°C corrisponde a 33.8°F."])
153 | end
154 |
155 | it 'correctly segments text #027' do
156 | ps = PragmaticSegmenter::Segmenter.new(text: "Oggi è il 27-10-14.", language: 'it')
157 | expect(ps.segment).to eq(["Oggi è il 27-10-14."])
158 | end
159 |
160 | it 'correctly segments text #028' do
161 | ps = PragmaticSegmenter::Segmenter.new(text: "La casa costa 170.500.000,00€!", language: 'it')
162 | expect(ps.segment).to eq(["La casa costa 170.500.000,00€!"])
163 | end
164 |
165 | it 'correctly segments text #029' do
166 | ps = PragmaticSegmenter::Segmenter.new(text: "Il corridore 103 è arrivato 4°.", language: 'it')
167 | expect(ps.segment).to eq(["Il corridore 103 è arrivato 4°."])
168 | end
169 |
170 | it 'correctly segments text #030' do
171 | ps = PragmaticSegmenter::Segmenter.new(text: "Oggi è il 27/10/2014.", language: 'it')
172 | expect(ps.segment).to eq(["Oggi è il 27/10/2014."])
173 | end
174 |
175 | it 'correctly segments text #031' do
176 | ps = PragmaticSegmenter::Segmenter.new(text: "Ecco l'elenco: 1.gelato, 2.carne, 3.riso.", language: 'it')
177 | expect(ps.segment).to eq(["Ecco l'elenco: 1.gelato, 2.carne, 3.riso."])
178 | end
179 |
180 | it 'correctly segments text #032' do
181 | ps = PragmaticSegmenter::Segmenter.new(text: "Devi comprare : 1)pesce 2)sale.", language: 'it')
182 | expect(ps.segment).to eq(["Devi comprare : 1)pesce 2)sale."])
183 | end
184 |
185 | it 'correctly segments text #033' do
186 | ps = PragmaticSegmenter::Segmenter.new(text: "La macchina viaggiava a 100 km/h.", language: 'it')
187 | expect(ps.segment).to eq(["La macchina viaggiava a 100 km/h."])
188 | end
189 | end
190 | end
191 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/russian_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Russian, "(ru)" do
4 |
5 | context "Golden Rules" do
6 | it "Abbreviations #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5 куб.м.", language: "ru")
8 | expect(ps.segment).to eq(["Объем составляет 5 куб.м."])
9 | end
10 |
11 | it "Quotations #002" do
12 | ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?».", language: "ru")
13 | expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»."])
14 | end
15 |
16 | it "Numbers #003" do
17 | ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27.10.14", language: "ru")
18 | expect(ps.segment).to eq(["Сегодня 27.10.14"])
19 | end
20 | end
21 |
22 | # Thanks to Anastasiia Tsvitailo for the Russian test examples.
23 | describe '#segment' do
24 | it 'correctly segments text #001' do
25 | ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?».", language: 'ru')
26 | expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»."])
27 | end
28 |
29 | it 'correctly segments text #002' do
30 | ps = PragmaticSegmenter::Segmenter.new(text: "«Я приду поздно», — сказал Андрей.", language: 'ru')
31 | expect(ps.segment).to eq(["«Я приду поздно», — сказал Андрей."])
32 | end
33 |
34 | it 'correctly segments text #003' do
35 | ps = PragmaticSegmenter::Segmenter.new(text: "«К чему ты готовишься? – спросила мама. – Завтра ведь выходной».", language: 'ru')
36 | expect(ps.segment).to eq(["«К чему ты готовишься? – спросила мама. – Завтра ведь выходной»."])
37 | end
38 |
39 | it 'correctly segments text #004' do
40 | ps = PragmaticSegmenter::Segmenter.new(text: "По словам Пушкина, «Привычка свыше дана, замена счастью она».", language: 'ru')
41 | expect(ps.segment).to eq(["По словам Пушкина, «Привычка свыше дана, замена счастью она»."])
42 | end
43 |
44 | it 'correctly segments text #005' do
45 | ps = PragmaticSegmenter::Segmenter.new(text: "Он сказал: «Я очень устал», и сразу же замолчал.", language: 'ru')
46 | expect(ps.segment).to eq(["Он сказал: «Я очень устал», и сразу же замолчал."])
47 | end
48 |
49 | it 'correctly segments text #006' do
50 | ps = PragmaticSegmenter::Segmenter.new(text: "Мне стало как-то ужасно грустно в это мгновение; однако что-то похожее на смех зашевелилось в душе моей.", language: 'ru')
51 | expect(ps.segment).to eq(["Мне стало как-то ужасно грустно в это мгновение; однако что-то похожее на смех зашевелилось в душе моей."])
52 | end
53 |
54 | it 'correctly segments text #007' do
55 | ps = PragmaticSegmenter::Segmenter.new(text: "Шухов как был в ватных брюках, не снятых на ночь (повыше левого колена их тоже был пришит затасканный, погрязневший лоскут, и на нем выведен черной, уже поблекшей краской номер Щ-854), надел телогрейку…", language: 'ru')
56 | expect(ps.segment).to eq(["Шухов как был в ватных брюках, не снятых на ночь (повыше левого колена их тоже был пришит затасканный, погрязневший лоскут, и на нем выведен черной, уже поблекшей краской номер Щ-854), надел телогрейку…"])
57 | end
58 |
59 | it 'correctly segments text #008' do
60 | ps = PragmaticSegmenter::Segmenter.new(text: "Слово «дом» является синонимом жилища", language: 'ru')
61 | expect(ps.segment).to eq(["Слово «дом» является синонимом жилища"])
62 | end
63 |
64 | it 'correctly segments text #009' do
65 | ps = PragmaticSegmenter::Segmenter.new(text: "В Санкт-Петербург на гастроли приехал театр «Современник»", language: 'ru')
66 | expect(ps.segment).to eq(["В Санкт-Петербург на гастроли приехал театр «Современник»"])
67 | end
68 |
69 | it 'correctly segments text #010' do
70 | ps = PragmaticSegmenter::Segmenter.new(text: "Машина едет со скоростью 100 км/ч.", language: 'ru')
71 | expect(ps.segment).to eq(["Машина едет со скоростью 100 км/ч."])
72 | end
73 |
74 | it 'correctly segments text #011' do
75 | ps = PragmaticSegmenter::Segmenter.new(text: "Я поем и/или лягу спать.", language: 'ru')
76 | expect(ps.segment).to eq(["Я поем и/или лягу спать."])
77 | end
78 |
79 | it 'correctly segments text #012' do
80 | ps = PragmaticSegmenter::Segmenter.new(text: "Он не мог справиться с примером \"3 + (14:7) = 5\"", language: 'ru')
81 | expect(ps.segment).to eq(["Он не мог справиться с примером \"3 + (14:7) = 5\""])
82 | end
83 |
84 | it 'correctly segments text #013' do
85 | ps = PragmaticSegmenter::Segmenter.new(text: "Вот список: 1.мороженое, 2.мясо, 3.рис.", language: 'ru')
86 | expect(ps.segment).to eq(["Вот список: 1.мороженое, 2.мясо, 3.рис."])
87 | end
88 |
89 | it 'correctly segments text #014' do
90 | ps = PragmaticSegmenter::Segmenter.new(text: "Квартира 234 находится на 4-ом этаже.", language: 'ru')
91 | expect(ps.segment).to eq(["Квартира 234 находится на 4-ом этаже."])
92 | end
93 |
94 | it 'correctly segments text #015' do
95 | ps = PragmaticSegmenter::Segmenter.new(text: "В это время года температура может подниматься до 40°C.", language: 'ru')
96 | expect(ps.segment).to eq(["В это время года температура может подниматься до 40°C."])
97 | end
98 |
99 | it 'correctly segments text #016' do
100 | ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5м³.", language: 'ru')
101 | expect(ps.segment).to eq(["Объем составляет 5м³."])
102 | end
103 |
104 | it 'correctly segments text #017' do
105 | ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5 куб.м.", language: 'ru')
106 | expect(ps.segment).to eq(["Объем составляет 5 куб.м."])
107 | end
108 |
109 | it 'correctly segments text #018' do
110 | ps = PragmaticSegmenter::Segmenter.new(text: "Площадь комнаты 14м².", language: 'ru')
111 | expect(ps.segment).to eq(["Площадь комнаты 14м²."])
112 | end
113 |
114 | it 'correctly segments text #019' do
115 | ps = PragmaticSegmenter::Segmenter.new(text: "Площадь комнаты 14 кв.м.", language: 'ru')
116 | expect(ps.segment).to eq(["Площадь комнаты 14 кв.м."])
117 | end
118 |
119 | it 'correctly segments text #020' do
120 | ps = PragmaticSegmenter::Segmenter.new(text: "1°C соответствует 33.8°F.", language: 'ru')
121 | expect(ps.segment).to eq(["1°C соответствует 33.8°F."])
122 | end
123 |
124 | it 'correctly segments text #021' do
125 | ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27.10.14", language: 'ru')
126 | expect(ps.segment).to eq(["Сегодня 27.10.14"])
127 | end
128 |
129 | it 'correctly segments text #022' do
130 | ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27 октября 2014 года.", language: 'ru')
131 | expect(ps.segment).to eq(["Сегодня 27 октября 2014 года."])
132 | end
133 |
134 | it 'correctly segments text #023' do
135 | ps = PragmaticSegmenter::Segmenter.new(text: "Эта машина стоит 150 000 дол.!", language: 'ru')
136 | expect(ps.segment).to eq(["Эта машина стоит 150 000 дол.!"])
137 | end
138 |
139 | it 'correctly segments text #024' do
140 | ps = PragmaticSegmenter::Segmenter.new(text: "Эта машина стоит $150 000!", language: 'ru')
141 | expect(ps.segment).to eq(["Эта машина стоит $150 000!"])
142 | end
143 |
144 | it 'correctly segments text #025' do
145 | ps = PragmaticSegmenter::Segmenter.new(text: "Вот номер моего телефона: +39045969798. Передавайте привет г-ну Шапочкину. До свидания.", language: 'ru')
146 | expect(ps.segment).to eq(["Вот номер моего телефона: +39045969798.", "Передавайте привет г-ну Шапочкину.", "До свидания."])
147 | end
148 |
149 | it 'correctly segments text #026' do
150 | ps = PragmaticSegmenter::Segmenter.new(text: "Постойте, разве можно указывать цены в у.е.!", language: 'ru')
151 | expect(ps.segment).to eq(["Постойте, разве можно указывать цены в у.е.!"])
152 | end
153 |
154 | it 'correctly segments text #027' do
155 | ps = PragmaticSegmenter::Segmenter.new(text: "Едем на скорости 90 км/ч в сторону пгт. Брагиновка, о котором мы так много слышали по ТВ!", language: 'ru')
156 | expect(ps.segment).to eq(["Едем на скорости 90 км/ч в сторону пгт. Брагиновка, о котором мы так много слышали по ТВ!"])
157 | end
158 |
159 | it 'correctly segments text #028' do
160 | ps = PragmaticSegmenter::Segmenter.new(text: "Д-р ветеринарных наук А. И. Семенов и пр. выступали на этом семинаре.", language: 'ru')
161 | expect(ps.segment).to eq(["Д-р ветеринарных наук А. И. Семенов и пр. выступали на этом семинаре."])
162 | end
163 |
164 | it 'correctly segments text #029' do
165 | ps = PragmaticSegmenter::Segmenter.new(text: "Уважаемый проф. Семенов! Просьба до 20.10 сдать отчет на кафедру.", language: 'ru')
166 | expect(ps.segment).to eq(["Уважаемый проф. Семенов!", "Просьба до 20.10 сдать отчет на кафедру."])
167 | end
168 |
169 | it 'correctly segments text #030' do
170 | ps = PragmaticSegmenter::Segmenter.new(text: "Первоначальная стоимость этого комплекта 30 долл., но сейчас действует скидка. Предъявите дисконтную карту, пожалуйста!", language: 'ru')
171 | expect(ps.segment).to eq(["Первоначальная стоимость этого комплекта 30 долл., но сейчас действует скидка.", "Предъявите дисконтную карту, пожалуйста!"])
172 | end
173 |
174 | it 'correctly segments text #031' do
175 | ps = PragmaticSegmenter::Segmenter.new(text: "Виктор съел пол-лимона и ушел по-английски из дома на ул. 1 Мая.", language: 'ru')
176 | expect(ps.segment).to eq(["Виктор съел пол-лимона и ушел по-английски из дома на ул. 1 Мая."])
177 | end
178 |
179 | it 'correctly segments text #032' do
180 | ps = PragmaticSegmenter::Segmenter.new(text: "Напоминаю Вам, что 25.10 день рождения у Маши К., нужно будет купить ей подарок.", language: 'ru')
181 | expect(ps.segment).to eq(["Напоминаю Вам, что 25.10 день рождения у Маши К., нужно будет купить ей подарок."])
182 | end
183 |
184 | it 'correctly segments text #033' do
185 | ps = PragmaticSegmenter::Segmenter.new(text: "В 2010-2012 гг. Виктор посещал г. Волгоград неоднократно.", language: 'ru')
186 | expect(ps.segment).to eq(["В 2010-2012 гг. Виктор посещал г. Волгоград неоднократно."])
187 | end
188 |
189 | it 'correctly segments text #034' do
190 | ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?»", language: 'ru')
191 | expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»"])
192 | end
193 |
194 | it 'correctly segments text #035' do
195 | ps = PragmaticSegmenter::Segmenter.new(text: "Кв. 234 находится на 4 этаже.", language: 'ru')
196 | expect(ps.segment).to eq(["Кв. 234 находится на 4 этаже."])
197 | end
198 |
199 | it 'correctly segments text #036' do
200 | ps = PragmaticSegmenter::Segmenter.new(text: "В это время года температура может подниматься до 40°C.", language: 'ru')
201 | expect(ps.segment).to eq(["В это время года температура может подниматься до 40°C."])
202 | end
203 |
204 | it 'correctly segments text #037' do
205 | ps = PragmaticSegmenter::Segmenter.new(text: "Нужно купить 1)рыбу 2)соль.", language: 'ru')
206 | expect(ps.segment).to eq(["Нужно купить 1)рыбу 2)соль."])
207 | end
208 |
209 | it 'correctly segments text #038' do
210 | ps = PragmaticSegmenter::Segmenter.new(text: "Машина едет со скоростью 100 км/ч.", language: 'ru')
211 | expect(ps.segment).to eq(["Машина едет со скоростью 100 км/ч."])
212 | end
213 |
214 | it 'correctly segments text #039' do
215 | ps = PragmaticSegmenter::Segmenter.new(text: "Л.Н. Толстой написал \"Войну и мир\". Кроме Волконских, Л. Н. Толстой состоял в близком родстве с некоторыми другими аристократическими родами. Дом, где родился Л.Н.Толстой, 1898 г. В 1854 году дом продан по распоряжению писателя на вывоз в село Долгое.", language: 'ru')
216 | expect(ps.segment).to eq(["Л.Н. Толстой написал \"Войну и мир\".", "Кроме Волконских, Л. Н. Толстой состоял в близком родстве с некоторыми другими аристократическими родами.", "Дом, где родился Л.Н.Толстой, 1898 г. В 1854 году дом продан по распоряжению писателя на вывоз в село Долгое."])
217 | end
218 | end
219 | end
220 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/armenian_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Armenian, '(hy)' do
4 |
5 | context "Golden Rules" do
6 | it "Sentence ending punctuation #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "Ի՞նչ ես մտածում: Ոչինչ:", language: "hy")
8 | expect(ps.segment).to eq(["Ի՞նչ ես մտածում:", "Ոչինչ:"])
9 | end
10 |
11 | it "Ellipsis #002" do
12 | ps = PragmaticSegmenter::Segmenter.new(text: "Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", language: "hy")
13 | expect(ps.segment).to eq(["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"])
14 | end
15 |
16 | it "Period is not a sentence boundary #003" do
17 | ps = PragmaticSegmenter::Segmenter.new(text: "Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", language: "hy")
18 | expect(ps.segment).to eq(["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"])
19 | end
20 | end
21 |
22 | describe '#segment' do
23 | # Thanks to Armine Abelyan for the Armenian test examples.
24 |
25 | it 'correctly segments text #001' do
26 | ps = PragmaticSegmenter::Segmenter.new(text: "Սա այն փուլն է, երբ տեղի է ունենում Համակարգի մշակումը: Համաձայն Փուլ 2-ի, Մատակարարը մշակում և/կամ հարմարեցնում է համապատասխան ծրագիրը, տեղադրում ծրագրի բաղկացուցիչները, կատարում առանձին բլոկի և համակարգի թեստավորում և ներառում տարբեր մոդուլներ եզակի աշխատանքային համակարգում, որը կազմում է այս Փուլի արդյունքը:", language: 'hy')
27 | expect(ps.segment).to eq(["Սա այն փուլն է, երբ տեղի է ունենում Համակարգի մշակումը:", "Համաձայն Փուլ 2-ի, Մատակարարը մշակում և/կամ հարմարեցնում է համապատասխան ծրագիրը, տեղադրում ծրագրի բաղկացուցիչները, կատարում առանձին բլոկի և համակարգի թեստավորում և ներառում տարբեր մոդուլներ եզակի աշխատանքային համակարգում, որը կազմում է այս Փուլի արդյունքը:"])
28 | end
29 |
30 | it 'correctly segments text #002' do
31 | ps = PragmaticSegmenter::Segmenter.new(text: "Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար: 2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ: Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի: Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից: Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում: Մատակարարը պետք է տրամադրի հետևյալը`", language: 'hy')
32 | expect(ps.segment).to eq(["Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար:", "2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ:", "Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի:", "Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից:", "Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում:", "Մատակարարը պետք է տրամադրի հետևյալը`"])
33 | end
34 |
35 | it 'correctly segments text #003' do
36 | ps = PragmaticSegmenter::Segmenter.new(text: "Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար: 2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ: Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի: Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից: Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում: Մատակարարը պետք է տրամադրի հետևյալը`", language: 'hy')
37 | expect(ps.segment).to eq(["Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար:", "2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ:", "Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի:", "Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից:", "Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում:", "Մատակարարը պետք է տրամադրի հետևյալը`"])
38 | end
39 |
40 | it 'correctly segments text #004' do
41 | # "Hello world. My name is Armine." ==> ["Hello world.", "My name is Armine."]
42 | ps = PragmaticSegmenter::Segmenter.new(text: "Բարև Ձեզ: Իմ անունն էԱրմինե:", language: 'hy')
43 | expect(ps.segment).to eq(["Բարև Ձեզ:", "Իմ անունն էԱրմինե:"])
44 | end
45 |
46 | it 'correctly segments text #005' do
47 | # "Today is Monday. I am going to work." ==> ["Today is Monday.", "I am going to work."]
48 | ps = PragmaticSegmenter::Segmenter.new(text: "Այսօր երկուշաբթի է: Ես գնում եմ աշխատանքի:", language: 'hy')
49 | expect(ps.segment).to eq(["Այսօր երկուշաբթի է:", "Ես գնում եմ աշխատանքի:"])
50 | end
51 |
52 | it 'correctly segments text #006' do
53 | # "Tomorrow is September 1st. We are going to school." ==> ["Tomorrow is September 1st.", "We are going to school."]
54 | ps = PragmaticSegmenter::Segmenter.new(text: "Վաղը սեպտեմբերի 1-ն է: Մենք գնում ենք դպրոց:", language: 'hy')
55 | expect(ps.segment).to eq(["Վաղը սեպտեմբերի 1-ն է:", "Մենք գնում ենք դպրոց:"])
56 | end
57 |
58 | it 'correctly segments text #007' do
59 | # "Yes, I understood. I really love you." ==> ["Yes, I understood.", "I really love you."]
60 | ps = PragmaticSegmenter::Segmenter.new(text: "Այո, ես հասկացա: Ես իսկապես քեզ սիրում եմ:", language: 'hy')
61 | expect(ps.segment).to eq(["Այո, ես հասկացա:", "Ես իսկապես քեզ սիրում եմ:"])
62 | end
63 |
64 | it 'correctly segments text #008' do
65 | # "Close the windows. It is raining in the evening." ==> ["Close the windows.", "It is raining in the evening."]
66 | ps = PragmaticSegmenter::Segmenter.new(text: "Փակիր պատուհանները: Երեկոյան անձրևում է:", language: 'hy')
67 | expect(ps.segment).to eq(["Փակիր պատուհանները:", "Երեկոյան անձրևում է:"])
68 | end
69 |
70 | it 'correctly segments text #009' do
71 | # "It is dark. I should go home." ==> ["It is dark.", "I should go home."]
72 | ps = PragmaticSegmenter::Segmenter.new(text: "Մութ է: Ես պետք է տուն վերադառնամ:", language: 'hy')
73 | expect(ps.segment).to eq(["Մութ է:", "Ես պետք է տուն վերադառնամ:"])
74 | end
75 |
76 | it 'correctly segments text #010' do
77 | # "You know, I am starting to believe. Everything is changing." ==> ["You know, I am starting to believe.", "Everything is changing."]
78 | ps = PragmaticSegmenter::Segmenter.new(text: "Գիտես, սկսել եմ հավատալ: Ամեն ինչ փոխվում է:", language: 'hy')
79 | expect(ps.segment).to eq(["Գիտես, սկսել եմ հավատալ:", "Ամեն ինչ փոխվում է:"])
80 | end
81 |
82 | it 'correctly segments text #011' do
83 | # "It is a new Christmas tree. We should decorate it." ==> ["It is a new Christmas tree.", "We should decorate it."]
84 | ps = PragmaticSegmenter::Segmenter.new(text: "Տոնածառը նոր է: Պետք է այն զարդարել:", language: 'hy')
85 | expect(ps.segment).to eq(["Տոնածառը նոր է:", "Պետք է այն զարդարել:"])
86 | end
87 |
88 | it 'correctly segments text #012' do
89 |       # "I am in a hurry. I cannot wait for you." ==> ["I am in a hurry.", "I cannot wait for you."]
90 | ps = PragmaticSegmenter::Segmenter.new(text: "Ես շտապում եմ: Ես քեզ չեմ կարող սպասել:", language: 'hy')
91 | expect(ps.segment).to eq(["Ես շտապում եմ:", "Ես քեզ չեմ կարող սպասել:"])
92 | end
93 |
94 | it 'correctly segments text #013' do
95 | # "Wait, we love each other. I want us to live together." ==> ["Wait, we love each other.", "I want us to live together."]
96 | ps = PragmaticSegmenter::Segmenter.new(text: "Սպասիր, մենք իրար սիրում ենք: Ցանկանում եմ միասին ապրենք:", language: 'hy')
97 | expect(ps.segment).to eq(["Սպասիր, մենք իրար սիրում ենք:", "Ցանկանում եմ միասին ապրենք:"])
98 | end
99 |
100 | it 'correctly segments text #014' do
101 | # "No, I do not think so. It is not true." ==> ["No, I do not think so.", "It is not true."]
102 | ps = PragmaticSegmenter::Segmenter.new(text: "Ոչ, այդպես չեմ կարծում: Դա ճիշտ չէ:", language: 'hy')
103 | expect(ps.segment).to eq(["Ոչ, այդպես չեմ կարծում:", "Դա ճիշտ չէ:"])
104 | end
105 |
106 | it 'correctly segments text #015' do
107 |       # "On April 24 it started to rain... That's what I thought." ==> ["On April 24 it started to rain... That's what I thought."]
108 | ps = PragmaticSegmenter::Segmenter.new(text: "Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", language: 'hy')
109 | expect(ps.segment).to eq(["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"])
110 | end
111 |
112 | it 'correctly segments text #016' do
113 | # "It was 1960...it was winter...it was night. It was cold...emptiness." ==> ["It was 1960...it was winter...it was night.", "It was cold...emptiness."]
114 | ps = PragmaticSegmenter::Segmenter.new(text: "1960 թվական…ձմեռ…գիշեր: Սառն էր…դատարկություն:", language: 'hy')
115 | expect(ps.segment).to eq(["1960 թվական…ձմեռ…գիշեր:", "Սառն էր…դատարկություն:"])
116 | end
117 |
118 | it 'correctly segments text #017' do
119 |       # "Why can't a computer do what a man can do? It simply doesn't have a human brain." ==> ["Why can't a computer do what a man can do?", "It simply doesn't have a human brain."]
120 | ps = PragmaticSegmenter::Segmenter.new(text: "Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը: Պարզապես չունի մարդկային ուղեղ:", language: 'hy')
121 | expect(ps.segment).to eq(["Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը:", "Պարզապես չունի մարդկային ուղեղ:"])
122 | end
123 |
124 | it 'correctly segments text #018' do
125 |       # "Name 3 things that are important to you - I answer: love, knowledge, sincerity." ==> ["Name 3 things that are important to you - I answer: love, knowledge, sincerity."]
126 | ps = PragmaticSegmenter::Segmenter.new(text: "Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:", language: 'hy')
127 | expect(ps.segment).to eq(["Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:"])
128 | end
129 |
130 | it 'correctly segments text #019' do
131 |       # "So, we are coming to the end. The logic is...simplicity and work." ==> ["So, we are coming to the end.", "The logic is...simplicity and work."]
132 | ps = PragmaticSegmenter::Segmenter.new(text: "Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", language: 'hy')
133 | expect(ps.segment).to eq(["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"])
134 | end
135 |
136 | it 'correctly segments text #020' do
137 | # "What are you thinking? Nothing!" ==> ["What are you thinking?", "Nothing!"]
138 | ps = PragmaticSegmenter::Segmenter.new(text: "Ի՞նչ ես մտածում: Ոչինչ:", language: 'hy')
139 | expect(ps.segment).to eq(["Ի՞նչ ես մտածում:", "Ոչինչ:"])
140 | end
141 |
142 | it 'correctly segments text #021' do
143 | # "Can we work together ?. May be what you are thinking, is possible." ==> ["Can we work together?.", "May be what you are thinking is possible."]
144 | ps = PragmaticSegmenter::Segmenter.new(text: "Կարող ե՞նք միասին աշխատել: Գուցե այն ինչ մտածում ես, իրականանալի է:", language: 'hy')
145 | expect(ps.segment).to eq(["Կարող ե՞նք միասին աշխատել:", "Գուցե այն ինչ մտածում ես, իրականանալի է:"])
146 | end
147 |
148 | it 'correctly segments text #022' do
149 | # "Now what we have started, comes to the end. However the questions are numerous... ." ==> ["Now what we have started, comes to the end.", "However the questions are numerous... ."]
150 | ps = PragmaticSegmenter::Segmenter.new(text: "Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում: Հարցերը սակայն շատ են...:", language: 'hy')
151 | expect(ps.segment).to eq(["Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում:", "Հարցերը սակայն շատ են...:"])
152 | end
153 |
154 | it 'correctly segments text #023' do
155 | # "Honey... I am waiting. Shall I go... or?" ==> ["Honey... I am waiting.", "Shall I go... or?"]
156 | ps = PragmaticSegmenter::Segmenter.new(text: "Սիրելիս...սպասում եմ: Գնամ թ՟ե …:", language: 'hy')
157 | expect(ps.segment).to eq(["Սիրելիս...սպասում եմ:", "Գնամ թ՟ե …:"])
158 | end
159 | end
160 | end
161 |
--------------------------------------------------------------------------------
/spec/pragmatic_segmenter/languages/spanish_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe PragmaticSegmenter::Languages::Spanish, '(es)' do
4 |
5 | context "Golden Rules" do
6 | it "Question mark to end sentence #001" do
7 | ps = PragmaticSegmenter::Segmenter.new(text: "¿Cómo está hoy? Espero que muy bien.", language: "es")
8 | expect(ps.segment).to eq(["¿Cómo está hoy?", "Espero que muy bien."])
9 | end
10 |
11 | it "Exclamation point to end sentence #002" do
12 | ps = PragmaticSegmenter::Segmenter.new(text: "¡Hola señorita! Espero que muy bien.", language: "es")
13 | expect(ps.segment).to eq(["¡Hola señorita!", "Espero que muy bien."])
14 | end
15 |
16 | it "Abbreviations #003" do
17 | ps = PragmaticSegmenter::Segmenter.new(text: "Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser.", language: "es")
18 | expect(ps.segment).to eq(["Hola Srta. Ledesma.", "Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."])
19 | end
20 |
21 | it "Numbers #004" do
22 | ps = PragmaticSegmenter::Segmenter.new(text: "¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año.", language: "es")
23 | expect(ps.segment).to eq(["¡La casa cuesta $170.500.000,00!", "¡Muy costosa!", "Se prevé una disminución del 12.5% para el próximo año."])
24 | end
25 |
26 | it "Quotations #005" do
27 | ps = PragmaticSegmenter::Segmenter.new(text: "«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles.", language: "es")
28 | expect(ps.segment).to eq(["«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles."])
29 | end
30 | end
31 |
32 | # Thanks to Alejandro Naser Pastoriza for the Spanish test examples.
33 | describe '#segment' do
34 | it 'correctly segments text #001' do
35 | ps = PragmaticSegmenter::Segmenter.new(text: '«Ninguna mente extraordinaria está exenta de un toque de demencia», dijo Aristóteles. Pablo, ¿adónde vas? ¡¿Qué viste?!', language: 'es')
36 | expect(ps.segment).to eq(['«Ninguna mente extraordinaria está exenta de un toque de demencia», dijo Aristóteles.', 'Pablo, ¿adónde vas?', '¡¿Qué viste?!'])
37 | end
38 |
39 | it 'correctly segments text #002' do
40 | ps = PragmaticSegmenter::Segmenter.new(text: 'Admón. es administración o me equivoco.', language: 'es')
41 | expect(ps.segment).to eq(['Admón. es administración o me equivoco.'])
42 | end
43 |
44 | it 'correctly segments text #003' do
45 | ps = PragmaticSegmenter::Segmenter.new(text: "• 1. Busca atención prenatal desde el principio \n• 2. Aliméntate bien \n• 3. Presta mucha atención a la higiene de los alimentos \n• 4. Toma suplementos de ácido fólico y come pescado \n• 5. Haz ejercicio regularmente \n• 6. Comienza a hacer ejercicios de Kegel \n• 7. Restringe el consumo de alcohol \n• 8. Disminuye el consumo de cafeína \n• 9. Deja de fumar \n• 10. Descansa", language: 'es')
46 | expect(ps.segment).to eq(["• 1. Busca atención prenatal desde el principio", "• 2. Aliméntate bien", "• 3. Presta mucha atención a la higiene de los alimentos", "• 4. Toma suplementos de ácido fólico y come pescado", "• 5. Haz ejercicio regularmente", "• 6. Comienza a hacer ejercicios de Kegel", "• 7. Restringe el consumo de alcohol", "• 8. Disminuye el consumo de cafeína", "• 9. Deja de fumar", "• 10. Descansa"])
47 | end
48 |
49 | it 'correctly segments text #004' do
50 | ps = PragmaticSegmenter::Segmenter.new(text: "• 1. Busca atención prenatal desde el principio \n• 2. Aliméntate bien \n• 3. Presta mucha atención a la higiene de los alimentos \n• 4. Toma suplementos de ácido fólico y come pescado \n• 5. Haz ejercicio regularmente \n• 6. Comienza a hacer ejercicios de Kegel \n• 7. Restringe el consumo de alcohol \n• 8. Disminuye el consumo de cafeína \n• 9. Deja de fumar \n• 10. Descansa \n• 11. Hola", language: 'es')
51 | expect(ps.segment).to eq(["• 1. Busca atención prenatal desde el principio", "• 2. Aliméntate bien", "• 3. Presta mucha atención a la higiene de los alimentos", "• 4. Toma suplementos de ácido fólico y come pescado", "• 5. Haz ejercicio regularmente", "• 6. Comienza a hacer ejercicios de Kegel", "• 7. Restringe el consumo de alcohol", "• 8. Disminuye el consumo de cafeína", "• 9. Deja de fumar", "• 10. Descansa", "• 11. Hola"])
52 | end
53 |
54 | it 'correctly segments text #005' do
55 | ps = PragmaticSegmenter::Segmenter.new(text: "¡Hola Srta. Ledesma! ¿Cómo está hoy? Espero que muy bien.", language: 'es')
56 | expect(ps.segment).to eq(["¡Hola Srta. Ledesma!", "¿Cómo está hoy?", "Espero que muy bien."])
57 | end
58 |
59 | it 'correctly segments text #006' do
60 | ps = PragmaticSegmenter::Segmenter.new(text: "Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser.", language: 'es')
61 | expect(ps.segment).to eq(["Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."])
62 | end
63 |
64 | it 'correctly segments text #007' do
65 | ps = PragmaticSegmenter::Segmenter.new(text: "He apuntado una cita para la siguiente fecha: Mar. 23 de Nov. de 2014. Gracias.", language: 'es')
66 | expect(ps.segment).to eq(["He apuntado una cita para la siguiente fecha: Mar. 23 de Nov. de 2014.", "Gracias."])
67 | end
68 |
69 | it 'correctly segments text #008' do
70 | ps = PragmaticSegmenter::Segmenter.new(text: "Núm. de tel: 351.123.465.4. Envíe mis saludos a la Sra. Rescia.", language: 'es')
71 | expect(ps.segment).to eq(["Núm. de tel: 351.123.465.4.", "Envíe mis saludos a la Sra. Rescia."])
72 | end
73 |
74 | it 'correctly segments text #009' do
75 | ps = PragmaticSegmenter::Segmenter.new(text: "Cero en la escala Celsius o de grados centígrados (0 °C) se define como el equivalente a 273.15 K, con una diferencia de temperatura de 1 °C equivalente a una diferencia de 1 Kelvin. Esto significa que 100 °C, definido como el punto de ebullición del agua, se define como el equivalente a 373.15 K.", language: 'es')
76 | expect(ps.segment).to eq(["Cero en la escala Celsius o de grados centígrados (0 °C) se define como el equivalente a 273.15 K, con una diferencia de temperatura de 1 °C equivalente a una diferencia de 1 Kelvin.", "Esto significa que 100 °C, definido como el punto de ebullición del agua, se define como el equivalente a 373.15 K."])
77 | end
78 |
79 | it 'correctly segments text #010' do
80 | ps = PragmaticSegmenter::Segmenter.new(text: "Durante la primera misión del Discovery (30 Ago. 1984 15:08.10) tuvo lugar el lanzamiento de dos satélites de comunicación, el nombre de esta misión fue STS-41-D.", language: 'es')
81 | expect(ps.segment).to eq(["Durante la primera misión del Discovery (30 Ago. 1984 15:08.10) tuvo lugar el lanzamiento de dos satélites de comunicación, el nombre de esta misión fue STS-41-D."])
82 | end
83 |
84 | it 'correctly segments text #011' do
85 | ps = PragmaticSegmenter::Segmenter.new(text: "Frase del gran José Hernández: \"Aquí me pongo a cantar / al compás de la vigüela, / que el hombre que lo desvela / una pena estrordinaria, / como la ave solitaria / con el cantar se consuela. / [...] \".", language: 'es')
86 | expect(ps.segment).to eq(["Frase del gran José Hernández: \"Aquí me pongo a cantar / al compás de la vigüela, / que el hombre que lo desvela / una pena estrordinaria, / como la ave solitaria / con el cantar se consuela. / [...] \"."])
87 | end
88 |
89 | it 'correctly segments text #012' do
90 | ps = PragmaticSegmenter::Segmenter.new(text: "Citando a Criss Jami «Prefiero ser un artista a ser un líder, irónicamente, un líder tiene que seguir las reglas.», lo cual parece muy acertado.", language: 'es')
91 | expect(ps.segment).to eq(["Citando a Criss Jami «Prefiero ser un artista a ser un líder, irónicamente, un líder tiene que seguir las reglas.», lo cual parece muy acertado."])
92 | end
93 |
94 | it 'correctly segments text #013' do
95 | ps = PragmaticSegmenter::Segmenter.new(text: "Cuando llegué, le estaba dando ejercicios a los niños, uno de los cuales era \"3 + (14/7).x = 5\". ¿Qué te parece?", language: 'es')
96 | expect(ps.segment).to eq(["Cuando llegué, le estaba dando ejercicios a los niños, uno de los cuales era \"3 + (14/7).x = 5\".", "¿Qué te parece?"])
97 | end
98 |
99 | it 'correctly segments text #014' do
100 | ps = PragmaticSegmenter::Segmenter.new(text: "Se le pidió a los niños que leyeran los párrf. 5 y 6 del art. 4 de la constitución de los EE. UU..", language: 'es')
101 | expect(ps.segment).to eq(["Se le pidió a los niños que leyeran los párrf. 5 y 6 del art. 4 de la constitución de los EE. UU.."])
102 | end
103 |
104 | it 'correctly segments text #015' do
105 | ps = PragmaticSegmenter::Segmenter.new(text: "Una de las preguntas realizadas en la evaluación del día Lun. 15 de Mar. fue la siguiente: \"Alumnos, ¿cuál es el resultado de la operación 1.1 + 4/5?\". Disponían de 1 min. para responder esa pregunta.", language: 'es')
106 | expect(ps.segment).to eq(["Una de las preguntas realizadas en la evaluación del día Lun. 15 de Mar. fue la siguiente: \"Alumnos, ¿cuál es el resultado de la operación 1.1 + 4/5?\".", "Disponían de 1 min. para responder esa pregunta."])
107 | end
108 |
109 | it 'correctly segments text #016' do
110 | ps = PragmaticSegmenter::Segmenter.new(text: "La temperatura del motor alcanzó los 120.5°C. Afortunadamente, pudo llegar al final de carrera.", language: 'es')
111 | expect(ps.segment).to eq(["La temperatura del motor alcanzó los 120.5°C.", "Afortunadamente, pudo llegar al final de carrera."])
112 | end
113 |
114 | it 'correctly segments text #017' do
115 | ps = PragmaticSegmenter::Segmenter.new(text: "El volumen del cuerpo es 3m³. ¿Cuál es la superficie de cada cara del prisma?", language: 'es')
116 | expect(ps.segment).to eq(["El volumen del cuerpo es 3m³.", "¿Cuál es la superficie de cada cara del prisma?"])
117 | end
118 |
119 | it 'correctly segments text #018' do
120 | ps = PragmaticSegmenter::Segmenter.new(text: "La habitación tiene 20.55m². El living tiene 50.0m².", language: 'es')
121 | expect(ps.segment).to eq(["La habitación tiene 20.55m².", "El living tiene 50.0m²."])
122 | end
123 |
124 | it 'correctly segments text #019' do
125 | ps = PragmaticSegmenter::Segmenter.new(text: "1°C corresponde a 33.8°F. ¿A cuánto corresponde 35°C?", language: 'es')
126 | expect(ps.segment).to eq(["1°C corresponde a 33.8°F.", "¿A cuánto corresponde 35°C?"])
127 | end
128 |
129 | it 'correctly segments text #020' do
130 | ps = PragmaticSegmenter::Segmenter.new(text: "Hamilton ganó el último gran premio de Fórmula 1, luego de 1:39:02.619 Hs. de carrera, segundo resultó Massa, a una diferencia de 2.5 segundos. De esta manera se consagró ¡Campeón mundial!", language: 'es')
131 | expect(ps.segment).to eq(["Hamilton ganó el último gran premio de Fórmula 1, luego de 1:39:02.619 Hs. de carrera, segundo resultó Massa, a una diferencia de 2.5 segundos.", "De esta manera se consagró ¡Campeón mundial!"])
132 | end
133 |
134 | it 'correctly segments text #021' do
135 | ps = PragmaticSegmenter::Segmenter.new(text: "¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año.", language: 'es')
136 | expect(ps.segment).to eq(["¡La casa cuesta $170.500.000,00!", "¡Muy costosa!", "Se prevé una disminución del 12.5% para el próximo año."])
137 | end
138 |
139 | it 'correctly segments text #022' do
140 | ps = PragmaticSegmenter::Segmenter.new(text: "El corredor No. 103 arrivó 4°.", language: 'es')
141 | expect(ps.segment).to eq(["El corredor No. 103 arrivó 4°."])
142 | end
143 |
144 | it 'correctly segments text #023' do
145 | ps = PragmaticSegmenter::Segmenter.new(text: "Hoy es 27/04/2014, y es mi cumpleaños. ¿Cuándo es el tuyo?", language: 'es')
146 | expect(ps.segment).to eq(["Hoy es 27/04/2014, y es mi cumpleaños.", "¿Cuándo es el tuyo?"])
147 | end
148 |
149 | it 'correctly segments text #024' do
150 | ps = PragmaticSegmenter::Segmenter.new(text: "Aquí está la lista de compras para el almuerzo: 1.Helado, 2.Carne, 3.Arroz. ¿Cuánto costará? Quizás $12.5.", language: 'es')
151 | expect(ps.segment).to eq(["Aquí está la lista de compras para el almuerzo: 1.Helado, 2.Carne, 3.Arroz.", "¿Cuánto costará?", "Quizás $12.5."])
152 | end
153 |
154 | it 'correctly segments text #025' do
155 | ps = PragmaticSegmenter::Segmenter.new(text: "1 + 1 es 2. 2 + 2 es 4. El auto es de color rojo.", language: 'es')
156 | expect(ps.segment).to eq(["1 + 1 es 2.", "2 + 2 es 4.", "El auto es de color rojo."])
157 | end
158 |
159 | it 'correctly segments text #026' do
160 | ps = PragmaticSegmenter::Segmenter.new(text: "La máquina viajaba a 100 km/h. ¿En cuánto tiempo recorrió los 153 Km.?", language: 'es')
161 | expect(ps.segment).to eq(["La máquina viajaba a 100 km/h.", "¿En cuánto tiempo recorrió los 153 Km.?"])
162 | end
163 |
164 | it 'correctly segments text #027' do
165 | ps = PragmaticSegmenter::Segmenter.new(text: "\n \nCentro de Relaciones Interinstitucionales -CERI \n\nCra. 7 No. 40-53 Piso 10 Tel. (57-1) 3239300 Ext. 1010 Fax: (57-1) 3402973 Bogotá, D.C. - Colombia \n\nhttp://www.udistrital.edu.co - http://ceri.udistrital.edu.co - relinter@udistrital.edu.co \n\n \n\nCERI 0908 \n \nBogotá, D.C. 6 de noviembre de 2014. \n \nSeñores: \nEMBAJADA DE UNITED KINGDOM \n \n", language: 'es')
166 | expect(ps.segment).to eq(["Centro de Relaciones Interinstitucionales -CERI", "Cra. 7 No. 40-53 Piso 10 Tel. (57-1) 3239300 Ext. 1010 Fax: (57-1) 3402973 Bogotá, D.C. - Colombia", "http://www.udistrital.edu.co - http://ceri.udistrital.edu.co - relinter@udistrital.edu.co", "CERI 0908", "Bogotá, D.C. 6 de noviembre de 2014.", "Señores:", "EMBAJADA DE UNITED KINGDOM"])
167 | end
168 |
169 | it 'correctly segments text #028' do
170 | ps = PragmaticSegmenter::Segmenter.new(text: "N°. 1026.253.553", language: 'es')
171 | expect(ps.segment).to eq(["N°. 1026.253.553"])
172 | end
173 |
174 | it 'correctly segments text #029' do
175 | ps = PragmaticSegmenter::Segmenter.new(text: "\nA continuación me permito presentar a la Ingeniera LAURA MILENA LEÓN \nSANDOVAL, identificada con el documento N°. 1026.253.553 de Bogotá, \negresada del Programa Ingeniería Industrial en el año 2012, quien se desatacó por \nsu excelencia académica, actualmente cursa el programa de Maestría en \nIngeniería Industrial y se encuentra en un intercambio cultural en Bangalore – \nIndia.", language: 'es', doc_type: 'pdf')
176 | expect(ps.segment).to eq(["A continuación me permito presentar a la Ingeniera LAURA MILENA LEÓN SANDOVAL, identificada con el documento N°. 1026.253.553 de Bogotá, egresada del Programa Ingeniería Industrial en el año 2012, quien se desatacó por su excelencia académica, actualmente cursa el programa de Maestría en Ingeniería Industrial y se encuentra en un intercambio cultural en Bangalore – India."])
177 | end
178 |
179 | it 'correctly segments text #030' do
180 | ps = PragmaticSegmenter::Segmenter.new(text: "\n__________________________________________________________\nEl Board para Servicios Educativos de Putnam/Northern Westchester según el título IX, Sección 504 del “Rehabilitation Act” del 1973, del Título VII y del Acta “American with Disabilities” no discrimina para la admisión a programas educativos por sexo, creencia, nacionalidad, origen, edad o discapacidad.", language: 'es')
181 | expect(ps.segment).to eq(["El Board para Servicios Educativos de Putnam/Northern Westchester según el título IX, Sección 504 del “Rehabilitation Act” del 1973, del Título VII y del Acta “American with Disabilities” no discrimina para la admisión a programas educativos por sexo, creencia, nacionalidad, origen, edad o discapacidad."])
182 | end
183 |
184 | it 'correctly segments text #031' do
185 | ps = PragmaticSegmenter::Segmenter.new(text: "Explora oportunidades de carrera en el área de Salud en el Hospital de Northern en Mt. Kisco.", language: 'es')
186 | expect(ps.segment).to eq(["Explora oportunidades de carrera en el área de Salud en el Hospital de Northern en Mt. Kisco."])
187 | end
188 | end
189 | end
190 |
--------------------------------------------------------------------------------
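Note on the doc_type option: Spanish spec #029 above is the only example in this section that passes doc_type: 'pdf'. The sketch below is a hypothetical illustration of that option (the sample text is invented, not taken from the spec) and simply reuses the Segmenter API exercised throughout these files.

require 'pragmatic_segmenter'

# Hypothetical PDF-style input: one sentence broken across lines by a hard wrap.
text = "La ingeniera presentó su \ntrabajo el año pasado. Fue un éxito."

# With doc_type: 'pdf', the mid-sentence line break is presumably joined into a
# single segment, as in the LAURA MILENA LEÓN example of spec #029.
PragmaticSegmenter::Segmenter.new(text: text, language: 'es', doc_type: 'pdf').segment
# Presumably => ["La ingeniera presentó su trabajo el año pasado.", "Fue un éxito."]
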
/lib/pragmatic_segmenter/languages/dutch.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Dutch
6 | include Languages::Common
7 |
8 | module Abbreviation
9 | ABBREVIATIONS = Set.new(['a.2d', 'a.a', 'a.a.j.b', 'a.f.t', 'a.g.j.b', 'a.h.v', 'a.h.w', 'a.hosp', 'a.i', 'a.j.b', 'a.j.t', 'a.m', 'a.m.r', 'a.p.m', 'a.p.r', 'a.p.t', 'a.s', 'a.t.d.f', 'a.u.b', 'a.v.a', 'a.w', 'aanbev', 'aanbev.comm', 'aant', 'aanv.st', 'aanw', 'vnw', 'aanw.vnw', 'abd', 'abm', 'abs', 'acc.& fisc', 'acc.act', 'acc.bedr.m', 'acc.bedr.t', "acc.thema's m.", 'acc.thema’s m', 'achterv', 'act.dr', 'act.dr.fam', 'act.fisc', 'act.soc', 'adm.akk', 'adm.besl', 'adm.lex', 'adm.onderr', 'adm.ov', 'adv', 'adv', 'gen', 'adv.bl', 'afd', 'afl', 'aggl.verord', 'agr', 'al', 'alg', 'alg.richts', 'amén', 'ann.dr', 'ann.dr.lg', 'ann.dr.sc.pol', 'ann.ét.eur', 'ann.fac.dr.lg', 'ann.jur.créd', 'ann.jur.créd.règl.coll', 'ann.not', 'ann.parl', 'ann.prat.comm', 'app', 'arb', 'aud', 'arbbl', 'arbh', 'arbit.besl', 'arbrb', 'arr', 'arr.cass', 'arr.r.v.st', 'arr.verbr', 'arrondrb', 'art', 'artw', 'aud', 'b', 'b', 'en w', 'b.&w', 'b.a', 'b.a.s', 'b.b.o', 'b.best.dep', 'b.br.ex', 'b.coll.fr.gem.comm', 'b.coll.vl.gem.comm', 'b.d.cult.r', 'b.d.gem.ex', 'b.d.gem.reg', 'b.dep', 'b.e.b', 'b.f.r', 'b.fr.gem.ex', 'b.fr.gem.reg', 'b.i.h', 'b.inl.j.d', 'b.inl.s.reg', 'b.j', 'b.l', 'b.lid br.ex', 'b.lid d.gem.ex', 'b.lid fr.gem.ex', 'b.lid vl.ex', 'b.lid w.gew.ex', 'b.o.z', 'b.prov.r', 'b.r.h', 'b.s', 'b.sr', 'b.stb', 'b.t.i.r', 'b.t.s.z', 'b.t.w.rev', 'b.v', 'b.ver.coll.gem.gem.comm', 'b.verg.r.b', 'b.versl', 'b.vl.ex', 'b.voorl.reg', 'b.w', 'b.w.gew.ex', 'b.z.d.g', 'b.z.v', 'bab', 'bank fin', 'bank fin.r', 'bedr.org', 'begins', 'beheersov', 'bekendm.comm', 'bel', 'bel.besch', 'bel.w.p', 'beleidsov', 'belg', 'grondw', 'benelux jur', 'ber', 'ber.w', 'besch', 'besl', 'beslagr', 'besluitwet nr', 'bestuurswet', 'bet', 'betr', 'betr', 'vnw', 'bevest', 'bew', 'bijbl', 'ind', 'eig', 'bijbl.n.bijdr', 'bijl', 'bijv', 'bijw', 'bijz.decr', 'bin.b', 'bkh', 'bl', 'blz', 'bm', 'bn', 'bnlx merkw', 'bnlx tek', 'bnlx uitl', 'rh', 'bnw', 'bouwr', 'br drs', 'br.parl', 'bs', 'bt drs', 'btw rev', 'bull', 'bull.adm.pénit', 'bull.ass', 'bull.b.m.m', 'bull.bel', 'bull.best.strafinr', 'bull.bmm', 'bull.c.b.n', 'bull.c.n.c', 'bull.cbn', 'bull.centr.arb', 'bull.cnc', 'bull.contr', 'bull.doc.min.fin', 'bull.f.e.b', 'bull.feb', 'bull.fisc.fin.r', 'bull.i.u.m', 'bull.inf.ass.secr.soc', 'bull.inf.i.e.c', 'bull.inf.i.n.a.m.i', 'bull.inf.i.r.e', 'bull.inf.iec', 'bull.inf.inami', 'bull.inf.ire', 'bull.inst.arb', 'bull.ium', 'bull.jur.imm', 'bull.lég.b', 'bull.off', 'bull.trim.b.dr.comp', 'bull.us', 'bull.v.b.o', 'bull.vbo', 'bv i.o', 'bv', 'bw int.reg', 'bw', 'bxh', 'byz', 'c', 'c.& f', 'c.& f.p', 'c.a', 'c.a.-a', 'c.a.b.g', 'c.c', 'c.c.i', 'c.c.s', 'c.conc.jur', 'c.d.e', 'c.d.p.k', 'c.e', 'c.ex', 'c.f', 'c.h.a', 'c.i.f', 'c.i.f.i.c', 'c.j', 'c.l', 'c.n', 'c.o.d', 'c.p', 'c.pr.civ', 'c.q', 'c.r', 'c.r.a', 'c.s', 'c.s.a', 'c.s.q.n', 'c.v', 'c.v.a', 'c.v.o', 'ca', 'cadeaust', 'cah.const', 'cah.dr.europ', 'cah.dr.immo', 'cah.dr.jud', 'cal', '2d', 'cal', '3e', 'cal', 'rprt', 'cap', 'carg', 'cass', 'cass', 'verw', 'cert', 'cf', 'ch', 'chron', 'chron.d.s', 'chron.dr.not', 'cie', 'cie', 'verz.schr', 'cir', 'circ', 'circ.z', 'cit', 'cit.loc', 'civ', 'cl.et.b', 'cmt', 'co', 'cognoss.v', 'coll', 'v', 'b', 'colp.w', 'com', 'com', 'cas', 'com.v.min', 'comm', 'comm', 'v', 'comm.bijz.ov', 'comm.erf', 'comm.fin', 'comm.ger', 'comm.handel', 'comm.pers', 'comm.pub', 'comm.straf', 'comm.v', 'comm.v.en v', 'comm.venn', 'comm.verz', 'comm.voor', 'comp', 'compt.w', 'computerr', 'con.m', 'concl', 'concr', 'conf', 'confl.w', 'confl.w.huwbetr', 'cons', 'conv', 'coöp', 
'ver', 'corr', 'corr.bl', 'cour de cass', 'cour.fisc', 'cour.immo', 'cridon', 'crim', 'cur', 'cur', 'crt', 'curs', 'd', 'd.-g', 'd.a', 'd.a.v', 'd.b.f', 'd.c', 'd.c.c.r', 'd.d', 'd.d.p', 'd.e.t', 'd.gem.r', 'd.h', 'd.h.z', 'd.i', 'd.i.t', 'd.j', 'd.l.r', 'd.m', 'd.m.v', 'd.o.v', 'd.parl', 'd.w.z', 'dact', 'dat', 'dbesch', 'dbesl', 'de advoc', 'de belg.acc', 'de burg.st', 'de gem', 'de gerechtsd', 'de venn', 'de verz', 'decr', 'decr.d', 'decr.fr', 'decr.vl', 'decr.w', 'def', 'dep.opv', 'dep.rtl', 'derg', 'desp', 'det.mag', 'deurw.regl', 'dez', 'dgl', 'dhr', 'disp', 'diss', 'div', 'div.act', 'div.bel', 'dl', 'dln', 'dnotz', 'doc', 'hist', 'doc.jur.b', 'doc.min.fin', 'doc.parl', 'doctr', 'dpl', 'dpl.besl', 'dr', 'dr.banc.fin', 'dr.circ', 'dr.inform', 'dr.mr', 'dr.pén.entr', 'dr.q.m', 'drs', 'dtp', 'dwz', 'dyn', 'e cont', 'e', 'e.a', 'e.b', 'tek.mod', 'e.c', 'e.c.a', 'e.d', 'e.e', 'e.e.a', 'e.e.g', 'e.g', 'e.g.a', 'e.h.a', 'e.i', 'e.j', 'e.m.a', 'e.n.a.c', 'e.o', 'e.p.c', 'e.r.c', 'e.r.f', 'e.r.h', 'e.r.o', 'e.r.p', 'e.r.v', 'e.s.r.a', 'e.s.t', 'e.v', 'e.v.a', 'e.w', 'e&o.e', 'ec.pol.r', 'echos log', 'econ', 'ed', 'ed(s)', 'eeg verd.v', 'eex san s', 'eff', 'eg rtl', 'eig', 'eig.mag', 'eil', 'elektr', 'enmb', 'entr.et dr', 'enz', 'err', 'et al', 'et seq', 'etc', 'etq', 'eur', 'parl', 'eur.t.s', 'eur.verd.overdracht strafv', 'ev rechtsh', 'ev uitl', 'ev', 'evt', 'ex', 'ex.crim', 'exec', 'f', 'f.a.o', 'f.a.q', 'f.a.s', 'f.i.b', 'f.j.f', 'f.o.b', 'f.o.r', 'f.o.s', 'f.o.t', 'f.r', 'f.supp', 'f.suppl', 'fa', 'facs', 'fare act', 'fasc', 'fg', 'fid.ber', 'fig', 'fin.verh.w', 'fisc', 'fisc', 'tijdschr', 'fisc.act', 'fisc.koer', 'fl', 'form', 'foro', 'it', 'fr', 'fr.cult.r', 'fr.gem.r', 'fr.parl', 'fra', 'ft', 'g', 'g.a', 'g.a.v', 'g.a.w.v', 'g.g.d', 'g.m.t', 'g.o', 'g.omt.e', 'g.p', 'g.s', 'g.v', 'g.w.w', 'geb', 'gebr', 'gebrs', 'gec', 'gec.decr', 'ged', 'ged.st', 'gedipl', 'gedr.st', 'geh', 'gem', 'gem', 'en gew', 'gem', 'en prov', 'gem.gem.comm', 'gem.st', 'gem.stem', 'gem.w', 'gem.wet, gem.wet', 'gemeensch.optr', 'gemeensch.standp', 'gemeensch.strat', 'gemeent', 'gemeent.b', 'gemeent.regl', 'gemeent.verord', 'geol', 'geopp', 'gepubl', 'ger.deurw', 'ger.w', 'gerekw', 'gereq', 'gesch', 'get', 'getr', 'gev.m', 'gev.maatr', 'gew', 'ghert', 'gir.eff.verk', 'gk', 'gr', 'gramm', 'grat.w', 'gron,opm.en leermed', 'grootb.w', 'grs', 'grur ausl', 'grur int', 'grvm', 'grw', 'gst', 'gw', 'h.a', 'h.a.v.o', 'h.b.o', 'h.e.a.o', 'h.e.g.a', 'h.e.geb', 'h.e.gestr', 'h.l', 'h.m', 'h.o', 'h.r', 'h.t.l', 'h.t.m', 'h.w.geb', 'hand', 'handelsn.w', 'handelspr', 'handelsr.w', 'handelsreg.w', 'handv', 'harv.l.rev', 'hc', 'herald', 'hert', 'herz', 'hfdst', 'hfst', 'hgrw', 'hhr', 'hist', 'hooggel', 'hoogl', 'hosp', 'hpw', 'hr', 'hr', 'ms', 'hr.ms', 'hregw', 'hrg', 'hst', 'huis.just', 'huisv.w', 'huurbl', 'hv.vn', 'hw', 'hyp.w', 'i.b.s', 'i.c', 'i.c.m.h', 'i.e', 'i.f', 'i.f.p', 'i.g.v', 'i.h', 'i.h.a', 'i.h.b', 'i.l.pr', 'i.o', 'i.p.o', 'i.p.r', 'i.p.v', 'i.pl.v', 'i.r.d.i', 'i.s.m', 'i.t.t', 'i.v', 'i.v.m', 'i.v.s', 'i.w.tr', 'i.z', 'ib', 'ibid', 'icip-ing.cons', 'iem', 'ind prop', 'indic.soc', 'indiv', 'inf', 'inf.i.d.a.c', 'inf.idac', 'inf.r.i.z.i.v', 'inf.riziv', 'inf.soc.secr', 'ing', 'ing', 'cons', 'ing.cons', 'inst', 'int', 'int', 'rechtsh', 'strafz', "int'l & comp.l.q.", 'interm', 'intern.fisc.act', 'intern.vervoerr', 'inv', 'inv', 'f', 'inv.w', 'inv.wet', 'invord.w', 'inz', 'ir', 'irspr', 'iwtr', 'j', 'j.-cl', 'j.c.b', 'j.c.e', 'j.c.fl', 'j.c.j', 'j.c.p', 'j.d.e', 'j.d.f', 'j.d.s.c', 'j.dr.jeun', 'j.j.d', 'j.j.p', 
'j.j.pol', 'j.l', 'j.l.m.b', 'j.l.o', 'j.ordre pharm', 'j.p.a', 'j.r.s', 'j.t', 'j.t.d.e', 'j.t.dr.eur', 'j.t.o', 'j.t.t', 'jaarl', 'jb.hand', 'jb.kred', 'jb.kred.c.s', 'jb.l.r.b', 'jb.lrb', 'jb.markt', 'jb.mens', 'jb.t.r.d', 'jb.trd', 'jeugdrb', 'jeugdwerkg.w', 'jg', 'jis', 'jl', 'journ.jur', 'journ.prat.dr.fisc.fin', 'journ.proc', 'jrg', 'jur', 'jur.comm.fl', 'jur.dr.soc.b.l.n', 'jur.f.p.e', 'jur.fpe', 'jur.niv', 'jur.trav.brux', 'jura falc', 'jurambt', 'jv.cass', 'jv.h.r.j', 'jv.hrj', 'jw', 'k', 'k', 'en m', 'k.b', 'k.g', 'k.k', 'k.m.b.o', 'k.o.o', 'k.v.k', 'k.v.v.v', 'kadasterw', 'kaderb', 'kador', 'kbo-nr', 'kg', 'kh', 'kiesw', 'kind.bes.v', 'kkr', 'koopv', 'kr', 'krankz.w', 'ksbel', 'kt', 'ktg', 'ktr', 'kvdm', 'kw.r', 'kymr', 'kzr', 'kzw', 'l', 'l.b', 'l.b.o', 'l.bas', 'l.c', 'l.gew', 'l.j', 'l.k', 'l.l', 'l.o', 'l.r.b', 'l.u.v.i', 'l.v.r', 'l.v.w', 'l.w', "l'exp.-compt.b.", 'l’exp.-compt.b', 'landinr.w', 'landscrt', 'larcier cass', 'lat', 'law.ed', 'lett', 'levensverz', 'lgrs', 'lidw', 'limb.rechtsl', 'lit', 'litt', 'liw', 'liwet', 'lk', 'll', 'll.(l.)l.r', 'loonw', 'losbl', 'ltd', 'luchtv', 'luchtv.w', 'm', 'm', 'not', 'm.a.v.o', 'm.a.w', 'm.b', 'm.b.o', 'm.b.r', 'm.b.t', 'm.d.g.o', 'm.e.a.o', 'm.e.r', 'm.h', 'm.h.d', 'm.i.v', 'm.j.t', 'm.k', 'm.m', 'm.m.a', 'm.m.h.h', 'm.m.v', 'm.n', 'm.not.fisc', 'm.nt', 'm.o', 'm.r', 'm.s.a', 'm.u.p', 'm.v.a', 'm.v.h.n', 'm.v.t', 'm.z', 'maatr.teboekgest.luchtv', 'maced', 'mand', 'max', 'mbl.not', 'me', 'med', 'med', 'v.b.o', 'med.b.u.f.r', 'med.bufr', 'med.vbo', 'meerv', 'meetbr.w', 'mém.adm', 'mgr', 'mgrs', 'mhd', 'mi.verantw', 'mil', 'mil.bed', 'mil.ger', 'min', 'min', 'aanbev', 'min', 'circ', 'min', 'fin', 'min.j.omz', 'min.just.circ', 'mitt', 'mnd', 'mod', 'mon', 'monde ass', 'mouv.comm', 'mr', 'ms', 'muz', 'mv', 'mva ii inv', 'mva inv', 'n cont', 'n', 'chr', 'n.a', 'n.a.g', 'n.a.v', 'n.b', 'n.c', 'n.chr', 'n.d', 'n.d.r', 'n.e.a', 'n.g', 'n.h.b.c', 'n.j', 'n.j.b', 'n.j.w', 'n.l', 'n.m', 'n.m.m', 'n.n', 'n.n.b', 'n.n.g', 'n.n.k', 'n.o.m', 'n.o.t.k', 'n.rapp', 'n.tijd.pol', 'n.v', 'n.v.d.r', 'n.v.d.v', 'n.v.o.b', 'n.v.t', 'nat.besch.w', 'nat.omb', 'nat.pers', 'ned.cult.r', 'neg.verkl', 'nhd', 'nieuw arch', 'wisk', 'njcm-bull', 'nl', 'nnd', 'no', 'not.fisc.m', 'not.w', 'not.wet', 'nr', 'nrs', 'nste', 'nt', 'numism', 'o', 'o.a', 'o.b', 'o.c', 'o.g', 'o.g.v', 'o.i', 'o.i.d', 'o.m', 'o.o', 'o.o.d', 'o.o.v', 'o.p', 'o.r', 'o.regl', 'o.s', 'o.t.s', 'o.t.t', 'o.t.t.t', 'o.t.t.z', 'o.tk.t', 'o.v.t', 'o.v.t.t', 'o.v.tk.t', 'o.v.v', 'ob', 'obsv', 'octr', 'octr.gem.regl', 'octr.regl', 'oe', 'oecd mod', 'off.pol', 'ofra', 'ohd', 'omb', 'omnia frat', 'omnil', 'omz', 'on.ww', 'onderr', 'onfrank', 'onteig.w', 'ontw', 'b.w', 'onuitg', 'onz', 'oorl.w', 'op.cit', 'opin.pa', 'opm', 'or', 'ord.br', 'ord.gem', 'ors', 'orth', 'os', 'osm', 'ov', 'ov.w.i', 'ov.w.ii', 'ov.ww', 'overg.w', 'overw', 'ovkst', 'ow kadasterw', 'oz', 'p', 'p.& b', 'p.a', 'p.a.o', 'p.b.o', 'p.e', 'p.g', 'p.j', 'p.m', 'p.m.a', 'p.o', 'p.o.j.t', 'p.p', 'p.v', 'p.v.s', 'pachtw', 'pag', 'pan', 'pand.b', 'pand.pér', 'parl.gesch', 'parl.gesch', 'inv', 'parl.st', 'part.arb', 'pas', 'pasin', 'pat', 'pb.c', 'pb.l', 'pens', 'pensioenverz', 'per.ber.i.b.r', 'per.ber.ibr', 'pers', 'st', 'pft', 'pg wijz.rv', 'pk', 'pktg', 'pli jur', 'plv', 'po', 'pol', 'pol.off', 'pol.r', 'pol.w', 'politie j', 'postbankw', 'postw', 'pp', 'pr', 'preadv', 'pres', 'prf', 'prft', 'prg', 'prijz.w', 'pro jus', 'proc', 'procesregl', 'prof', 'prot', 'prov', 'prov.b', 'prov.instr.h.m.g', 'prov.regl', 'prov.verord', 'prov.w', 
'publ', 'publ.cour eur.d.h', 'publ.eur.court h.r', 'pun', 'pw', 'q.b.d', 'q.e.d', 'q.q', 'q.r', 'r', 'r.a.b.g', 'r.a.c.e', 'r.a.j.b', 'r.b.d.c', 'r.b.d.i', 'r.b.s.s', 'r.c', 'r.c.b', 'r.c.d.c', 'r.c.j.b', 'r.c.s.j', 'r.cass', 'r.d.c', 'r.d.i', 'r.d.i.d.c', 'r.d.j.b', 'r.d.j.p', 'r.d.p.c', 'r.d.s', 'r.d.t.i', 'r.e', 'r.f.s.v.p', 'r.g.a.r', 'r.g.c.f', 'r.g.d.c', 'r.g.f', 'r.g.z', 'r.h.a', 'r.i.c', 'r.i.d.a', 'r.i.e.j', 'r.i.n', 'r.i.s.a', 'r.j.d.a', 'r.j.i', 'r.k', 'r.l', 'r.l.g.b', 'r.med', 'r.med.rechtspr', 'r.n.b', 'r.o', 'r.orde apoth', 'r.ov', 'r.p', 'r.p.d.b', 'r.p.o.t', 'r.p.r.j', 'r.p.s', 'r.r.d', 'r.r.s', 'r.s', 'r.s.v.p', 'r.stvb', 'r.t.d.f', 'r.t.d.h', 'r.t.l', 'r.trim.dr.eur', 'r.v.a', 'r.verkb', 'r.w', 'r.w.d', 'rap.ann.c.a', 'rap.ann.c.c', 'rap.ann.c.e', 'rap.ann.c.s.j', 'rap.ann.ca', 'rap.ann.cass', 'rap.ann.cc', 'rap.ann.ce', 'rap.ann.csj', 'rapp', 'rb', 'rb.kh', 'rb.van kh', 'rdn', 'rdnr', 're.pers', 'rec', 'rec.c.i.j', 'rec.c.j.c.e', 'rec.cij', 'rec.cjce', 'rec.cour eur.d.h', 'rec.gén.enr.not', 'rec.lois decr.arr', 'rechtsk.t', 'rechtspl.zeem', 'rechtspr.arb.br', 'rechtspr.b.f.e', 'rechtspr.bfe', 'rechtspr.soc.r.b.l.n', 'recl.reg', 'rect', 'red', 'reg', 'reg.huiz.bew', 'reg.w', 'registr.w', 'regl', 'regl', 'r.v.k', 'regl.besl', 'regl.onderr', 'regl.r.t', 'rep', 'rep.eur.court h.r', 'rép.fisc', 'rép.not', 'rep.r.j', 'rep.rj', 'req', 'res', 'resp', 'rev', 'rev', 'de dr', 'comp', 'rev', 'trim', 'de dr', 'civ', 'rev', 'trim', 'de dr', 'comm', 'rev.acc.trav', 'rev.adm', 'rev.b.compt', 'rev.b.dr.const', 'rev.b.dr.intern', 'rev.b.séc.soc', 'rev.banc.fin', 'rev.comm', 'rev.cons.prud', 'rev.dr.b', 'rev.dr.commun', 'rev.dr.étr', 'rev.dr.fam', 'rev.dr.intern.comp', 'rev.dr.mil', 'rev.dr.min', 'rev.dr.pén', 'rev.dr.pén.mil', 'rev.dr.rur', 'rev.dr.u.l.b', 'rev.dr.ulb', 'rev.exp', 'rev.faill', 'rev.fisc', 'rev.gd', 'rev.hist.dr', 'rev.i.p.c', 'rev.ipc', 'rev.not.b', 'rev.prat.dr.comm', 'rev.prat.not.b', 'rev.prat.soc', 'rev.rec', 'rev.rw', 'rev.trav', 'rev.trim.d.h', 'rev.trim.dr.fam', 'rev.urb', 'richtl', 'riv.dir.int', 'riv.dir.int."le priv', 'riv.dir.int.priv.proc', 'rk', 'rln', 'roln', 'rom', 'rondz', 'rov', 'rtl', 'rubr', 'ruilv.wet', 'rv.verdr', 'rvkb', 's', 's', 'en s', 's.a', 's.b.n', 's.ct', 's.d', 's.e.c', 's.e.et.o', 's.e.w', 's.exec.rept', 's.hrg', 's.j.b', 's.l', 's.l.e.a', 's.l.n.d', 's.p.a', 's.s', 's.t', 's.t.b', 's.v', 's.v.p', 'samenw', 'sc', 'sch', 'scheidsr.uitspr', 'schepel.besl', 'secr.comm', 'secr.gen', 'sect.soc', 'sess', 'cas', 'sir', 'soc', 'best', 'soc', 'handv', 'soc', 'verz', 'soc.act', 'soc.best', 'soc.kron', 'soc.r', 'soc.sw', 'soc.weg', 'sofi-nr', 'somm', 'somm.ann', 'sp.c.c', 'sr', 'ss', 'st.doc.b.c.n.a.r', 'st.doc.bcnar', 'st.vw', 'stagever', 'stas', 'stat', 'stb', 'stbl', 'stcrt', 'stichting i.v', 'stud.dipl', 'su', 'subs', 'subst', 'succ.w', 'suppl', 'sv', 'sw', 't', 't.a', 't.a.a', 't.a.n', 't.a.p', 't.a.s.n', 't.a.v', 't.a.v.w', 't.aann', 't.acc', 't.agr.r', 't.app', 't.b.b.r', 't.b.h', 't.b.m', 't.b.o', 't.b.p', 't.b.r', 't.b.s', 't.b.v', 't.bankw', 't.belg.not', 't.desk', 't.e.m', 't.e.p', 't.f.r', 't.fam', 't.fin.r', 't.g.r', 't.g.t', 't.g.v', 't.gem', 't.gez', 't.huur', 't.i.n', 't.in b.z', 't.j.k', 't.l.l', 't.l.v', 't.m', 't.m.r', 't.m.w', 't.mil.r', 't.mil.strafr', 't.not', 't.o', 't.o.r.b', 't.o.v', 't.ontv', 't.orde geneesh', 't.p.r', 't.pol', 't.r', 't.r.d.& i', 't.r.g', 't.r.o.s', 't.r.v', 't.s.r', 't.strafr', 't.t', 't.u', 't.v.c', 't.v.g', 't.v.m.r', 't.v.o', 't.v.v', 't.v.v.d.b', 't.v.w', 't.verz', 't.vred', 't.vreemd', 't.w', 't.w.k', 
't.w.v', 't.w.v.r', 't.wrr', 't.z', 't.z.t', 't.z.v', 'taalk', 'tar.burg.z', 'td', 'techn', 'telecomm', 'toel', 'toel.st.v.w', 'toep', 'toep.regl', 'tom', 'top', 'trans.b', 'transp.r', 'trav.com.ét.et lég.not', 'trb', 'trib', 'trib.civ', 'trib.gr.inst', 'ts', 'ts', 'best', 'ts', 'verv', 'turnh.rechtsl', 'tvpol', 'tvpr', 'tvrechtsgesch', 'tw', 'u', 'u.a', 'u.a.r', 'u.a.v', 'u.c', 'u.c.c', 'u.g', 'u.p', 'u.s', 'u.s.d.c', 'uitdr', 'uitl.w', 'uitv.besch.div.b', 'uitv.besl', 'uitv.besl', 'succ.w', 'uitv.besl.bel.rv', 'uitv.besl.l.b', 'uitv.reg', 'inv.w', 'uitv.reg.bel.d', 'uitv.reg.afd.verm', 'uitv.reg.lb', 'uitv.reg.succ.w', 'univ', 'univ.verkl', 'v', 'v', 'chr', 'v.& f', 'v.a', 'v.a.v', 'v.bp prot', 'v.c', 'v.chr', 'v.h', 'v.huw.verm', 'v.i', 'v.i.o', 'v.k.a', 'v.m', 'v.o.f', 'v.o.n', 'v.onderh.verpl', 'v.p', 'v.r', 'v.s.o', 'v.t.t', 'v.t.t.t', 'v.tk.t', 'v.toep.r.vert', 'v.v.b', 'v.v.g', 'v.v.t', 'v.v.t.t', 'v.v.tk.t', 'v.w.b', 'v.z.m', 'vb', 'vb.bo', 'vbb', 'vc', 'vd', 'veldw', 'ver.k', 'ver.verg.gem', 'gem.comm', 'verbr', 'verd', 'verdr', 'verdr.v', 'verdrag benel.i.z', 'tek.mod', 'verenw', 'verg', 'verg.fr.gem', 'comm', 'verkl', 'verkl.herz.gw', 'verl', 'deelw', 'vern', 'verord', 'vers.r', 'versch', 'versl.c.s.w', 'versl.csw', 'vert', 'verw', 'verz', 'verz.w', 'verz.wett.besl', 'verz.wett.decr.besl', 'vgl', 'vid', 'vigiles jb', 'viss.w', 'vl.parl', 'vl.r', 'vl.t.gez', 'vl.w.reg', 'vl.w.succ', 'vlg', 'vn', 'vnl', 'vnw', 'vo', 'vo.bl', 'voegw', 'vol', 'volg', 'volt', 'deelw', 'voorl', 'voorz', 'vord.w', 'vorst.d', 'vr', 'en antw', 'vred', 'vrg', 'vnw', 'vrijgrs', 'vs', 'vt', 'vvsr jb', 'vw', 'vz', 'vzngr', 'vzr', 'w', 'w.a', 'w.b.r', 'w.c.h', 'w.conf.huw', 'w.conf.huwelijksb', 'w.consum.kr', 'w.f.r', 'w.g', 'w.gelijke beh', 'w.gew.r', 'w.ident.pl', 'w.just.doc', 'w.kh', 'w.l.r', 'w.l.v', 'w.mil.straf.spr', 'w.n', 'w.not.ambt', 'w.o', 'w.o.d.huurcomm', 'w.o.d.k', 'w.openb.manif', 'w.parl', 'w.r', 'w.reg', 'w.succ', 'w.u.b', 'w.uitv.pl.verord', 'w.v', 'w.v.k', 'w.v.m.s', 'w.v.r', 'w.v.w', 'w.venn', 'wac', 'wd', 'wet a.b', 'wet bel.rv', 'wet c.a.o', 'wet c.o', 'wet div.bel', 'wet ksbel', 'wet l.v', 'wetb', 'n.v.h', 'wgb', 'winkelt.w', 'wisk', 'wka-verkl', 'wnd', 'won.w', 'woningw', 'woonr.w', 'wrr', 'wrr.ber', 'wrsch', 'ws', 'wsch', 'wsr', 'wtvb', 'ww', 'x.d', 'z cont', 'z.a', 'z.g', 'z.i', 'z.j', 'z.o.z', 'z.p', 'z.s.m', 'zesde richtl', 'zg', 'zgn', 'zn', 'znw', 'zr', 'zr', 'ms', 'zr.ms']).freeze
10 | PREPOSITIVE_ABBREVIATIONS = [].freeze
11 | NUMBER_ABBREVIATIONS = [].freeze
12 | end
13 |
14 | end
15 | end
16 | end
17 |
--------------------------------------------------------------------------------
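A minimal usage sketch for the Dutch abbreviation list above, assuming Dutch is selected with the ISO code 'nl' (mirroring how the specs use 'es' and 'hy'); the sample sentence is hypothetical. Note that PREPOSITIVE_ABBREVIATIONS and NUMBER_ABBREVIATIONS are empty for Dutch, so only the plain abbreviation set is in play.

require 'pragmatic_segmenter'

# 'a.u.b' appears in ABBREVIATIONS above, so the period after "a.u.b." should not
# be treated as a sentence boundary.
text = "Neem a.u.b. uw paspoort mee. Dat is belangrijk."
PragmaticSegmenter::Segmenter.new(text: text, language: 'nl').segment
# Presumably => ["Neem a.u.b. uw paspoort mee.", "Dat is belangrijk."]
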
/lib/pragmatic_segmenter/languages/italian.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module PragmaticSegmenter
4 | module Languages
5 | module Italian
6 | include Languages::Common
7 |
8 | module Abbreviation
9 | ABBREVIATIONS = Set.new(['1°', 'a.c', 'a.c/a', 'a.cam', 'a.civ', 'a.cor', 'a.d.r', 'a.gov', 'a.mil', 'a.mon', 'a.smv', 'a.v', 'a/a', 'a/c', 'a/i', 'aa', 'aaaa', 'aaal', 'aacst', 'aamct', 'aams', 'aar', 'aato', 'ab', 'abbigl', 'abbrev', 'abc', 'abi', 'abl', 'abm', 'abr', 'abs', 'absp', 'ac', 'acam', 'acb', 'acbi', 'acc', 'accorc', 'accr', 'acd', 'ace', 'acec', 'acep', 'aci', 'acli', 'acp', 'acro', 'acsit', 'actl', 'ad', 'ad.mil', 'ada', 'adap', 'adatt', 'adc', 'add', 'adei', 'adeion', 'adhd', 'adi', 'adisco', 'adj', 'adm', 'adp', 'adr', 'ads', 'adsi', 'adsl', 'adv', 'ae.b', 'aefi', 'aer', 'aerodin', 'aeron', 'afa', 'afc', 'afci', 'affl', 'afi', 'afic', 'afm', 'afp', 'ag', 'agcm', 'agcom', 'age', 'agecs', 'agesci', 'agg', 'agip', 'agis', 'agm', 'ago', 'agr', 'agric', 'agt', 'ai', 'aia', 'aiab', 'aiac', 'aiace', 'aiap', 'aias', 'aiat', 'aib', 'aic', 'aica', 'aicel', 'aici', 'aics', 'aid', 'aida', 'aidaa', 'aidac', 'aidama', 'aidda', 'aidim', 'aido', 'aids', 'aies', 'aif', 'aih', 'aiip', 'aimi', 'aip', 'aipsc', 'airi', 'ais', 'aisa', 'aism', 'aiss', 'aissca', 'aitc', 'aiti', 'aitr', 'aits', 'aka', 'al', 'alai', 'alch', 'alg', 'ali', 'alim', 'all', 'allev', 'allus', 'alp', 'alq', 'alt', 'am', 'ama', 'amaci', 'amag', 'amami', 'amc', 'ammec', 'amn', 'ampas', 'amps', 'an', 'ana', 'anaai', 'anac', 'anaci', 'anad', 'anai', 'anaoo', 'anart', 'anat', 'anat. comp', 'ancci', 'anci', 'ancip', 'ancsa', 'andit', 'anec', 'anee', 'anem', 'anes', 'anffas', 'ani', 'ania', 'anica', 'anie', 'animi', 'anis', 'anisc', 'anm', 'anmfit', 'anmig', 'anmil', 'anmli', 'anms', 'anpa', 'anpas', 'anpci', 'anpe', 'anpi', 'ansi', 'ansv', 'ant', 'anta', 'antifr', 'antlo', 'anton', 'antrop', 'anusca', 'anvi', 'anx', 'ao', 'ap', 'apa', 'apd', 'apea', 'apec', 'apet', 'api', 'apos', 'app', 'app.sc', 'apr', 'aps', 'apt', 'aq', 'ar', 'ar.ind', 'ar.rep', 'arald', 'arame', 'arc', 'arch', 'archeol', 'arci', 'ardsu', 'are', 'arg', 'aritm', 'arpa', 'arpat', 'arred', 'arrt', 'arsia', 'art', 'arti min', 'artig', 'artigl', 'artt', 'as', 'asa', 'asae', 'asc', 'asci', 'ascii', 'ascom', 'ascop', 'asd', 'ase', 'asf', 'asfer', 'asg', 'asic', 'asifa', 'asl', 'asmdc', 'asmi', 'asp', 'aspic', 'aspp', 'assi', 'assic', 'assol', 'asst', 'aster', 'astr', 'astrol', 'astron', 'at', 'ata', 'atb', 'atic', 'atm', 'ats', 'att', 'attrav', 'atv', 'au', 'auc', 'aus', 'auser', 'aut', 'autom', 'av', 'avi', 'avis', 'avo', 'avv', 'avvers', 'awb', 'awdp', 'az', 'azh', 'b.a', 'b2b', 'b2c', 'ba', 'bafta', 'bal', 'ball', 'ban', 'banc', 'bar', 'bart', 'bas', 'bat', 'batt', 'bban', 'bbc', 'bbl', 'bbs', 'bbtc', 'bcc', 'bce', 'bcf', 'bdf', 'bei', 'bep', 'bers', 'bg', 'bi', 'bibl', 'bic', 'bioch', 'biol', 'bl', 'bld', 'bldg', 'blpc', 'bm', 'bmps', 'bmw', 'bn', 'bna', 'bncf', 'bncrm', 'bni', 'bnl', 'bo', 'bot', 'bpl', 'bpm', 'bpn', 'bpr', 'br', 'brd', 'bre', 'bric', 'brig', 'brig.ca', 'brig.gen', 'bros', 'bs', 'bsc', 'bsp', 'bsu', 'bt', 'btc', 'btg', 'btg.l', 'btr', 'bts', 'bu', 'bur', 'bz', 'c.a', 'c.a.p', 'c.c.p', 'c.cost', 'c.d a', 'c.d', 'c.le', 'c.m', 'c.opv', 'c.p', 'c.s', 'c.v', 'c.v.d', 'c/a', 'c/c', 'c/pag', 'ca', 'ca.rep', 'ca.sm', 'ca.sz', 'ca.uf', 'caaf', 'cab', 'cad', 'cae', 'cai', 'cal', 'cam', 'cap', 'capol', 'capt', 'car', 'car.sc', 'carat', 'card', 'cas', 'casaca', 'casd', 'cass.civ', 'cat', 'caus', 'cav', 'cavg', 'cb', 'cbd', 'cbr', 'cbs', 'cc', 'cca', 'ccap', 'ccda', 'ccdp', 'ccee', 'cciaa', 'ccie', 'ccip', 'cciss', 'ccna', 'ccnl', 'ccnp', 'ccpb', 'ccs', 'ccsp', 'cctld', 'cctv', 'ccv', 'cd', 'cda', 'cdma', 'cdo', 'cdpd', 'cdr', 'cds', 'cdw', 'ce', 
'ced', 'cee', 'cei', 'cemat', 'cenelec', 'centr', 'cepis', 'ceps', 'cept', 'cerit', 'cese', 'cesis', 'cesvot', 'cet', 'cf', 'cfa', 'cfr', 'cg', 'cgi', 'cgil', 'cgs', 'ch', 'chf', 'chim', 'chim. ind', 'chir', 'ci', 'ci-europa', 'ciber', 'cicae', 'cid', 'cie', 'cif', 'cifej', 'cig', 'cigs', 'cii', 'cilea', 'cilo', 'cim', 'cime', 'cin', 'cinit', 'cio', 'cipe', 'cirm', 'cisal', 'ciscs', 'cisd', 'cisl', 'cism', 'citol', 'cl', 'class', 'cli', 'cm', 'cmdr', 'cme', 'cmo', 'cmr', 'cms', 'cmyk', 'cm²', 'cm³', 'cn', 'cna', 'cnb', 'cnc', 'cnel', 'cngei', 'cni', 'cnipa', 'cnit', 'cnn', 'cnr', 'cns', 'cnt', 'cnvvf', 'co', 'co.ing', 'co.sa', 'cobas', 'coc', 'cod', 'cod. civ', 'cod. deont. not', 'cod. pen', 'cod. proc. civ', 'cod. proc. pen', 'codec', 'coi', 'col', 'colf', 'coll', 'com', 'comdr', 'comm', 'comp', 'compar', 'compl', 'con', 'conai', 'conc', 'concl', 'condiz', 'confetra', 'confitarma', 'confr', 'cong', 'congeav', 'congiunt', 'coni', 'coniug', 'consec', 'consob', 'contab', 'contr', 'coreco', 'corp', 'corr', 'correl', 'corrisp', 'cosap', 'cospe', 'cost', 'costr', 'cpc', 'cpdel', 'cpe', 'cpi', 'cpl', 'cpt', 'cpu', 'cr', 'cral', 'credem', 'crf', 'cri', 'cric', 'cristall', 'crm', 'cro', 'cron', 'crsm', 'crt', 'cs', 'csa', 'csai', 'csc', 'csm', 'csn', 'css', 'ct', 'ctc', 'cti', 'ctr', 'ctsis', 'cuc', 'cud', 'cun', 'cup', 'cusi', 'cvb', 'cvbs', 'cwt', 'cz', 'd', 'd.c', 'd.i.a', 'dab', 'dac', 'dam', 'dams', 'dat', 'dau', 'db', 'dbms', 'dc', 'dca', 'dccc', 'dda', 'ddp', 'ddr', 'ddt', 'dea', 'decoraz', 'dect', 'dek', 'denom', 'deriv', 'derm', 'determ', 'df', 'dfp', 'dg', 'dga', 'dhcp', 'di', 'dia', 'dial', 'dic', 'dicomac', 'dif', 'difett', 'dig. iv', 'digos', 'dimin', 'dimostr', 'din', 'dipart', 'diplom', 'dir', 'dir. amm', 'dir. can', 'dir. civ', 'dir. d. lav', 'dir. giur', 'dir. internaz', 'dir. it', 'dir. pen', 'dir. priv', 'dir. proces', 'dir. pub', 'dir. rom', 'disus', 'diy', 'dl', 'dlf', 'dm', 'dme', 'dmf', 'dmo', 'dmoz', 'dm²', 'dm³', 'dnr', 'dns', 'doa', 'doc', 'docg', 'dom', 'dop', 'dos', 'dott', 'dpa', 'dpi', 'dpl', 'dpof', 'dps', 'dpt', 'dr', 'dra', 'drm', 'drs', 'dry pt', 'ds', 'dslam', 'dspn', 'dss', 'dtc', 'dtmf', 'dtp', 'dts', 'dv', 'dvb', 'dvb-t', 'dvd', 'dvi', 'dwdm', 'e.g', 'e.p.c', 'ead', 'eafrd', 'ean', 'eap', 'easw', 'eb', 'eban', 'ebr', 'ebri', 'ebtn', 'ecc', 'eccl', 'ecdl', 'ecfa', 'ecff', 'ecg', 'ecm', 'econ', 'econ. az', 'econ. dom', 'econ. pol', 'ecpnm', 'ed', 'ed agg', 'edge', 'edi', 'edil', 'edit', 'ef', 'efa', 'efcb', 'efp', 'efsa', 'efta', 'eg', 'egiz', 'egl', 'egr', 'ei', 'eisa', 'elab', 'elettr', 'elettron', 'ellitt', 'emap', 'emas', 'embr', 'emdr', 'emi', 'emr', 'en', 'enaip', 'enal', 'enaoli', 'enapi', 'encat', 'enclic', 'enea', 'enel', 'eni', 'enigm', 'enit', 'enol', 'enpa', 'enpaf', 'enpals', 'enpi', 'enpmf', 'ens', 'entom', 'epd', 'epigr', 'epirbs', 'epl', 'epo', 'ept', 'erc', 'ercom', 'ermes', 'erp', 'es', 'esa', 'escl', 'esist', 'eso', 'esp', 'estens', 'estr. 
min', 'etacs', 'etf', 'eti', 'etim', 'etn', 'etol', 'eu', 'eufem', 'eufic', 'eula', 'eva®', 'f.a', 'f.b', 'f.m', 'f.p', 'fa', 'fabi', 'fac', 'facl', 'facs', 'fad', 'fai', 'faile', 'failp', 'failpa', 'faisa', 'falcri', 'fam', 'famar', 'fans', 'fao', 'fapav', 'faq', 'farm', 'fasi', 'fasib', 'fatt', 'fbe', 'fbi', 'fc', 'fco', 'fcp', 'fcr', 'fcu', 'fdi', 'fe', 'feaog', 'feaosc', 'feb', 'fedic', 'fema', 'feoga', 'ferr', 'fesco', 'fesr', 'fess', 'fg', 'fi', 'fiaf', 'fiaip', 'fiais', 'fialtel', 'fiap', 'fiapf', 'fiat', 'fiavet', 'fic', 'ficc', 'fice', 'fidal', 'fidam', 'fidapa', 'fieg', 'fifa', 'fifo', 'fig', 'figc', 'figs', 'filat', 'filcams', 'file', 'filol', 'filos', 'fim', 'fima', 'fimmg', 'fin', 'finco', 'fio', 'fioto', 'fipe', 'fipresci', 'fis', 'fisar', 'fisc', 'fisg', 'fisiol', 'fisiopatol', 'fistel', 'fit', 'fita', 'fitav', 'fits', 'fiv', 'fivet', 'fivl', 'flo', 'flpd', 'fluid pt', 'fm', 'fmcg', 'fmi', 'fmth', 'fnas', 'fnomceo', 'fnsi', 'fob', 'fod', 'folcl', 'fon', 'fop', 'fotogr', 'fp', 'fpc', 'fpld', 'fr', 'fra', 'fs', 'fsc', 'fse', 'fsf', 'fsfi', 'fsh', 'ft', 'ftase', 'ftbcc', 'fte', 'ftp', 'fts', 'ft²', 'ft³', 'fuaav', 'fut', 'fv', 'fvg', 'g.fv', 'g.u', 'g.u.el', 'gal', 'gats', 'gatt', 'gb', 'gc', 'gccc', 'gco', 'gcost', 'gd', 'gdd', 'gdf', 'gdi', 'gdo', 'gdp', 'ge', 'gea', 'gel', 'gen', 'geneal', 'geod', 'geofis', 'geogr', 'geogr. antr', 'geogr. fis', 'geol', 'geom', 'gep', 'germ', 'gescal', 'gg', 'ggv', 'gi', 'gia', 'gides', 'gift', 'gio', 'giorn', 'gis', 'gisma', 'gismo', 'giu', 'gm', 'gmdss', 'gme', 'gmo', 'go', 'gov', 'gp', 'gpl', 'gprs', 'gps', 'gr', 'gr.sel.spec', 'gr.sel.tr', 'gr.sqd', 'gra', 'gram', 'grano', 'grd', 'grtn', 'grv', 'gsa', 'gsm', 'gsm-r', 'gsr', 'gtld', 'gu', 'guce', 'gui', 'gus', 'ha', 'haart', 'haccp', 'hba', 'hcg', 'hcrp', 'hd-dvd', 'hdcp', 'hdi', 'hdml', 'hdtv', 'hepa', 'hfpa', 'hg', 'hifi', 'hiperlan', 'hiv', 'hm', 'hmld', 'hon', 'hosp', 'hpv', 'hr', 'hrh', 'hrm', 'hrt', 'html', 'http', 'hvac', 'hz', 'i.e', 'i.g.m', 'iana', 'iasb', 'iasc', 'iass', 'iat', 'iata', 'iatse', 'iau', 'iban', 'ibid', 'ibm', 'icann', 'icao', 'icbi', 'iccu', 'ice', 'icf', 'ici', 'icm', 'icom', 'icon', 'ics', 'icsi', 'icstis', 'ict', 'icta', 'id', 'iden', 'idl', 'idraul', 'iec', 'iedm', 'ieee', 'ietf', 'ifat', 'ifel', 'ifla', 'ifrs', 'ifto', 'ifts', 'ig', 'igm', 'igmp', 'igp', 'iims', 'iipp', 'ilm', 'ilo', 'ilor', 'ils', 'im', 'imaie', 'imap', 'imc', 'imdb', 'imei', 'imi', 'imms', 'imo', 'imp', 'imper', 'imperf', 'impers', 'imq', 'ims', 'imsi', 'in', 'inail', 'inca', 'incb', 'inci', 'ind', 'ind. agr', 'ind. alim', 'ind. cart', 'ind. chim', 'ind. cuoio', 'ind. estratt', 'ind. graf', 'ind. mecc', 'ind. 
tess', 'indecl', 'indef', 'indeterm', 'indire', 'inea', 'inf', 'infea', 'infm', 'inform', 'ing', 'ingl', 'inmarsat', 'inpdai', 'inpdap', 'inpgi', 'inps', 'inr', 'inran', 'ins', 'insp', 'int', 'inter', 'intr', 'invar', 'invim', 'in²', 'in³', 'ioma', 'iosco', 'ip', 'ipab', 'ipasvi', 'ipi', 'ippc', 'ips', 'iptv', 'iq', 'ira', 'irap', 'ircc', 'ircs', 'irda', 'iref', 'ires', 'iron', 'irpef', 'irpeg', 'irpet', 'irreg', 'is', 'isae', 'isbd', 'isbn', 'isc', 'isdn', 'isee', 'isef', 'isfol', 'isg', 'isi', 'isia', 'ism', 'ismea', 'isnart', 'iso', 'isp', 'ispearmi', 'ispel', 'ispescuole', 'ispesl', 'ispo', 'ispro', 'iss', 'issn', 'istat', 'istol', 'isvap', 'it', 'iti', 'itt', 'ittiol', 'itu', 'iud', 'iugr', 'iulm', 'iva', 'iveco', 'ivg', 'ivr', 'ivs', 'iyhp', 'j', 'jal', 'jit', 'jr', 'jv', 'k', 'kb', 'kee', 'kg', 'kkk', 'klm', 'km', 'km/h', 'kmph', 'kmq', 'km²', 'kr', 'kw', 'kwh', 'l', 'l\'ing', 'l.n', 'l\'avv', 'la', 'lag', 'lan', 'lanc', 'larn', 'laser', 'lat', 'lav', 'lav. femm', 'lav. pubbl', 'laz', 'lb', 'lc', 'lcca', 'lcd', 'le', 'led', 'lett', 'lh', 'li', 'liaf', 'lib', 'lic', 'lic.ord', 'lic.strd', 'licd', 'lice', 'lida', 'lidci', 'liff', 'lifo', 'lig', 'liit', 'lila', 'lilt', 'linfa', 'ling', 'lipu', 'lis', 'lisaac', 'lism', 'lit', 'litab', 'lnp', 'lo', 'loc', 'loc. div', 'lolo', 'lom', 'long', 'lp', 'lrm', 'lrms', 'lsi', 'lsu', 'lt', 'ltd', 'lu', 'lug', 'luiss', 'lun', 'lwt', 'lww', 'm.a', 'm.b', 'm.o', 'm/s', 'ma', 'mac', 'macch', 'mag', 'magg.(maj)', 'magg.gen.(maj.gen.)', 'mai', 'maj', 'mar', 'mar.a', 'mar.ca', 'mar.ord', 'marc', 'mat', 'mater', 'max', 'mb', 'mbac', 'mc', 'mcl', 'mcpc', 'mcs', 'md', 'mdf', 'mdp', 'me', 'mec', 'mecc', 'med', 'mediev', 'mef', 'mer', 'merc', 'merid', 'mesa', 'messrs', 'metall', 'meteor', 'metr', 'metrol', 'mg', 'mgc', 'mgm', 'mi', 'mibac', 'mica', 'microb', 'mifed', 'miglio nautico', 'miglio nautico per ora', 'miglio nautico²', 'miglio²', 'mil', 'mile', 'miles/h', 'milesph', 'min', 'miner', 'mips', 'miptv', 'mit', 'mitol', 'miur', 'ml', 'mlle', 'mls', 'mm', 'mme', 'mms', 'mm²', 'mn', 'mnp', 'mo', 'mod', 'mol', 'mons', 'morf', 'mos', 'mpaa', 'mpd', 'mpeg', 'mpi', 'mps', 'mq', 'mr', 'mrs', 'ms', 'msgr', 'mss', 'mt', 'mto', 'murst', 'mus', 'mvds', 'mws', 'm²', 'm³', 'n.a', 'n.b', 'na', 'naa', 'nafta', 'napt', 'nars', 'nasa', 'nat', 'natas', 'nato', 'nb', 'nba', 'nbc', 'ncts', 'nd', 'nda', 'nde', 'ndr', 'ndt', 'ne', 'ned', 'neg', 'neol', 'netpac', 'neur', 'news!', 'ngcc', 'nhmf', 'nlcc', 'nmr', 'no', 'nodo', 'nom', 'nos', 'nov', 'novissdi', 'npi', 'nr', 'nt', 'nta', 'nts', 'ntsc', 'nu', 'nuct', 'numism', 'nwt', 'nyc', 'nz', 'o.m.i', 'oai-pmh', 'oav', 'oc', 'occ', 'occult', 'oci', 'ocr', 'ocse', 'oculist', 'od', 'odg', 'odp', 'oecd', 'oem', 'ofdm', 'oft', 'og', 'ogg', 'ogi', 'ogm', 'ohim', 'oic', 'oics', 'olaf', 'oland', 'ole', 'oled', 'omi', 'oms', 'on', 'ong', 'onig', 'onlus', 'onomat', 'onpi', 'onu', 'op', 'opac', 'opec', 'opord', 'opsosa', 'or', 'ord', 'ord. 
scol', 'ore', 'oref', 'orient', 'ornit', 'orogr', 'orp', 'ort', 'os', 'osa', 'osas', 'osd', 'ot', 'ote', 'ott', 'oz', 'p', 'p.a', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.m', 'p.r', 'p.s', 'p.t', 'p.v', 'pa', 'pac', 'pag./p', 'pagg./pp', 'pai', 'pal', 'paleobot', 'paleogr', 'paleont', 'paleozool', 'paletn', 'pamr', 'pan', 'papir', 'par', 'parapsicol', 'part', 'partic', 'pass', 'pat', 'patol', 'pb', 'pc', 'pci', 'pcm', 'pcmcia', 'pcs', 'pcss', 'pct', 'pd', 'pda', 'pdf', 'pdl', 'pds', 'pe', 'pec', 'ped', 'pedag', 'peg', 'pegg', 'per.ind', 'pers', 'pert', 'pesq', 'pet', 'petr', 'petrogr', 'pfc', 'pg', 'pga', 'pgp', 'pgut', 'ph', 'php', 'pi', 'pics', 'pie', 'pif', 'pii', 'pil', 'pime', 'pin', 'pine', 'pip', 'pir', 'pit', 'pitt', 'piuss', 'pkcs', 'pki', 'pko', 'pl', 'pli', 'plr', 'pm', 'pma', 'pmi', 'pmr', 'pn', 'pnf', 'pnl', 'po', 'poet', 'pof', 'pol', 'pop', 'popitt', 'popol', 'port', 'pos', 'poss', 'post', 'pots', 'pp', 'ppa', 'ppc', 'ppga', 'ppp', 'pps', 'pptt', 'ppv', 'pr', 'pra', 'praa', 'pref', 'preist', 'prep', 'pres', 'pret', 'prg', 'pri', 'priv', 'pro.civ', 'prof', 'pron', 'pronom', 'propr', 'prov', 'prs', 'prtl', 'prusst', 'ps', 'pse', 'psi', 'psicoan', 'psicol', 'pso', 'psp', 'pstn', 'pt', 'ptc', 'pti', 'ptsd', 'ptt', 'pu', 'pug', 'puk', 'put', 'pv', 'pvb', 'pvc', 'pvt', 'pz', 'qb', 'qcs', 'qfd', 'qg', 'qi', 'qlco', 'qlcu', 'qos', 'qualif', 'r-lan', 'r.s', 'ra', 'racc', 'radar', 'radc', 'radiotecn', 'raee', 'raf', 'rag', 'raid', 'ram', 'rar', 'ras', 'rass. avv. stato', 'rc', 'rca', 'rcdp', 'rcs', 'rdc', 'rdco', 'rdf', 'rdi', 'rdp', 'rds', 'rdt', 're', 'rea', 'recipr', 'recl', 'reg', 'region', 'rel', 'rem', 'rep', 'reps', 'res', 'retor', 'rev', 'rfi', 'rfid', 'rg', 'rgb', 'rgc', 'rge', 'rgi', 'rgi bdp', 'rgpt', 'rgt', 'ri', 'riaa', 'riaj', 'riba', 'ric', 'rid', 'rif', 'rifl', 'rina', 'rip', 'ris', 'rit', 'ritts', 'rm', 'rmn', 'rn', 'ro', 'roa', 'roc', 'roi', 'rom', 'roro', 'rov', 'rp', 'rpm', 'rr', 'rrf', 'rs', 'rsc', 'rspp', 'rss', 'rsu', 'rsvp', 'rt', 'rtdpc', 'rtg', 'rtn', 'rtp', 'rttt', 'rvm', 's-dab', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 's.ten', 's.v', 's/m', 'sa', 'sab', 'saca', 'sace', 'sact', 'sad', 'sag', 'sahm', 'sai', 'saisa', 'sam', 'san', 'sanas', 'sape', 'sar', 'sars', 'sart', 'sas', 'sbaf', 'sbas', 'sbn', 'sc', 'sca.sm', 'scherz', 'scien', 'scn', 'scsi', 'scuba', 'scult', 'scut', 'sdds', 'sdiaf', 'sds', 'sdsl', 'se', 'seat', 'sebc', 'sec', 'seca', 'secam', 'secc', 'see', 'seg', 'segg', 'segredifesa', 'sem', 'sempo', 'sen', 'sens', 'seo', 'serg', 'serg.magg.(sgm)', 'serg.magg.ca', 'set', 'sfc', 'sfis', 'sfx', 'sg', 'sga', 'sgc', 'sgg', 'sgml', 'sgt', 'si', 'si@lt', 'sia', 'siae', 'siaic', 'siap', 'sias', 'sic', 'sicav', 'sid', 'sido', 'sie', 'sif', 'sig', 'sig.na', 'sig.ra', 'sige', 'sigg', 'sigill', 'sigo', 'siia', 'simb', 'simbdea', 'simg', 'simo', 'sin', 'sinalv', 'sing', 'sins', 'sinu', 'siocmf', 'siog', 'sioi', 'siommms', 'siot', 'sip', 'sipem', 'sips', 'sirf', 'sirm', 'sis', 'sisde', 'sismi', 'sissa', 'sit', 'siulp', 'siusa', 'sla', 'sldn', 'slm', 'slr', 'sm', 'sma', 'smau', 'smd', 'sme', 'smes', 'smm', 'smpt', 'sms', 'sn', 'snad', 'snai', 'snc', 'sncci', 'sncf', 'sngci', 'snit', 'so', 'soc', 'sociol', 'sogg', 'soho', 'soi', 'sol', 'somipar', 'somm', 'sonar', 'sp', 'spa', 'spe', 'spett', 'spi', 'spm', 'spot', 'spp', 'spreg', 'sq', 'sqd', 'sr', 'srd', 'srl', 'srr', 'ss', 'ssi', 'ssn', 'ssr', 'sss', 'st', 'st. d. arte', 'st. d. dir', 'st. d. filos', 'st. d. 
rel', 'stat', 'stg', 'stp', 'stw', 'su', 'suap', 'suem', 'suff', 'sup', 'superl', 'supt', 'surg', 'surl', 'susm', 'sut', 'suv', 'sv', 'svga', 'swics', 'swift', 'swot', 'sxga', 'sz', 't-dab', 't.sg', 'ta', 'taa', 'tac', 'tacan', 'tacs', 'taeg', 'tai', 'tan', 'tar', 'targa', 'tav', 'tb', 'tbt', 'tci', 'tcp', 'tcp/ip', 'tcsm', 'tdm', 'tdma', 'te', 'tecn', 'tecnol', 'ted', 'tel', 'telecom', 'temp', 'ten.(lt)', 'ten.col.(ltc)', 'ten.gen', 'teol', 'term', 'tesa', 'tese', 'tesol', 'tess', 'tet', 'tetra', 'tfr', 'tft', 'tfts', 'tgv', 'thx', 'tim', 'tipogr', 'tir', 'tit', 'tld', 'tm', 'tmc', 'tn', 'to', 'toefl', 'ton', 'top', 'topog', 'tos', 'tosap', 'tosc', 'tp', 'tpl', 'tr', 'trad', 'tramat', 'trasp', 'ts', 'tso', 'tuir', 'tuld', 'tv', 'twa', 'twain', 'u.ad', 'u.s', 'ucai', 'ucca', 'ucei', 'ucina', 'uclaf', 'ucoi', 'ucoii', 'ucsi', 'ud', 'udc', 'udi', 'udp', 'ue', 'uefa', 'uemri', 'ufo', 'ugc', 'uhci', 'uhf', 'uht', 'uibm', 'uic', 'uicc', 'uiga', 'uil', 'uilps', 'uisp', 'uits', 'uk', 'ul', 'ull', 'uma', 'umb', 'ummc', 'umss', 'umts', 'unac', 'unar', 'unasp', 'uncem', 'unctad', 'undp', 'unefa', 'unep', 'unesco', 'ungh', 'unhcr', 'uni', 'unicef', 'unitec', 'unpredep', 'unsa', 'upa', 'upc', 'urar', 'urban', 'url', 'urp', 'urss', 'usa', 'usb', 'usfi', 'usga', 'usl', 'usp', 'uspi', 'ussr', 'utap', 'v', 'v.brig', 'v.cte', 'v.m', 'v.p', 'v.r', 'v.s', 'va', 'vab', 'vaio', 'val', 'vas', 'vb', 'vbr', 'vc', 'vcc', 'vcr', 'vda', 've', 'ven', 'ves', 'vesa', 'veter', 'vezz', 'vfb', 'vfp', 'vfx', 'vga', 'vhf', 'vhs', 'vi', 'via', 'vip', 'vis', 'vn', 'vo', 'voc', 'voip', 'vol', 'volg', 'voll', 'vor', 'vpdn', 'vpn', 'vr', 'vs', 'vsp', 'vt', 'vtc', 'vts', 'vtt', 'vv', 'vvf', 'wai', 'wais', 'wan', 'wap', 'wasp', 'wc', 'wcdma', 'wcm', 'wga', 'wi-fi', 'wipo', 'wisp', 'wll', 'wml', 'wms', 'worm', 'wp', 'wpan', 'wssn', 'wto', 'wwan', 'wwf', 'www', 'wygiwys', 'xl', 'xml', 'xs', 'xxl', 'xxs', 'yaf', 'yb', 'yci', 'yd', 'yd²', 'yd³', 'ymca', 'zat', 'zb', 'zcs', 'zdf', 'zdg', 'zift', 'zool', 'zoot', 'ztc', 'ztl', '°c', '°f', '°n', '°ra', '°ré', 'µg']).freeze
10 | PREPOSITIVE_ABBREVIATIONS = Set.new(['a.c', 'acc', 'adj', 'adm', 'adv', 'all', 'amn', 'arch', 'asst', 'avv', 'banc', 'bart', 'bcc', 'bldg', 'brig', 'bros', 'c.a', 'c.a.p', 'c.c.p', 'c.m', 'c.p', 'c.p', 'c.s', 'c.v', 'capt', 'cc', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'corr', 'cpl', 'dir', 'dott', 'dott', 'dr', 'dr', 'drs', 'e.p.c', 'ecc', 'egr', 'ens', 'es', 'fatt', 'gen', 'geom', 'gg', 'gov', 'hon', 'hosp', 'hr', 'id', 'ing', 'insp', 'int', "l'avv", "l'ing", 'lett', 'lt', 'maj', 'messrs', 'mlle', 'mm', 'mme', 'mo', 'mons', 'mr', 'mr', 'mrs', 'mrs', 'ms', 'ms', 'msgr', 'n.b', 'ogg', 'on', 'op', 'ord', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.r', 'p.s', 'p.t', 'p.v', 'pfc', 'ph', 'post', 'pp', 'prof', 'psicol', 'pvt', 'racc', 'rag', 'rep', 'reps', 'res', 'rev', 'ric', 'rif', 'rp', 'rsvp', 'rt', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 'seg', 'sen', 'sens', 'sfc', 'sgg', 'sgt', 'sig', 'sigg', 'soc', 'spett', 'sr', 'ss', 'st', 'supt', 'surg', 'tel', 'u.s', 'v.p', 'v.r', 'v.s']).freeze
11 | NUMBER_ABBREVIATIONS = Set.new(['art', 'no', 'nos', 'nr', 'pp']).freeze
12 | end
13 |
14 | class AbbreviationReplacer < AbbreviationReplacer
15 | SENTENCE_STARTERS = [].freeze
16 | end
17 | end
18 | end
19 | end
20 |
--------------------------------------------------------------------------------
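A similar hypothetical sketch for the Italian module above, assuming the ISO code 'it'. NUMBER_ABBREVIATIONS lists 'art', 'no', 'nos', 'nr' and 'pp', so a period such as the one in "art. 4" should not be read as a boundary before a digit; the empty SENTENCE_STARTERS override presumably disables the sentence-starter heuristic applied after abbreviations in other languages.

require 'pragmatic_segmenter'

# Hypothetical Italian sample exercising a number abbreviation from the set above.
text = "L'art. 4 della legge è chiaro. La norma è entrata in vigore nel 2012."
PragmaticSegmenter::Segmenter.new(text: text, language: 'it').segment
# Presumably => ["L'art. 4 della legge è chiaro.", "La norma è entrata in vigore nel 2012."]
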