├── VERSION
├── spec
    ├── spec.opts
    ├── spec_helper.rb
    └── lingua
    │   └── en
    │       ├── paragraph_spec.rb
    │       ├── readability_spec.rb
    │       └── sentence_spec.rb
├── .document
├── lib
    ├── lingua.rb
    └── lingua
    │   └── en
    │       ├── paragraph.rb
    │       ├── syllable.rb
    │       ├── sentence.rb
    │       ├── syllable
    │           └── guess.rb
    │       └── readability.rb
├── CHANGELOG.markdown
├── .gitignore
├── README.rdoc
├── LICENSE
├── Rakefile
└── lingua.gemspec


/VERSION:
--------------------------------------------------------------------------------
1 | 0.6.2
2 | 


--------------------------------------------------------------------------------
/spec/spec.opts:
--------------------------------------------------------------------------------
1 | --color
2 | 


--------------------------------------------------------------------------------
/.document:
--------------------------------------------------------------------------------
1 | README.rdoc
2 | lib/**/*.rb
3 | bin/*
4 | features/**/*.feature
5 | LICENSE
6 | 


--------------------------------------------------------------------------------
/lib/lingua.rb:
--------------------------------------------------------------------------------
1 | prefix = File.dirname(__FILE__) + "/" # directory of this file, with a trailing slash
2 | $LOAD_PATH.unshift prefix # so that e.g. require 'lingua/en/syllable/guess' resolves
3 | 
4 | Dir.glob(prefix + "**/*.rb").each do |f| # every .rb file in or below that directory
5 |   require File.expand_path(f)
6 | end
7 | 


--------------------------------------------------------------------------------
/CHANGELOG.markdown:
--------------------------------------------------------------------------------
1 | Current master
2 | -------------
3 | * Added more specs for readability [stuartellis, GH-1]
4 | 
5 | 0.6.2
6 | -----
7 | * Fix for sentences that end in abbreviations (e.g. dr, mrs, ms) [chad]
8 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | ## MAC OS
 2 | .DS_Store
 3 | 
 4 | ## TEXTMATE
 5 | *.tmproj
 6 | tmtags
 7 | 
 8 | ## EMACS
 9 | *~
10 | \#*
11 | .\#*
12 | 
13 | ## VIM
14 | *.swp
15 | 
16 | ## PROJECT::GENERAL
17 | coverage
18 | rdoc
19 | pkg
20 | 
21 | ## PROJECT::SPECIFIC
22 | 


--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
 1 | $LOAD_PATH.unshift(File.dirname(__FILE__)) # the spec directory itself
 2 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) # ../lib, so that require 'lingua' finds this checkout
 3 | require 'lingua'
 4 | require 'spec'
 5 | require 'spec/autorun'
 6 | 
 7 | Spec::Runner.configure do |config| # no custom configuration is set in this block
 8 |   
 9 | end
10 | 


--------------------------------------------------------------------------------
/lib/lingua/en/paragraph.rb:
--------------------------------------------------------------------------------
 1 | module Lingua
 2 |   module EN
 3 |     module Paragraph
 4 |       # Splits text on every run of line breaks (a newline plus any following \r, tab or space characters) and returns the stripped pieces as an array.
 5 |       def self.paragraphs(text)
 6 |         text.strip.split(/(?:\n[\r\t ]*)+/).collect { |p| p.strip }
 7 |       end
 8 |     end
 9 |   end
10 | end
11 | 


--------------------------------------------------------------------------------
/README.rdoc:
--------------------------------------------------------------------------------
 1 | = lingua
 2 | 
 3 | This library is originally from http://pressure.to/ruby, by Alex Fenton <alex@pressure.to>.
 4 | 
 5 | It is currently maintained by David Balatero <dbalatero@gmail.com>.
 6 | 
 7 | Slowly but surely, specs are being added (original codebase was not tested), as well as better functionality.
 8 | 
 9 | == Note on Patches/Pull Requests
10 |  
11 | * Fork the project.
12 | * Make your feature addition or bug fix.
13 | * Add tests for it. This is important so I don't break it in a
14 |   future version unintentionally.
15 | * Commit your changes, but do not modify the Rakefile, version, or history.
16 |   (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
17 | * Send me a pull request. Bonus points for topic branches.
18 | 
19 | == Copyright
20 | 
21 | Copyright (c) 2010 David Balatero. See LICENSE for details.
22 | 


--------------------------------------------------------------------------------
/spec/lingua/en/paragraph_spec.rb:
--------------------------------------------------------------------------------
 1 | require File.dirname(__FILE__) + "/../../spec_helper"
 2 | 
 3 | describe Lingua::EN::Paragraph do
 4 |   describe "#paragraphs" do
 5 |     it "should return paragraphs with extra whitespace in the line breaks" do
 6 |       text = "Ok.\n    \nTest." # the line between the two paragraphs holds only spaces
 7 |       result = Lingua::EN::Paragraph.paragraphs(text)
 8 |       result.should have(2).things
 9 |       result[0].should == "Ok."
10 |       result[1].should == "Test."
11 |     end
12 | 
13 |     it "should break up paragraphs with > 2 line breaks" do
14 |       text = "Ok.\n\n\nTest." # three consecutive newlines
15 |       result = Lingua::EN::Paragraph.paragraphs(text)
16 |       result.should have(2).things
17 |       result[0].should == "Ok."
18 |       result[1].should == "Test."
19 |     end
20 | 
21 |     it "should ignore trailing newline chars" do
22 |       text = "Ok.\n  \n\nTest.\n  \r\n  \n\n" # trailing newlines, spaces and a carriage return after the last paragraph
23 |       result = Lingua::EN::Paragraph.paragraphs(text)
24 |       result.should have(2).things
25 |       result[0].should == "Ok."
26 |       result[1].should == "Test."
27 |     end
28 |   end
29 | end
30 | 


--------------------------------------------------------------------------------
/lib/lingua/en/syllable.rb:
--------------------------------------------------------------------------------
 1 | require 'lingua/en/syllable/guess'
 2 | 
 3 | module Lingua
 4 |   module EN
 5 |     # The module Lingua::EN::Syllable contains a single class method,
 6 |     # +syllables+, which returns the number of syllables in a string
 7 |     # containing a word passed to it. It delegates to Guess.syllables.
 8 |     #
 9 |     ########## REMOVED BY dbalatero (kept for history; the code below only uses Guess):
10 |     # The exact definition of the function depends on the availability of the
11 |     # Carnegie Mellon Pronouncing Dictionary on the system. If it is available,
12 |     # the number of syllables as determined by the dictionary will be returned.
13 |     # If the dictionary is not available, or if a word not contained in the
14 |     # dictionary is passed, it will return the number of syllables as determined
15 |     # by the module Lingua::EN::Syllable::Guess. For more details, see there and
16 |     # Lingua::EN::Syllable::Dictionary.
17 |     module Syllable
18 |       def self.syllables(word)
19 |         Guess::syllables word
20 |       end
21 |     end
22 |   end
23 | end
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2009 David Balatero
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require 'rubygems'
 2 | require 'rake'
 3 | 
 4 | begin
 5 |   require 'jeweler'
 6 |   Jeweler::Tasks.new do |gem|
 7 |     gem.name = "lingua"
 8 |     gem.summary = %Q{This is a maintained version of Ruby's Lingua port.}
 9 |     gem.description = %Q{Provides sentence splitting, syllable, and text-quality algorithms.}
10 |     gem.email = "dbalatero@gmail.com"
11 |     gem.homepage = "http://github.com/dbalatero/lingua"
12 |     gem.authors = ["David Balatero"]
13 |     gem.add_development_dependency "rspec", ">= 1.2.9"
14 |     # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15 |   end
16 |   Jeweler::GemcutterTasks.new
17 | rescue LoadError
18 |   puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
19 | end
20 | 
21 | require 'spec/rake/spectask'
22 | Spec::Rake::SpecTask.new(:spec) do |spec|
23 |   spec.libs << 'lib' << 'spec'
24 |   spec.spec_files = FileList['spec/**/*_spec.rb']
25 | end
26 | 
27 | Spec::Rake::SpecTask.new(:rcov) do |spec|
28 |   spec.libs << 'lib' << 'spec'
29 |   spec.pattern = 'spec/**/*_spec.rb'
30 |   spec.rcov = true
31 | end
32 | 
33 | task :spec => :check_dependencies
34 | 
35 | task :default => :spec
36 | 
37 | require 'rake/rdoctask'
38 | Rake::RDocTask.new do |rdoc|
39 |   version = File.exist?('VERSION') ? File.read('VERSION') : ""
40 | 
41 |   rdoc.rdoc_dir = 'rdoc'
42 |   rdoc.title = "lingua #{version}"
43 |   rdoc.rdoc_files.include('README*')
44 |   rdoc.rdoc_files.include('lib/**/*.rb')
45 | end
46 | 


--------------------------------------------------------------------------------
/lingua.gemspec:
--------------------------------------------------------------------------------
 1 | # Generated by jeweler
 2 | # DO NOT EDIT THIS FILE DIRECTLY
 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
 4 | # -*- encoding: utf-8 -*-
 5 | 
 6 | Gem::Specification.new do |s|
 7 |   s.name = %q{lingua}
 8 |   s.version = "0.6.2"
 9 | 
10 |   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11 |   s.authors = ["David Balatero"]
12 |   s.date = %q{2010-07-25}
13 |   s.description = %q{Provides sentence splitting, syllable, and text-quality algorithms.}
14 |   s.email = %q{dbalatero@gmail.com}
15 |   s.extra_rdoc_files = [
16 |     "LICENSE",
17 |      "README.rdoc"
18 |   ]
19 |   s.files = [
20 |     ".document",
21 |      ".gitignore",
22 |      "CHANGELOG.markdown",
23 |      "LICENSE",
24 |      "README.rdoc",
25 |      "Rakefile",
26 |      "VERSION",
27 |      "lib/lingua.rb",
28 |      "lib/lingua/en/paragraph.rb",
29 |      "lib/lingua/en/readability.rb",
30 |      "lib/lingua/en/sentence.rb",
31 |      "lib/lingua/en/syllable.rb",
32 |      "lib/lingua/en/syllable/guess.rb",
33 |      "lingua.gemspec",
34 |      "spec/lingua/en/paragraph_spec.rb",
35 |      "spec/lingua/en/readability_spec.rb",
36 |      "spec/lingua/en/sentence_spec.rb",
37 |      "spec/spec.opts",
38 |      "spec/spec_helper.rb"
39 |   ]
40 |   s.homepage = %q{http://github.com/dbalatero/lingua}
41 |   s.rdoc_options = ["--charset=UTF-8"]
42 |   s.require_paths = ["lib"]
43 |   s.rubygems_version = %q{1.3.6}
44 |   s.summary = %q{This is a maintained version of Ruby's Lingua port.}
45 |   s.test_files = [
46 |     "spec/lingua/en/paragraph_spec.rb",
47 |      "spec/lingua/en/readability_spec.rb",
48 |      "spec/lingua/en/sentence_spec.rb",
49 |      "spec/spec_helper.rb"
50 |   ]
51 | 
52 |   if s.respond_to? :specification_version then
53 |     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION # assigned but not read again in this file
54 |     s.specification_version = 3
55 | 
56 |     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
57 |       s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
58 |     else # RubyGems older than 1.2.0: rspec is declared as a regular dependency
59 |       s.add_dependency(%q<rspec>, [">= 1.2.9"])
60 |     end
61 |   else # no specification_version support: same regular-dependency fallback
62 |     s.add_dependency(%q<rspec>, [">= 1.2.9"])
63 |   end
64 | end
65 | 
66 | 


--------------------------------------------------------------------------------
/lib/lingua/en/sentence.rb:
--------------------------------------------------------------------------------
 1 | module Lingua
 2 |   module EN
 3 |     # The class Lingua::EN::Sentence takes English text, and attempts to
 4 |     # split it up into sentences, respecting abbreviations.
 5 | 
 6 |     class Sentence
 7 |       class << self
 8 |         attr_reader :abbreviations
 9 |         attr_reader :abbr_regex
10 |       end
11 | 
12 |       EOS = "\001" unless defined?(EOS) # temporary end of sentence marker
13 | 
14 |       Titles   = [ 'jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'sen', 'rep',
15 |         'rev', 'gov', 'atty', 'supt', 'det', 'rev', 'col','gen', 'lt',
16 |         'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj' ] unless defined?(Titles)
17 |       Entities = [ 'dept', 'univ', 'uni', 'assn', 'bros', 'inc', 'ltd', 'co',
18 |         'corp', 'plc' ] unless defined?(Entities)
19 |       Months   = [ 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
20 |         'aug', 'sep', 'oct', 'nov', 'dec', 'sept' ] unless defined?(Months)
21 |       Days     = [ 'mon', 'tue', 'wed', 'thu',
22 |                    'fri', 'sat', 'sun' ] unless defined?(Days)
23 |       Misc     = [ 'vs', 'etc', 'no', 'esp', 'cf' ] unless defined?(Misc)
24 |       Streets  = [ 'ave', 'bld', 'blvd', 'cl', 'ct',
25 |                    'cres', 'dr', 'rd', 'st' ] unless defined?(Streets)
26 | 
27 | 
28 |       # Finds abbreviations, like e.g., i.e., U.S., u.S., U.S.S.R.
29 |       ABBR_DETECT = /(?:\s(?:(?:(?:\w\.){2,}\w?)|(?:\w\.\w)))/ unless defined?(ABBR_DETECT)
30 | 
31 |       # Finds punctuation that ends paragraphs.
32 |       PUNCTUATION_DETECT = /((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)/ unless defined?(PUNCTUATION_DETECT)
33 | 
34 |       CORRECT_ABBR = /(#{ABBR_DETECT})#{EOS}(\s+[a-z0-9])/
35 | 
36 |       # Split the passed text into individual sentences, trim these and return
37 |       # as an array. A sentence is marked by one of the punctuation marks ".", "?"
38 |       # or "!" followed by whitespace. Sequences of full stops (such as an
39 |       # ellipsis marker "..." and stops after a known abbreviation are ignored.
40 |       def self.sentences(text)
41 |         # Make sure we work with a duplicate, as we are modifying the
42 |         # text with #gsub!
43 |         text = text.dup
44 | 
45 |         # Mark end of sentences with EOS marker.
46 |         # We preserve the trailing whitespace ($2) so that we can
47 |         # fix ellipses (...)!
48 |         text.gsub!(PUNCTUATION_DETECT) { $1 << EOS << $2 }
49 | 
50 |         # Correct ellipsis marks.
51 |         text.gsub!(/(\.\.\.*)#{EOS}/) { $1 }
52 | 
53 |         # Correct e.g, i.e. marks.
54 |         text.gsub!(CORRECT_ABBR, "\\1\\2")
55 | 
56 |         # Correct abbreviations
57 |         text.gsub!(@abbr_regex) { $1 << '.' }
58 | 
59 |         # Split on EOS marker, get rid of trailing whitespace.
60 |         # Remove empty sentences.
61 |         text.split(EOS).
62 |           map { |sentence| sentence.strip }.
63 |           delete_if { |sentence| sentence.nil? || sentence.empty? }
64 |       end
65 | 
66 |       # Adds a list of abbreviations to the list that's used to detect false
67 |       # sentence ends. Return the current list of abbreviations in use.
68 |       def self.abbreviation(*abbreviations)
69 |         @abbreviations += abbreviations
70 |         @abbreviations.uniq!
71 |         set_abbr_regex!
72 |         @abbreviations
73 |       end
74 | 
75 |       def self.initialize_abbreviations!
76 |         @abbreviations = Titles + Entities + Months + Days + Streets + Misc
77 |         set_abbr_regex!
78 |       end
79 | 
80 |       def self.set_abbr_regex!
81 |         @abbr_regex = / (#{abbreviations.join("|")})\.#{EOS}/i
82 |       end
83 | 
84 |       initialize_abbreviations!
85 |     end
86 |   end
87 | end
88 | 


--------------------------------------------------------------------------------
/spec/lingua/en/readability_spec.rb:
--------------------------------------------------------------------------------
  1 | require File.dirname(__FILE__) + "/../../spec_helper"
  2 | 
  3 | describe Lingua::EN::Readability do
  4 |   before(:each) do # builds @text and a fresh @report before every example
  5 |     @text = <<-EOF
  6 |     After marriage, the next big event in the couples lives will be their honeymoon. It is a time when the newly weds can get away from relatives and friends to spend some significant time getting to know one another. This time alone together that the couple shares is called the honeymoon. A great gift idea for the married couple would be to give them a surprise tour package. Most women would like to go on a honeymoon.
  7 | 
  8 |     The week or two before the ceremonies would be the best time to schedule a tour because then the budget for this event could be considered. In winter there are more opportunities for the couple to get close to one another because of the cold weather. It is easier to snuggle when the weather is not favorable to outdoor activities. This would afford the couple ample time to know more about themselves during the honeymoon.
  9 | 
 10 |     Honeymoon plans should be discussed with the wife to ensure that the shock is pleasant and not a negative experience to her. It is also a good idea in this case, to ask her probing questions as to where she would like to go. Perhaps you could get a friend or family member to ask her what would be her favorite travel location. That would ensure that you know just what she is looking for.
 11 | 
 12 |     Make sure that the trip is exactly what she wants. Then on the wedding night tell her about the adventure so that the needed accommodations can be made.
 13 |     EOF
 14 |     # All expectations below refer to the sample text above.
 15 |     @report = Lingua::EN::Readability.new(@text)
 16 |   end
 17 | 
 18 |   describe "#flesch" do
 19 |     it "should be the correct Flesch Reading Ease" do
 20 |       @report.flesch.should be_close(71.471, 0.001) # be_close(expected, tolerance)
 21 |     end
 22 |   end
 23 | 
 24 |   describe "#fog" do
 25 |     it "should be the correct Gunning Fog Index" do
 26 |       @report.fog.should be_close(10.721, 0.001)
 27 |     end
 28 |   end
 29 | 
 30 |   describe "#kincaid" do
 31 |     it "should be the correct Flesch-Kincaid grade level" do
 32 |       @report.kincaid.should be_close(7.5, 0.1) # looser tolerance than the other float checks
 33 |     end
 34 |   end
 35 | 
 36 |   describe "#num_chars" do
 37 |     it "should be the correct count of characters" do
 38 |       @report.num_chars.should == 1405
 39 |     end
 40 |   end
 41 | 
 42 |   describe "#num_paragraphs" do
 43 |     it "should return the correct count of paragraphs" do
 44 |       @report.num_paragraphs.should == 4
 45 |     end
 46 |   end
 47 | 
 48 |   describe "#num_sentences" do
 49 |     it "should be the correct count of sentences" do
 50 |       @report.num_sentences.should == 15
 51 |     end
 52 |   end
 53 | 
 54 |   describe "#num_syllables" do
 55 |     it "should be the correct count of syllables" do
 56 |       @report.num_syllables.should == 356
 57 |     end
 58 |   end
 59 | 
 60 |   describe "#num_unique_words" do
 61 |     it "should be the correct count of unique words" do
 62 |       @report.num_unique_words.should == 141
 63 |     end
 64 |   end
 65 | 
 66 |   describe "#num_words" do
 67 |     it "should be the correct count of words" do
 68 |       @report.num_words.should == 255
 69 |     end
 70 |   end
 71 | 
 72 |   describe "#occurrences" do
 73 |     it "should return the correct count of occurrences of the word 'the'" do
 74 |       @report.occurrences('the').should == 20
 75 |     end
 76 |   end
 77 | 
 78 |   describe "#percent_fog_complex_words" do
 79 |     it "should be the correct percentage of complex words according to Fog Index" do
 80 |       @report.percent_fog_complex_words.should be_close(9.803, 0.001)
 81 |     end
 82 |   end
 83 | 
 84 |   describe "#syllables_per_word" do
 85 |     it "should be the correct average of syllables per word" do
 86 |       @report.syllables_per_word.should be_close(1.396, 0.001)
 87 |     end
 88 |   end
 89 | 
 90 |   describe "#unique_words" do
 91 |     it "should be an array of unique words" do
 92 |       unique_words = @report.unique_words
 93 |       unique_words.should be_a(Array)
 94 |       unique_words.length.should == 141 # same count as the num_unique_words example
 95 |     end
 96 |   end
 97 | 
 98 |   describe "#words_per_sentence" do
 99 |     it "should be the correct count of words per sentence" do
100 |       @report.words_per_sentence.should be_close(17.0, 0.001) # 255 words / 15 sentences
101 |     end
102 |   end
103 |   
104 | end
105 | 


--------------------------------------------------------------------------------
/lib/lingua/en/syllable/guess.rb:
--------------------------------------------------------------------------------
 1 | module Lingua
 2 |   module EN
 3 |     module Syllable
 4 |       # Uses English word patterns to guess the number of syllables. A single
 5 |       # module method is made available, +syllables+, which, when passed an
 6 |       # English word, will return the number of syllables it estimates are in
 7 |       # the word.
 8 |       #
 9 |       # English orthography (the representation of spoken sounds as written
10 |       # signs) is not regular. The same spoken sound can be represented in
11 |       # multiple different ways in written English (e.g. rough/cuff), and the
12 |       # same written letters can be pronounced in different ways in different
13 |       # words (e.g. rough/bough).
14 |       #
15 |       # As the same series of letters can be pronounced in different ways, it is
16 |       # not possible to write an algorithm which can always guess the number of
17 |       # syllables in an English word correctly. However, it is possible to use
18 |       # frequently recurring patterns in English (such as "a final -e is usually
19 |       # silent") to guess with a level of accuracy that is acceptable for
20 |       # applications like syllable counting for readability scoring. This module
21 |       # implements such an algorithm.
22 |       #
23 |       # This module is inspired by the Perl Lingua::EN::Syllable module.
24 |       # However, it uses a different (though not larger) set of patterns to
25 |       # compensate for the 'special cases' which arise out of English's
26 |       # irregular orthography. A number of extra patterns (particularly for
27 |       # derived word forms) means that this module is somewhat more accurate
28 |       # than the Perl original. It also omits a number of patterns found in the
29 |       # original which seem to me to apply to such a small number of cases, or
30 |       # to be of dubious value. Testing the guesses against the Carnegie Mellon
31 |       # Pronouncing Dictionary, this module guesses right around 90% of the
32 |       # time, as against about 85% of the time for the Perl module. However, the
33 |       # dictionary contains a large number of foreign loan words and proper
34 |       # names, and so when the algorithm is tested against 'real world' English,
35 |       # its accuracy is a good deal better. Testing against a range of samples,
36 |       # it guesses right about 95-97% of the time.
37 |       module Guess
38 |         # special cases - 1 syllable less than expected
39 |         SubSyl = [
40 |           /[^aeiou]e$/, # give, love, bone, done, ride ...
41 |           /[aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/,
42 |           # (passive) past participles and 3rd person sing present verbs:
43 |           # bared, liked, called, tricked, bashed, matched
44 | 
45 |           /.e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/,
46 |           # nominal, adjectival and adverbial derivatives from -e$ roots:
47 |           # absolutely, nicely, likeness, basement, hopeless
48 |           # hopeful, tastefully, wasteful
49 | 
50 |           /ion/, # action, diction, fiction
51 |           /[ct]ia[nl]/, # special(ly), initial, physician, christian
52 |           /[^cx]iou/, # illustrious, NOT spacious, gracious, anxious, noxious
53 |           /sia$/, # amnesia, polynesia
54 |           /.gue$/ # dialogue, intrigue, colleague
55 |         ] unless defined?(SubSyl)
56 | 
57 |         # special cases - 1 syllable more than expected
58 |         AddSyl = [
59 |           /i[aiou]/, # alias, science, phobia
60 |           /[dls]ien/, # salient, gradient, transient
61 |           /[aeiouym]ble$/, # -Vble, plus -mble
62 |           /[aeiou]{3}/, # agreeable
63 |           /^mc/, # mcwhatever
64 |           /ism$/, # sexism, racism
65 |           /(?:([^aeiouy])\1|ck|mp|ng)le$/, # bubble, cattle, cackle, sample, angle
66 |           /dnt$/, # couldn't (the apostrophe is deleted before matching)
67 |           /[aeiou]y[aeiou]/ # annoying, layer
68 |         ] unless defined?(AddSyl)
69 | 
70 |         # special cases not actually used - these seem to me to be either very
71 |         # marginal or actually break more stuff than they fix
72 |         NotUsed = [
73 |           /^coa[dglx]./, # +1 coagulate, coaxial, coalition, coalesce - marginal
74 |           /[^gq]ua[^auieo]/, # +1 'du-al' - only for some speakers, and breaks
75 |           /riet/, # variety, parietal, notoriety - marginal?
76 |         ] unless defined?(NotUsed)
77 | 
78 |         def self.syllables(word)
79 |           return 1 if word.length == 1 # a single character always counts as one syllable
80 |           word = word.downcase.delete("'") # match case-insensitively and without apostrophes
81 | 
82 |           syllables = word.scan(/[aeiouy]+/).length # each run of vowels (including y) counts once
83 | 
84 |           # special cases
85 |           for pat in SubSyl # every matching pattern removes one syllable
86 |             syllables -= 1 if pat.match(word)
87 |           end
88 |           for pat in AddSyl # every matching pattern adds one syllable
89 |             syllables += 1 if pat.match(word)
90 |           end
91 | 
92 |           syllables = 1 if syllables < 1 # no vowels?
93 |           syllables
94 |         end
95 |       end
96 |     end
97 |   end
98 | end
99 | 


--------------------------------------------------------------------------------
/lib/lingua/en/readability.rb:
--------------------------------------------------------------------------------
  1 | module Lingua
  2 |   module EN
  3 |     # The class Lingua::EN::Readability takes English text and analyses formal
  4 |     # characteristic
  5 |     class Readability
  6 |       attr_reader :text, :paragraphs, :sentences, :words, :frequencies
  7 | 
  8 |       # The constructor accepts the text to be analysed, and returns a report
  9 |       # object which gives access to the
 10 |       def initialize(text)
 11 |         @text                = text.dup
 12 |         @paragraphs          = Lingua::EN::Paragraph.paragraphs(self.text)
 13 |         @sentences           = Lingua::EN::Sentence.sentences(self.text)
 14 |         @words               = []
 15 |         @frequencies         = {}
 16 |         @frequencies.default = 0
 17 |         @syllables           = 0
 18 |         @complex_words       = 0
 19 |         count_words
 20 |       end
 21 | 
 22 |       # The number of paragraphs in the sample. A paragraph is defined as a
 23 |       # newline followed by one or more empty or whitespace-only lines.
 24 |       def num_paragraphs
 25 |         paragraphs.length
 26 |       end
 27 | 
 28 |       # The number of sentences in the sample. The meaning of a "sentence" is
 29 |       # defined by Lingua::EN::Sentence.
 30 |       def num_sentences
 31 |         sentences.length
 32 |       end
 33 | 
 34 |       # The number of characters in the sample.
 35 |       def num_chars
 36 |         text.length
 37 |       end
 38 |       alias :num_characters :num_chars
 39 | 
 40 |       # The total number of words used in the sample. Numbers as digits are not
 41 |       # counted.
 42 |       def num_words
 43 |         words.length
 44 |       end
 45 | 
 46 |       # The total number of syllables in the text sample. Just for completeness.
 47 |       def num_syllables
 48 |         @syllables
 49 |       end
 50 | 
 51 |       # The number of different unique words used in the text sample.
 52 |       def num_unique_words
 53 |         @frequencies.keys.length
 54 |       end
 55 | 
 56 |       # An array containing each unique word used in the text sample.
 57 |       def unique_words
 58 |         @frequencies.keys
 59 |       end
 60 | 
 61 |       # The number of occurences of the word +word+ in the text sample.
 62 |       def occurrences(word)
 63 |         @frequencies[word]
 64 |       end
 65 | 
 66 |       # The average number of words per sentence.
 67 |       def words_per_sentence
 68 |         words.length.to_f / sentences.length.to_f
 69 |       end
 70 | 
 71 |       # The average number of syllables per word. The syllable count is
 72 |       # performed by Lingua::EN::Syllable, and so may not be completely
 73 |       # accurate, especially if the Carnegie-Mellon Pronouncing Dictionary
 74 |       # is not installed.
 75 |       def syllables_per_word
 76 |         @syllables.to_f / words.length.to_f
 77 |       end
 78 | 
 79 |       # Flesch-Kincaid level of the text sample. This measure scores text based
 80 |       # on the American school grade system; a score of 7.0 would indicate that
 81 |       # the text is readable by a seventh grader. A score of 7.0 to 8.0 is
 82 |       # regarded as optimal for ordinary text.
 83 |       def kincaid
 84 |         (11.8 * syllables_per_word) +  (0.39 * words_per_sentence) - 15.59
 85 |       end
 86 | 
 87 |       # Flesch reading ease of the text sample. A higher score indicates text
 88 |       # that is easier to read. The score is on a 100-point scale, and a score
 89 |       # of 60-70 is regarded as optimal for ordinary text.
 90 |       def flesch
 91 |         206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
 92 |       end
 93 | 
 94 |       # The Gunning Fog Index of the text sample. The index indicates the number
 95 |       # of years of formal education that a reader of average intelligence would
 96 |       # need to comprehend the text. A higher score indicates harder text; a
 97 |       # value of around 12 is indicated as ideal for ordinary text.
 98 |       def fog
 99 |         ( words_per_sentence +  percent_fog_complex_words ) * 0.4
100 |       end
101 | 
102 |       # The percentage of words that are defined as "complex" for the purpose of
103 |       # the Fog Index. This is non-hyphenated words of three or more syllabes.
104 |       def percent_fog_complex_words
105 |         ( @complex_words.to_f / words.length.to_f ) * 100
106 |       end
107 | 
108 |       # Return a nicely formatted report on the sample, showing most the useful
109 |       # statistics about the text sample.
110 |       def report
111 |         sprintf "Number of paragraphs           %d \n" <<
112 |         "Number of sentences            %d \n" <<
113 |         "Number of words                %d \n" <<
114 |         "Number of characters           %d \n\n" <<
115 |         "Average words per sentence     %.2f \n" <<
116 |         "Average syllables per word     %.2f \n\n" <<
117 |         "Flesch score                   %2.2f \n" <<
118 |         "Flesh-Kincaid grade level      %2.2f \n" <<
119 |         "Fog Index                      %2.2f \n",
120 |           num_paragraphs, num_sentences, num_words, num_characters,
121 |           words_per_sentence, syllables_per_word,
122 |           flesch, kincaid, fog
123 |       end
124 | 
125 |       private
126 |       def count_words
127 |         @text.scan(/\b([a-z][a-z\-']*)\b/i).each do |match|
128 |           word = match[0]
129 |           @words << word
130 | 
131 |           # up frequency counts
132 |           @frequencies[word] += 1
133 | 
134 |           # syllable counts
135 |           syllables = Lingua::EN::Syllable.syllables(word)
136 |           @syllables += syllables
137 |           if syllables > 2 && !word.include?('-')
138 |             @complex_words += 1 # for Fog Index
139 |           end
140 |         end
141 |       end
142 |     end
143 |   end
144 | end
145 | 


--------------------------------------------------------------------------------
/spec/lingua/en/sentence_spec.rb:
--------------------------------------------------------------------------------
  1 | require File.dirname(__FILE__) + "/../../spec_helper"
  2 | 
  3 | describe Lingua::EN::Sentence do
  4 |   klass = Lingua::EN::Sentence
  5 |   
  6 |   describe "#sentences" do
  7 |     describe "multi-paragraph text" do
  8 |       before(:each) do
  9 |         text = "As Milton Bradley once said, \"board games are the shit.\" And I'm inclined to agree. \"Why can't we be friends?\"\n\n"
 10 |         text << "Visit http://www.google.com and check out my site. Thanks very much!"
 11 |         @sentences = klass.sentences(text)
 12 |       end
 13 |   
 14 |       it "should get the correct number of sentences" do
 15 |         @sentences.should have(5).things
 16 |       end
 17 |   
 18 |       it "should get the correct sentences" do
 19 |         @sentences[0].should == "As Milton Bradley once said, \"board games are the shit.\""
 20 |         @sentences[1].should == "And I'm inclined to agree."
 21 |         @sentences[2].should == "\"Why can't we be friends?\""
 22 |         @sentences[3].should == "Visit http://www.google.com and check out my site."
 23 |         @sentences[4].should == "Thanks very much!"
 24 |       end
 25 |     end
 26 |   
 27 |     describe "quoted sentences" do
 28 |       before(:each) do
 29 |         text = "As Milton Bradley once said, \"board games are the shit.\" And I'm inclined to agree. \"Why can't we be friends?\""
 30 |         @sentences = klass.sentences(text)
 31 |       end
 32 |   
 33 |       it "should get the correct number of sentences" do
 34 |         @sentences.should have(3).things
 35 |       end
 36 |   
 37 |       it "should get the correct sentences" do
 38 |         @sentences[0].should == "As Milton Bradley once said, \"board games are the shit.\""
 39 |         @sentences[1].should == "And I'm inclined to agree."
 40 |         @sentences[2].should == "\"Why can't we be friends?\""
 41 |       end
 42 |     end
 43 |   
 44 |     describe "ellipses correction" do
 45 |       before(:each) do
 46 |         text = "Well... why would you do that? Let's not fight."
 47 |         @sentences = klass.sentences(text)
 48 |       end
 49 |   
 50 |       it "should get the correct number of sentences" do
 51 |         @sentences.should have(2).things
 52 |       end
 53 |   
 54 |       it "should get the right sentences" do
 55 |         @sentences[0].should == "Well... why would you do that?"
 56 |         @sentences[1].should == "Let's not fight."
 57 |       end
 58 |     end
 59 |   
 60 |     describe "simple URL matching" do
 61 |       before(:each) do
 62 |         text = "Hello, visit http://www.google.com/index.php?ok=ok for more info. Ok?"
 63 |         @sentences = klass.sentences(text)
 64 |       end
 65 |   
 66 |       it "should get the correct number of sentences" do
 67 |         @sentences.should have(2).things
 68 |       end
 69 |   
 70 |       it "should get the right sentences" do
 71 |         @sentences[0].should == "Hello, visit http://www.google.com/index.php?ok=ok for more info."
 72 |         @sentences[1].should == "Ok?"
 73 |       end
 74 |     end
 75 |   
 76 |     describe "ending a sentence with an abbreviation" do
 77 |       before(:each) do
 78 |         text = "I was born in the U.S.S.R. My parents were from the U.S. This is not weird."
 79 |         @sentences = klass.sentences(text)
 80 |       end
 81 |   
 82 |       it "should get the correct number of sentences" do
 83 |         @sentences.should have(3).things
 84 |       end
 85 |   
 86 |       it "should get the correct sentences" do
 87 |         @sentences[0].should == "I was born in the U.S.S.R."
 88 |         @sentences[1].should == "My parents were from the U.S."
 89 |         @sentences[2].should == "This is not weird."
 90 |       end
 91 |       
 92 |       describe "which is hard-coded (like st, dr, mrs...)" do
 93 |         before(:each) do
 94 |           text = "This is a test. The word 'test' ends with the abbreviation for 'street'. This should still be three sentences."
 95 |           @sentences = klass.sentences(text)
 96 |         end
 97 |         
 98 |         it "should have the correct number of sentences" do
 99 |           @sentences.should have(3).things
100 |         end
101 |         
102 |         it "should get the correct sentences" do
103 |           @sentences[0].should == "This is a test."
104 |           @sentences[1].should == "The word 'test' ends with the abbreviation for 'street'."
105 |           @sentences[2].should == "This should still be three sentences."
106 |         end
107 |       end
108 |     end
109 |     
110 |     describe "basic sentences" do
111 |       before(:each) do
112 |         text = "Hello, my name is David. What is your name?"
113 |         @sentences = klass.sentences(text)
114 |       end
115 |   
116 |       it "should get the correct number of sentences" do
117 |         @sentences.should have(2).things
118 |       end
119 |     end
120 |   
121 |     describe "short sentences w/ line breaks" do
122 |       before(:each) do # <<- keeps the body's leading whitespace; only the EOF marker may be indented
123 |         @doc = <<-EOF
124 |         So how does the 401(k) plan work?  Let's see -
125 |   
126 |         The 401(k) consists of - first, asking your employer to set aside a portion (upto 15% of your total income) in keeping with the plan.
127 |         EOF
128 |         @sentences = klass.sentences(@doc)
129 |       end
130 |   
131 |       it "should find 3 sentences" do
132 |         @sentences.should have(3).things
133 |       end
134 |   
135 |       it "should stop at line breaks" do # "Let's see -" has no terminal punctuation; the line break ends it
136 |         @sentences[1].should == "Let's see -"
137 |       end
138 |     end
139 |   
140 |     describe "sentences with URLs and abbreviation" do
141 |       before(:each) do
142 |         text = "Many of these leading names now have their own website, e.g.  http://www.kaptest.com/. Hello, e.g. you don't know what you mean. I'm so angry about what you said about the U.S.A. or the u.S. or the U.S.S.R. ok."
143 |         @sentences = klass.sentences(text)
144 |       end
145 |   
146 |       it "should get the correct number of sentences" do
147 |         @sentences[0].should == "Many of these leading names now have their own website, e.g.  http://www.kaptest.com/."
148 |         @sentences[1].should == "Hello, e.g. you don't know what you mean."
149 |         @sentences[2].should == "I'm so angry about what you said about the U.S.A. or the u.S. or the U.S.S.R. ok."
150 |         @sentences.should have(3).things
151 |       end
152 |     end
153 |   end
154 |   
155 |   describe "#abbreviation" do
156 |     it "should change the abbreviations list" do
157 |       klass.abbreviation('monkey', 'pig')
158 |       klass.abbreviations.should include('monkey')
159 |       klass.abbreviations.should include('pig')
160 |     end
161 |   
162 |     it "should change the regex for abbreviations" do
163 |       lambda {
164 |         klass.abbreviation('monkey')
165 |       }.should change(klass, :abbr_regex)
166 |     end
167 |   
168 |     after(:each) do
169 |       klass.initialize_abbreviations!
170 |     end
171 |   end
172 | 
173 | end
174 | 


--------------------------------------------------------------------------------