├── VERSION ├── spec ├── spec.opts ├── spec_helper.rb └── lingua │ └── en │ ├── paragraph_spec.rb │ ├── readability_spec.rb │ └── sentence_spec.rb ├── .document ├── lib ├── lingua.rb └── lingua │ └── en │ ├── paragraph.rb │ ├── syllable.rb │ ├── sentence.rb │ ├── syllable │ └── guess.rb │ └── readability.rb ├── CHANGELOG.markdown ├── .gitignore ├── README.rdoc ├── LICENSE ├── Rakefile └── lingua.gemspec /VERSION: -------------------------------------------------------------------------------- 1 | 0.6.2 2 | -------------------------------------------------------------------------------- /spec/spec.opts: -------------------------------------------------------------------------------- 1 | --color 2 | -------------------------------------------------------------------------------- /.document: -------------------------------------------------------------------------------- 1 | README.rdoc 2 | lib/**/*.rb 3 | bin/* 4 | features/**/*.feature 5 | LICENSE 6 | -------------------------------------------------------------------------------- /lib/lingua.rb: -------------------------------------------------------------------------------- 1 | prefix = File.dirname(__FILE__) + "/" 2 | $LOAD_PATH.unshift prefix 3 | 4 | Dir.glob(prefix + "**/*.rb").each do |f| 5 | require File.expand_path(f) 6 | end 7 | -------------------------------------------------------------------------------- /CHANGELOG.markdown: -------------------------------------------------------------------------------- 1 | Current master 2 | ------------- 3 | * Added more specs for readability [stuartellis, GH-1] 4 | 5 | 0.6.2 6 | ----- 7 | * Fix for sentences that end in abbreviations (e.g. 
dr, mrs, ms) [chad] 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## MAC OS 2 | .DS_Store 3 | 4 | ## TEXTMATE 5 | *.tmproj 6 | tmtags 7 | 8 | ## EMACS 9 | *~ 10 | \#* 11 | .\#* 12 | 13 | ## VIM 14 | *.swp 15 | 16 | ## PROJECT::GENERAL 17 | coverage 18 | rdoc 19 | pkg 20 | 21 | ## PROJECT::SPECIFIC 22 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 2 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 3 | require 'lingua' 4 | require 'spec' 5 | require 'spec/autorun' 6 | 7 | Spec::Runner.configure do |config| 8 | 9 | end 10 | -------------------------------------------------------------------------------- /lib/lingua/en/paragraph.rb: -------------------------------------------------------------------------------- 1 | module Lingua 2 | module EN 3 | module Paragraph 4 | # Splits text into an array of paragraphs. 5 | def self.paragraphs(text) 6 | text.strip.split(/(?:\n[\r\t ]*)+/).collect { |p| p.strip } 7 | end 8 | end 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = lingua 2 | 3 | This library is originally from http://pressure.to/ruby, by Alex Fenton . 4 | 5 | It is currently maintained by David Balatero . 6 | 7 | Slowly but surely, specs are being added (original codebase was not tested), as well as better functionality. 8 | 9 | == Note on Patches/Pull Requests 10 | 11 | * Fork the project. 12 | * Make your feature addition or bug fix. 13 | * Add tests for it. This is important so I don't break it in a 14 | future version unintentionally. 
15 | * Commit, do not mess with rakefile, version, or history. 16 | (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull) 17 | * Send me a pull request. Bonus points for topic branches. 18 | 19 | == Copyright 20 | 21 | Copyright (c) 2010 David Balatero. See LICENSE for details. 22 | -------------------------------------------------------------------------------- /spec/lingua/en/paragraph_spec.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + "/../../spec_helper" 2 | 3 | describe Lingua::EN::Paragraph do 4 | describe "#paragraphs" do 5 | it "should return paragraphs with extra whitespace in the line breaks" do 6 | text = "Ok.\n \nTest." 7 | result = Lingua::EN::Paragraph.paragraphs(text) 8 | result.should have(2).things 9 | result[0].should == "Ok." 10 | result[1].should == "Test." 11 | end 12 | 13 | it "should break up paragraphs with > 2 line breaks" do 14 | text = "Ok.\n\n\nTest." 15 | result = Lingua::EN::Paragraph.paragraphs(text) 16 | result.should have(2).things 17 | result[0].should == "Ok." 18 | result[1].should == "Test." 19 | end 20 | 21 | it "should ignore trailing newline chars" do 22 | text = "Ok.\n \n\nTest.\n \r\n \n\n" 23 | result = Lingua::EN::Paragraph.paragraphs(text) 24 | result.should have(2).things 25 | result[0].should == "Ok." 26 | result[1].should == "Test." 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /lib/lingua/en/syllable.rb: -------------------------------------------------------------------------------- 1 | require 'lingua/en/syllable/guess' 2 | 3 | module Lingua 4 | module EN 5 | # The module Lingua::EN::Syllable contains a single class method, 6 | # +syllable+, which will use the most accurate technique available to 7 | # determine the number syllables in a string containing a word passed to it. 
8 | # 9 | ########## REMOVED BY dbalatero: 10 | # The exact definition of the function depends on the availability of the 11 | # Carnegie Mellon Pronouncing Dictionary on the system. If it is available, 12 | # the number of syllables as determined by the dictionary will be returned. 13 | # If the dictionary is not available, or if a word not contained in the 14 | # dictionary is passed, it will return the number of syllables as determined 15 | # by the module Lingua::EN::Syllable::Guess. For more details, see there and 16 | # Lingua::EN::Syllable::Dictionary. 17 | module Syllable 18 | def self.syllables(word) 19 | Guess::syllables word 20 | end 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009 David Balatero 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'rake' 3 | 4 | begin 5 | require 'jeweler' 6 | Jeweler::Tasks.new do |gem| 7 | gem.name = "lingua" 8 | gem.summary = %Q{This is a maintained version of Ruby's Lingua port.} 9 | gem.description = %Q{Provides sentence splitting, syllable, and text-quality algorithms.} 10 | gem.email = "dbalatero@gmail.com" 11 | gem.homepage = "http://github.com/dbalatero/lingua" 12 | gem.authors = ["David Balatero"] 13 | gem.add_development_dependency "rspec", ">= 1.2.9" 14 | # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings 15 | end 16 | Jeweler::GemcutterTasks.new 17 | rescue LoadError 18 | puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler" 19 | end 20 | 21 | require 'spec/rake/spectask' 22 | Spec::Rake::SpecTask.new(:spec) do |spec| 23 | spec.libs << 'lib' << 'spec' 24 | spec.spec_files = FileList['spec/**/*_spec.rb'] 25 | end 26 | 27 | Spec::Rake::SpecTask.new(:rcov) do |spec| 28 | spec.libs << 'lib' << 'spec' 29 | spec.pattern = 'spec/**/*_spec.rb' 30 | spec.rcov = true 31 | end 32 | 33 | task :spec => :check_dependencies 34 | 35 | task :default => :spec 36 | 37 | require 'rake/rdoctask' 38 | Rake::RDocTask.new do |rdoc| 39 | version = File.exist?('VERSION') ? 
File.read('VERSION') : "" 40 | 41 | rdoc.rdoc_dir = 'rdoc' 42 | rdoc.title = "lingua #{version}" 43 | rdoc.rdoc_files.include('README*') 44 | rdoc.rdoc_files.include('lib/**/*.rb') 45 | end 46 | -------------------------------------------------------------------------------- /lingua.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command 4 | # -*- encoding: utf-8 -*- 5 | 6 | Gem::Specification.new do |s| 7 | s.name = %q{lingua} 8 | s.version = "0.6.2" 9 | 10 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= 11 | s.authors = ["David Balatero"] 12 | s.date = %q{2010-07-25} 13 | s.description = %q{Provides sentence splitting, syllable, and text-quality algorithms.} 14 | s.email = %q{dbalatero@gmail.com} 15 | s.extra_rdoc_files = [ 16 | "LICENSE", 17 | "README.rdoc" 18 | ] 19 | s.files = [ 20 | ".document", 21 | ".gitignore", 22 | "CHANGELOG.markdown", 23 | "LICENSE", 24 | "README.rdoc", 25 | "Rakefile", 26 | "VERSION", 27 | "lib/lingua.rb", 28 | "lib/lingua/en/paragraph.rb", 29 | "lib/lingua/en/readability.rb", 30 | "lib/lingua/en/sentence.rb", 31 | "lib/lingua/en/syllable.rb", 32 | "lib/lingua/en/syllable/guess.rb", 33 | "lingua.gemspec", 34 | "spec/lingua/en/paragraph_spec.rb", 35 | "spec/lingua/en/readability_spec.rb", 36 | "spec/lingua/en/sentence_spec.rb", 37 | "spec/spec.opts", 38 | "spec/spec_helper.rb" 39 | ] 40 | s.homepage = %q{http://github.com/dbalatero/lingua} 41 | s.rdoc_options = ["--charset=UTF-8"] 42 | s.require_paths = ["lib"] 43 | s.rubygems_version = %q{1.3.6} 44 | s.summary = %q{This is a maintained version of Ruby's Lingua port.} 45 | s.test_files = [ 46 | "spec/lingua/en/paragraph_spec.rb", 47 | "spec/lingua/en/readability_spec.rb", 48 | "spec/lingua/en/sentence_spec.rb", 49 | "spec/spec_helper.rb" 50 | ] 51 | 52 | if 
s.respond_to? :specification_version then 53 | current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION 54 | s.specification_version = 3 55 | 56 | if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then 57 | s.add_development_dependency(%q, [">= 1.2.9"]) 58 | else 59 | s.add_dependency(%q, [">= 1.2.9"]) 60 | end 61 | else 62 | s.add_dependency(%q, [">= 1.2.9"]) 63 | end 64 | end 65 | 66 | -------------------------------------------------------------------------------- /lib/lingua/en/sentence.rb: -------------------------------------------------------------------------------- 1 | module Lingua 2 | module EN 3 | # The class Lingua::EN::Sentence takes English text, and attempts to 4 | # split it up into sentences, respecting abbreviations. 5 | 6 | class Sentence 7 | class << self 8 | attr_reader :abbreviations 9 | attr_reader :abbr_regex 10 | end 11 | 12 | EOS = "\001" unless defined?(EOS) # temporary end of sentence marker 13 | 14 | Titles = [ 'jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'sen', 'rep', 15 | 'rev', 'gov', 'atty', 'supt', 'det', 'rev', 'col','gen', 'lt', 16 | 'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj' ] unless defined?(Titles) 17 | Entities = [ 'dept', 'univ', 'uni', 'assn', 'bros', 'inc', 'ltd', 'co', 18 | 'corp', 'plc' ] unless defined?(Entities) 19 | Months = [ 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 20 | 'aug', 'sep', 'oct', 'nov', 'dec', 'sept' ] unless defined?(Months) 21 | Days = [ 'mon', 'tue', 'wed', 'thu', 22 | 'fri', 'sat', 'sun' ] unless defined?(Days) 23 | Misc = [ 'vs', 'etc', 'no', 'esp', 'cf' ] unless defined?(Misc) 24 | Streets = [ 'ave', 'bld', 'blvd', 'cl', 'ct', 25 | 'cres', 'dr', 'rd', 'st' ] unless defined?(Streets) 26 | 27 | 28 | # Finds abbreviations, like e.g., i.e., U.S., u.S., U.S.S.R. 29 | ABBR_DETECT = /(?:\s(?:(?:(?:\w\.){2,}\w?)|(?:\w\.\w)))/ unless defined?(ABBR_DETECT) 30 | 31 | # Finds punctuation that ends paragraphs. 
# More precisely, this matches a candidate sentence terminator: ".", "?" or
# "!" (or a run of line breaks), optionally followed by one closing quote or
# bracket (capture 1), and then the whitespace that follows it (capture 2).
PUNCTUATION_DETECT = /((?:[\.?!]|[\r\n]+)(?:\"|\'|\)|\]|\})?)(\s+)/ unless defined?(PUNCTUATION_DETECT)

# Matches an EOS marker that was placed directly after a dotted abbreviation
# (see ABBR_DETECT) when the following word starts with a lower-case letter
# or a digit. Capture 1 is the abbreviation including its leading
# whitespace, capture 2 is the text after the marker.
CORRECT_ABBR = /(#{ABBR_DETECT})#{EOS}(\s+[a-z0-9])/ unless defined?(CORRECT_ABBR)

# Split the passed text into individual sentences, trim these and return
# them as an array. A sentence is marked by one of the punctuation marks
# ".", "?" or "!" followed by whitespace. Sequences of full stops (such as
# an ellipsis marker "...") and stops after a known abbreviation are
# ignored.
#
# text - the String to split; it is not modified.
#
# Returns an Array of non-empty, stripped sentence Strings.
def self.sentences(text)
  # Work on a copy: every step below edits the string in place with #gsub!.
  text = text.dup

  # Mark each candidate sentence end with the EOS marker. The trailing
  # whitespace is kept after the marker so that the correction steps can
  # still inspect what follows.
  text.gsub!(PUNCTUATION_DETECT) { "#{$1}#{EOS}#{$2}" }

  # Two or more consecutive full stops (ellipses) do not end a sentence.
  text.gsub!(/(\.\.\.*)#{EOS}/) { $1 }

  # Dotted abbreviations such as "e.g." or "i.e." followed by a lower-case
  # word or a digit do not end a sentence either.
  text.gsub!(CORRECT_ABBR, "\\1\\2")

  # Known abbreviations: remove only the EOS marker and keep the rest of
  # the match. Rebuilding the match from the captured abbreviation alone
  # would drop the whitespace in front of it and glue it to the previous
  # word.
  text.gsub!(@abbr_regex) { |match| match.chomp(EOS) }

  # Split on the remaining markers, trim, and discard empty fragments.
  text.split(EOS).
    map { |sentence| sentence.strip }.
    reject { |sentence| sentence.empty? }
end

# Adds the given abbreviations (written without the trailing full stop) to
# the list that is used to detect false sentence ends, and rebuilds the
# matching regex.
#
# Returns the complete, duplicate-free list of abbreviations in use.
def self.abbreviation(*abbreviations)
  @abbreviations |= abbreviations
  set_abbr_regex!
  @abbreviations
end

# Resets the abbreviation list to the built-in defaults and rebuilds the
# matching regex. Entries that occur in more than one built-in list are
# kept only once.
def self.initialize_abbreviations!
  @abbreviations = (Titles + Entities + Months + Days + Streets + Misc).uniq
  set_abbr_regex!
end

# Rebuilds the regex that finds an EOS marker placed after a known
# abbreviation. The abbreviation has to start at the beginning of the text
# or right after whitespace, so a word that merely ends in an abbreviation
# ("test." vs. "st.") is not affected. Abbreviations are escaped so that
# they are always matched literally.
def self.set_abbr_regex!
  alternatives = abbreviations.map { |abbr| Regexp.escape(abbr.to_s) }.join("|")
  @abbr_regex = /(?:\A|\s)(#{alternatives})\.#{EOS}/i
end

# Install the default abbreviation list when the class is loaded.
initialize_abbreviations!
85 | end 86 | end 87 | end 88 | -------------------------------------------------------------------------------- /spec/lingua/en/readability_spec.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + "/../../spec_helper" 2 | 3 | describe Lingua::EN::Readability do 4 | before(:each) do 5 | @text = <<-EOF 6 | After marriage, the next big event in the couples lives will be their honeymoon. It is a time when the newly weds can get away from relatives and friends to spend some significant time getting to know one another. This time alone together that the couple shares is called the honeymoon. A great gift idea for the married couple would be to give them a surprise tour package. Most women would like to go on a honeymoon. 7 | 8 | The week or two before the ceremonies would be the best time to schedule a tour because then the budget for this event could be considered. In winter there are more opportunities for the couple to get close to one another because of the cold weather. It is easier to snuggle when the weather is not favorable to outdoor activities. This would afford the couple ample time to know more about themselves during the honeymoon. 9 | 10 | Honeymoon plans should be discussed with the wife to ensure that the shock is pleasant and not a negative experience to her. It is also a good idea in this case, to ask her probing questions as to where she would like to go. Perhaps you could get a friend or family member to ask her what would be her favorite travel location. That would ensure that you know just what she is looking for. 11 | 12 | Make sure that the trip is exactly what she wants. Then on the wedding night tell her about the adventure so that the needed accommodations can be made. 
13 | EOF 14 | 15 | @report = Lingua::EN::Readability.new(@text) 16 | end 17 | 18 | describe "#flesch" do 19 | it "should be the correct Flesch Reading Ease" do 20 | @report.flesch.should be_close(71.471, 0.001) 21 | end 22 | end 23 | 24 | describe "#fog" do 25 | it "should be the correct Gunning Fog Index" do 26 | @report.fog.should be_close(10.721, 0.001) 27 | end 28 | end 29 | 30 | describe "#kincaid" do 31 | it "should be the correct Flesch-Kincaid grade level" do 32 | @report.kincaid.should be_close(7.5, 0.1) 33 | end 34 | end 35 | 36 | describe "#num_chars" do 37 | it "should be the correct count of characters" do 38 | @report.num_chars.should == 1405 39 | end 40 | end 41 | 42 | describe "#num_paragraphs" do 43 | it "should return the correct count of paragraphs" do 44 | @report.num_paragraphs.should == 4 45 | end 46 | end 47 | 48 | describe "#num_sentences" do 49 | it "should be the correct count of sentences" do 50 | @report.num_sentences.should == 15 51 | end 52 | end 53 | 54 | describe "#num_syllables" do 55 | it "should be the correct count of syllables" do 56 | @report.num_syllables.should == 356 57 | end 58 | end 59 | 60 | describe "#num_unique_words" do 61 | it "should be the correct count of unique words" do 62 | @report.num_unique_words.should == 141 63 | end 64 | end 65 | 66 | describe "#num_words" do 67 | it "should be the correct count of words" do 68 | @report.num_words.should == 255 69 | end 70 | end 71 | 72 | describe "#occurrences" do 73 | it "should return the correct count of occurrences of the word 'the'" do 74 | @report.occurrences('the').should == 20 75 | end 76 | end 77 | 78 | describe "#percent_fog_complex_words" do 79 | it "should be the correct percentage of complex words according to Fog Index" do 80 | @report.percent_fog_complex_words.should be_close(9.803, 0.001) 81 | end 82 | end 83 | 84 | describe "#syllables_per_word" do 85 | it "should be the correct average of syllables per word" do 86 | @report.syllables_per_word.should 
be_close(1.396, 0.001) 87 | end 88 | end 89 | 90 | describe "#unique_words" do 91 | it "should be an array of unique words" do 92 | unique_words = @report.unique_words 93 | unique_words.should be_a(Array) 94 | unique_words.length.should == 141 95 | end 96 | end 97 | 98 | describe "#words_per_sentence" do 99 | it "should be the correct count of words per sentence" do 100 | @report.words_per_sentence.should be_close(17.0, 0.001) 101 | end 102 | end 103 | 104 | end 105 | -------------------------------------------------------------------------------- /lib/lingua/en/syllable/guess.rb: -------------------------------------------------------------------------------- 1 | module Lingua 2 | module EN 3 | module Syllable 4 | # Uses English word patterns to guess the number of syllables. A single 5 | # module method is made available, +syllables+, which, when passed an 6 | # English word, will return the number of syllables it estimates are in 7 | # the word. 8 | # 9 | # English orthography (the representation of spoken sounds as written 10 | # signs) is not regular. The same spoken sound can be represented in 11 | # multiple different ways in written English (e.g. rough/cuff), and the 12 | # same written letters can be pronounced in different ways in different 13 | # words (e.g. rough/bough). 14 | # 15 | # As the same series of letters can be pronounced in different ways, it is 16 | # not possible to write an algorithm which can always guess the number of 17 | # syllables in an english word correctly. However, it is possible to use 18 | # frequently recurring patterns in english (such as "a final -e is usually 19 | # silent") to guess with a level of accuracy that is acceptable for 20 | # applications like syllable counting for readability scoring. This module 21 | # implements such an algorithm. 22 | # 23 | # This module is inspired by the Perl Lingua::EN::Syllable module. 
24 | # However, it uses a different (though not larger) set of patterns to 25 | # compensate for the 'special cases' which arise out of English's 26 | # irregular orthography. A number of extra patterns (particularly for 27 | # derived word forms) means that this module is somewhat more accurate 28 | # than the Perl original. It also omits a number of patterns found in the 29 | # original which seem to me to apply to such a small number of cases, or 30 | # to be of dubious value. Testing the guesses against the Carnegie Mellon 31 | # Pronouncing Dictionary, this module guesses right around 90% of the 32 | # time, as against about 85% of the time for the Perl module. However, the 33 | # dictionary contains a large number of foreign loan words and proper 34 | # names, and so when the algorithm is tested against 'real world' english, 35 | # its accuracy is a good deal better. Testing against a range of samples, 36 | # it guesses right about 95-97% of the time. 37 | module Guess 38 | # special cases - 1 syllable less than expected 39 | SubSyl = [ 40 | /[^aeiou]e$/, # give, love, bone, done, ride ... 
41 | /[aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/, 42 | # (passive) past participles and 3rd person sing present verbs: 43 | # bared, liked, called, tricked, bashed, matched 44 | 45 | /.e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/, 46 | # nominal, adjectival and adverbial derivatives from -e$ roots: 47 | # absolutely, nicely, likeness, basement, hopeless 48 | # hopeful, tastefully, wasteful 49 | 50 | /ion/, # action, diction, fiction 51 | /[ct]ia[nl]/, # special(ly), initial, physician, christian 52 | /[^cx]iou/, # illustrious, NOT spacious, gracious, anxious, noxious 53 | /sia$/, # amnesia, polynesia 54 | /.gue$/ # dialogue, intrigue, colleague 55 | ] unless defined?(SubSyl) 56 | 57 | # special cases - 1 syllable more than expected 58 | AddSyl = [ 59 | /i[aiou]/, # alias, science, phobia 60 | /[dls]ien/, # salient, gradient, transient 61 | /[aeiouym]ble$/, # -Vble, plus -mble 62 | /[aeiou]{3}/, # agreeable 63 | /^mc/, # mcwhatever 64 | /ism$/, # sexism, racism 65 | /(?:([^aeiouy])\1|ck|mp|ng)le$/, # bubble, cattle, cackle, sample, angle 66 | /dnt$/, # couldn/t 67 | /[aeiou]y[aeiou]/ # annoying, layer 68 | ] unless defined?(AddSyl) 69 | 70 | # special cases not actually used - these seem to me to be either very 71 | # marginal or actually break more stuff than they fix 72 | NotUsed = [ 73 | /^coa[dglx]./, # +1 coagulate, coaxial, coalition, coalesce - marginal 74 | /[^gq]ua[^auieo]/, # +1 'du-al' - only for some speakers, and breaks 75 | /riet/, # variety, parietal, notoriety - marginal? 76 | ] unless defined?(NotUsed) 77 | 78 | def self.syllables(word) 79 | return 1 if word.length == 1 80 | word = word.downcase.delete("'") 81 | 82 | syllables = word.scan(/[aeiouy]+/).length 83 | 84 | # special cases 85 | for pat in SubSyl 86 | syllables -= 1 if pat.match(word) 87 | end 88 | for pat in AddSyl 89 | syllables += 1 if pat.match(word) 90 | end 91 | 92 | syllables = 1 if syllables < 1 # no vowels? 
93 | syllables 94 | end 95 | end 96 | end 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /lib/lingua/en/readability.rb: -------------------------------------------------------------------------------- 1 | module Lingua 2 | module EN 3 | # The class Lingua::EN::Readability takes English text and analyses formal 4 | # characteristic 5 | class Readability 6 | attr_reader :text, :paragraphs, :sentences, :words, :frequencies 7 | 8 | # The constructor accepts the text to be analysed, and returns a report 9 | # object which gives access to the 10 | def initialize(text) 11 | @text = text.dup 12 | @paragraphs = Lingua::EN::Paragraph.paragraphs(self.text) 13 | @sentences = Lingua::EN::Sentence.sentences(self.text) 14 | @words = [] 15 | @frequencies = {} 16 | @frequencies.default = 0 17 | @syllables = 0 18 | @complex_words = 0 19 | count_words 20 | end 21 | 22 | # The number of paragraphs in the sample. A paragraph is defined as a 23 | # newline followed by one or more empty or whitespace-only lines. 24 | def num_paragraphs 25 | paragraphs.length 26 | end 27 | 28 | # The number of sentences in the sample. The meaning of a "sentence" is 29 | # defined by Lingua::EN::Sentence. 30 | def num_sentences 31 | sentences.length 32 | end 33 | 34 | # The number of characters in the sample. 35 | def num_chars 36 | text.length 37 | end 38 | alias :num_characters :num_chars 39 | 40 | # The total number of words used in the sample. Numbers as digits are not 41 | # counted. 42 | def num_words 43 | words.length 44 | end 45 | 46 | # The total number of syllables in the text sample. Just for completeness. 47 | def num_syllables 48 | @syllables 49 | end 50 | 51 | # The number of different unique words used in the text sample. 52 | def num_unique_words 53 | @frequencies.keys.length 54 | end 55 | 56 | # An array containing each unique word used in the text sample. 
def unique_words
  @frequencies.keys
end

# The number of occurrences of the word +word+ in the text sample. The
# lookup is an exact, case-sensitive match against the words as they were
# found in the text; a word that never occurred yields 0.
def occurrences(word)
  @frequencies[word]
end

# The average number of words per sentence, as a Float. Returns 0.0 for a
# sample without sentences instead of dividing by zero.
def words_per_sentence
  return 0.0 if sentences.empty?

  words.length.to_f / sentences.length
end

# The average number of syllables per word, as a Float. The syllable count
# is performed by Lingua::EN::Syllable, which guesses from spelling
# patterns, and so may not be completely accurate. Returns 0.0 for a sample
# without words instead of dividing by zero.
def syllables_per_word
  return 0.0 if words.empty?

  @syllables.to_f / words.length
end

# Flesch-Kincaid level of the text sample. This measure scores text based
# on the American school grade system; a score of 7.0 would indicate that
# the text is readable by a seventh grader. A score of 7.0 to 8.0 is
# regarded as optimal for ordinary text.
def kincaid
  (11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59
end

# Flesch reading ease of the text sample. A higher score indicates text
# that is easier to read. The score is on a 100-point scale, and a score
# of 60-70 is regarded as optimal for ordinary text.
def flesch
  206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
end

# The Gunning Fog Index of the text sample. The index indicates the number
# of years of formal education that a reader of average intelligence would
# need to comprehend the text. A higher score indicates harder text; a
# value of around 12 is indicated as ideal for ordinary text.
def fog
  (words_per_sentence + percent_fog_complex_words) * 0.4
end

# The percentage of words that are defined as "complex" for the purpose of
# the Fog Index. These are non-hyphenated words of three or more syllables.
104 | def percent_fog_complex_words 105 | ( @complex_words.to_f / words.length.to_f ) * 100 106 | end 107 | 108 | # Return a nicely formatted report on the sample, showing most the useful 109 | # statistics about the text sample. 110 | def report 111 | sprintf "Number of paragraphs %d \n" << 112 | "Number of sentences %d \n" << 113 | "Number of words %d \n" << 114 | "Number of characters %d \n\n" << 115 | "Average words per sentence %.2f \n" << 116 | "Average syllables per word %.2f \n\n" << 117 | "Flesch score %2.2f \n" << 118 | "Flesh-Kincaid grade level %2.2f \n" << 119 | "Fog Index %2.2f \n", 120 | num_paragraphs, num_sentences, num_words, num_characters, 121 | words_per_sentence, syllables_per_word, 122 | flesch, kincaid, fog 123 | end 124 | 125 | private 126 | def count_words 127 | @text.scan(/\b([a-z][a-z\-']*)\b/i).each do |match| 128 | word = match[0] 129 | @words << word 130 | 131 | # up frequency counts 132 | @frequencies[word] += 1 133 | 134 | # syllable counts 135 | syllables = Lingua::EN::Syllable.syllables(word) 136 | @syllables += syllables 137 | if syllables > 2 && !word.include?('-') 138 | @complex_words += 1 # for Fog Index 139 | end 140 | end 141 | end 142 | end 143 | end 144 | end 145 | -------------------------------------------------------------------------------- /spec/lingua/en/sentence_spec.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + "/../../spec_helper" 2 | 3 | describe Lingua::EN::Sentence do 4 | klass = Lingua::EN::Sentence 5 | 6 | describe "#sentences" do 7 | describe "multi-paragraph text" do 8 | before(:each) do 9 | text = "As Milton Bradley once said, \"board games are the shit.\" And I'm inclined to agree. \"Why can't we be friends?\"\n\n" 10 | text << "Visit http://www.google.com and check out my site. Thanks very much!" 
11 | @sentences = klass.sentences(text) 12 | end 13 | 14 | it "should get the correct number of sentences" do 15 | @sentences.should have(5).things 16 | end 17 | 18 | it "should get the correct sentences" do 19 | @sentences[0].should == "As Milton Bradley once said, \"board games are the shit.\"" 20 | @sentences[1].should == "And I'm inclined to agree." 21 | @sentences[2].should == "\"Why can't we be friends?\"" 22 | @sentences[3].should == "Visit http://www.google.com and check out my site." 23 | @sentences[4].should == "Thanks very much!" 24 | end 25 | end 26 | 27 | describe "quoted sentences" do 28 | before(:each) do 29 | text = "As Milton Bradley once said, \"board games are the shit.\" And I'm inclined to agree. \"Why can't we be friends?\"" 30 | @sentences = klass.sentences(text) 31 | end 32 | 33 | it "should get the correct number of sentences" do 34 | @sentences.should have(3).things 35 | end 36 | 37 | it "should get the correct sentences" do 38 | @sentences[0].should == "As Milton Bradley once said, \"board games are the shit.\"" 39 | @sentences[1].should == "And I'm inclined to agree." 40 | @sentences[2].should == "\"Why can't we be friends?\"" 41 | end 42 | end 43 | 44 | describe "ellipses correction" do 45 | before(:each) do 46 | text = "Well... why would you do that? Let's not fight." 47 | @sentences = klass.sentences(text) 48 | end 49 | 50 | it "should get the correct number of sentences" do 51 | @sentences.should have(2).things 52 | end 53 | 54 | it "should get the right sentences" do 55 | @sentences[0].should == "Well... why would you do that?" 56 | @sentences[1].should == "Let's not fight." 57 | end 58 | end 59 | 60 | describe "simple URL matching" do 61 | before(:each) do 62 | text = "Hello, visit http://www.google.com/index.php?ok=ok for more info. Ok?" 
63 | @sentences = klass.sentences(text) 64 | end 65 | 66 | it "should get the correct number of sentences" do 67 | @sentences.should have(2).things 68 | end 69 | 70 | it "should get the right sentences" do 71 | @sentences[0].should == "Hello, visit http://www.google.com/index.php?ok=ok for more info." 72 | @sentences[1].should == "Ok?" 73 | end 74 | end 75 | 76 | describe "ending a sentence with an abbreviation" do 77 | before(:each) do 78 | text = "I was born in the U.S.S.R. My parents were from the U.S. This is not weird." 79 | @sentences = klass.sentences(text) 80 | end 81 | 82 | it "should get the correct number of sentences" do 83 | @sentences.should have(3).things 84 | end 85 | 86 | it "should get the correct sentences" do 87 | @sentences[0].should == "I was born in the U.S.S.R." 88 | @sentences[1].should == "My parents were from the U.S." 89 | @sentences[2].should == "This is not weird." 90 | end 91 | 92 | describe "which is hard-coded (like st, dr, mrs...)" do 93 | before(:each) do 94 | text = "This is a test. The word 'test' ends with the abbreviation for 'street'. This should still be three sentences." 95 | @sentences = klass.sentences(text) 96 | end 97 | 98 | it "should have the correct number of sentences" do 99 | @sentences.should have(3).things 100 | end 101 | 102 | it "should get the correct sentences" do 103 | @sentences[0].should == "This is a test." 104 | @sentences[1].should == "The word 'test' ends with the abbreviation for 'street'." 105 | @sentences[2].should == "This should still be three sentences." 106 | end 107 | end 108 | end 109 | 110 | describe "basic sentences" do 111 | before(:each) do 112 | text = "Hello, my name is David. What is your name?" 
113 | @sentences = klass.sentences(text) 114 | end 115 | 116 | it "should get the correct number of sentences" do 117 | @sentences.should have(2).things 118 | end 119 | end 120 | 121 | describe "short sentences w/ line breaks" do 122 | before(:each) do 123 | @doc = <<-EOF 124 | So how does the 401(k) plan work? Let's see - 125 | 126 | The 401(k) consists of - first, asking your employer to set aside a portion (upto 15% of your total income) in keeping with the plan. 127 | EOF 128 | @sentences = klass.sentences(@doc) 129 | end 130 | 131 | it "should find 3 sentences" do 132 | @sentences.should have(3).things 133 | end 134 | 135 | it "should stop at line breaks" do 136 | @sentences[1].should == "Let's see -" 137 | end 138 | end 139 | 140 | describe "sentences with URLs and abbreviation" do 141 | before(:each) do 142 | text = "Many of these leading names now have their own website, e.g. http://www.kaptest.com/. Hello, e.g. you don't know what you mean. I'm so angry about what you said about the U.S.A. or the u.S. or the U.S.S.R. ok." 143 | @sentences = klass.sentences(text) 144 | end 145 | 146 | it "should get the correct number of sentences" do 147 | @sentences[0].should == "Many of these leading names now have their own website, e.g. http://www.kaptest.com/." 148 | @sentences[1].should == "Hello, e.g. you don't know what you mean." 149 | @sentences[2].should == "I'm so angry about what you said about the U.S.A. or the u.S. or the U.S.S.R. ok." 
150 | @sentences.should have(3).things 151 | end 152 | end 153 | end 154 | 155 | describe "#abbreviation" do 156 | it "should change the abbreviations list" do 157 | klass.abbreviation('monkey', 'pig') 158 | klass.abbreviations.should include('monkey') 159 | klass.abbreviations.should include('pig') 160 | end 161 | 162 | it "should change the regex for abbreviations" do 163 | lambda { 164 | klass.abbreviation('monkey') 165 | }.should change(klass, :abbr_regex) 166 | end 167 | 168 | after(:each) do 169 | klass.initialize_abbreviations! 170 | end 171 | end 172 | 173 | end 174 | --------------------------------------------------------------------------------