├── .rspec ├── spec ├── spec_helper.rb └── word_count_analyzer │ ├── analyzer_spec.rb │ ├── performance_spec.rb │ ├── number_spec.rb │ ├── xhtml_spec.rb │ ├── hyperlink_spec.rb │ ├── hyphenated_word_spec.rb │ ├── numbered_list_spec.rb │ ├── punctuation_spec.rb │ ├── slash_spec.rb │ ├── ellipsis_spec.rb │ ├── contraction_spec.rb │ ├── date_spec.rb │ └── counter_spec.rb ├── .travis.yml ├── lib ├── word_count_analyzer │ ├── version.rb │ ├── number.rb │ ├── hyphenated_word.rb │ ├── xhtml.rb │ ├── ellipsis.rb │ ├── punctuation.rb │ ├── hyperlink.rb │ ├── numbered_list.rb │ ├── analyzer.rb │ ├── slash.rb │ ├── date.rb │ ├── contraction.rb │ └── counter.rb └── word_count_analyzer.rb ├── Gemfile ├── Rakefile ├── .gitignore ├── LICENSE.txt ├── word_count_analyzer.gemspec └── README.md /.rspec: -------------------------------------------------------------------------------- 1 | --color -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require 'word_count_analyzer' -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - "2.1.0" 4 | - "2.1.5" 5 | - "2.2.0" 6 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/version.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | VERSION = "1.0.1" 3 | end 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in word_count_analyzer.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require 'rspec/core/rake_task' 3 | RSpec::Core::RakeTask.new(:spec) 4 | task :default => :spec 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /Gemfile.lock 4 | /_yardoc/ 5 | /coverage/ 6 | /doc/ 7 | /pkg/ 8 | /spec/reports/ 9 | /tmp/ 10 | .idea/ 11 | *.bundle 12 | *.so 13 | *.o 14 | *.a 15 | mkmf.log 16 | -------------------------------------------------------------------------------- /lib/word_count_analyzer.rb: -------------------------------------------------------------------------------- 1 | require "word_count_analyzer/version" 2 | require "word_count_analyzer/analyzer" 3 | require "word_count_analyzer/counter" 4 | require "word_count_analyzer/contraction" 5 | require "word_count_analyzer/hyperlink" 6 | require "word_count_analyzer/hyphenated_word" 7 | require "word_count_analyzer/date" 8 | require "word_count_analyzer/ellipsis" 9 | require "word_count_analyzer/numbered_list" 10 | require "word_count_analyzer/xhtml" 11 | require "word_count_analyzer/number" 12 | require "word_count_analyzer/slash" 13 | require "word_count_analyzer/punctuation" 14 | require "engtagger" 15 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/number.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class Number 3 | # Rubular: http://rubular.com/r/OGj82uEu8d 4 | NUMBER_REGEX = /(?<=\A)\D?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$)|(?<=\s)\D?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$|$)/ 5 | 6 | attr_reader :string 7 | def initialize(string:) 8 | @string = string 9 | end 10 | 11 | def includes_number? 12 | !(string !~ NUMBER_REGEX) 13 | end 14 | 15 | def replace 16 | string.gsub(NUMBER_REGEX, ' wsnumword ') 17 | end 18 | 19 | def occurrences 20 | replace.scan(/wsnumword/).size 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/hyphenated_word.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class HyphenatedWord 3 | # Rubular: http://rubular.com/r/RjZ7qi0uFf 4 | DASHED_LINE_REGEX = /\s-{2,}(\s|$)|\A-{2,}(\s|$)/ 5 | 6 | attr_reader :token 7 | def initialize(token:) 8 | @token = token.gsub(DASHED_LINE_REGEX, '') 9 | end 10 | 11 | def hyphenated_word? 12 | (token.include?('-') || token.include?('﹘')) && !WordCountAnalyzer::Hyperlink.new.hyperlink?(token) 13 | end 14 | 15 | def count_as_multiple 16 | token.split(/[﹘,-]/).length 17 | end 18 | 19 | def replace 20 | token.split(/[﹘,-]/).join(' ') 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/xhtml.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class Xhtml 3 | # Rubular: http://rubular.com/r/ENrVFMdJ8v 4 | XHTML_REGEX = /<\/?[^>]*>/ 5 | attr_reader :string 6 | def initialize(string:) 7 | @string = string 8 | end 9 | 10 | def includes_xhtml? 11 | !(string !~ XHTML_REGEX) 12 | end 13 | 14 | def replace 15 | string.gsub(XHTML_REGEX, ' ') 16 | end 17 | 18 | def count_difference_word_boundary 19 | string.split(/\s+/).size - replace.strip.split(/\s+/).size 20 | end 21 | 22 | def occurrences 23 | string.gsub(XHTML_REGEX, ' wsxhtmlword ').scan(/wsxhtmlword/).size / 2 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Kevin S. Dias 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/ellipsis.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class Ellipsis 3 | # Rubular: http://rubular.com/r/mfdtSeuIf2 4 | FOUR_CONSECUTIVE_REGEX = /(?<=[^\.]|\A)\.{3}\.(?=[^\.]|$)/ 5 | 6 | # Rubular: http://rubular.com/r/YBG1dIHTRu 7 | THREE_SPACE_REGEX = /(\s\.){3}\s/ 8 | 9 | # Rubular: http://rubular.com/r/2VvZ8wRbd8 10 | FOUR_SPACE_REGEX = /(?<=[a-z]|\A)(\.\s){3}\.(\z|$|\n)/ 11 | 12 | OTHER_THREE_PERIOD_REGEX = /(?<=[^\.]|\A)\.{3}(?=([^\.]|$))/ 13 | 14 | UNICODE_ELLIPSIS = /(?<=[^…]|\A)…{1}(?=[^…]|$)/ 15 | 16 | def includes_ellipsis?(text) 17 | !(text !~ FOUR_CONSECUTIVE_REGEX) || 18 | !(text !~ THREE_SPACE_REGEX) || 19 | !(text !~ FOUR_SPACE_REGEX) || 20 | !(text !~ OTHER_THREE_PERIOD_REGEX) || 21 | !(text !~ UNICODE_ELLIPSIS) 22 | end 23 | 24 | def replace(text) 25 | text.gsub(FOUR_CONSECUTIVE_REGEX, ' wseword ') 26 | .gsub(THREE_SPACE_REGEX, ' wseword ') 27 | .gsub(FOUR_SPACE_REGEX, ' wseword ') 28 | .gsub(OTHER_THREE_PERIOD_REGEX, ' wseword ') 29 | .gsub(UNICODE_ELLIPSIS, ' wseword ') 30 | end 31 | 32 | def occurrences(text) 33 | count = 0 34 | replace(text).split(' ').map { |token| count += 1 if token.strip.eql?('wseword') } 35 | count 36 | end 37 | end 38 | end -------------------------------------------------------------------------------- /spec/word_count_analyzer/analyzer_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::Analyzer do 4 | context '#analysis' do 5 | it 'should analyze the gray areas #001' do 6 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 7 | ws = WordCountAnalyzer::Analyzer.new(text: text) 8 | expect(ws.analyze).to eq({"ellipsis"=>1, "hyperlink"=>2, "contraction"=>4, "hyphenated_word"=>2, "date"=>2, "number"=>5, "numbered_list"=>3, "xhtml"=>1, "forward_slash"=>1, "backslash"=>1, "dotted_line"=>1, "dashed_line"=>1, "underscore"=>1, "stray_punctuation"=>5}) 9 | end 10 | 11 | it 'should analyze the gray areas #002' do 12 | text = "hello world ..." 13 | ws = WordCountAnalyzer::Analyzer.new(text: text) 14 | expect(ws.analyze).to eq({"ellipsis"=>1, "hyperlink"=>0, "contraction"=>0, "hyphenated_word"=>0, "date"=>0, "number"=>0, "numbered_list"=>0, "xhtml"=>0, "forward_slash"=>0, "backslash"=>0, "dotted_line"=>0, "dashed_line"=>0, "underscore"=>0, "stray_punctuation"=>0}) 15 | end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/punctuation.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class Punctuation 3 | # Rubular: http://rubular.com/r/ZVBsZVkiqC 4 | DOTTED_LINE_REGEX = /…{2,}|\.{5,}/ 5 | 6 | # Rubular: http://rubular.com/r/RjZ7qi0uFf 7 | DASHED_LINE_REGEX = /(?<=\s)-{2,}(\s|$)|\A-{2,}(?=(\s|$))/ 8 | 9 | # Rubular: http://rubular.com/r/hNofimZwdh 10 | UNDERSCORE_REGEX = /(?<=\s)_{2,}(\s|$)|\A_{2,}(?=(\s|$))/ 11 | 12 | # Rubular: http://rubular.com/r/FexKxGUuIe 13 | STRAY_PUNCTUATION_REGEX = /(?<=\s|\A)[[:punct:]](?=(\s|$))|(?<=\s|\A)\|(?=(\s|$))/ 14 | 15 | attr_reader :string 16 | def initialize(string:) 17 | @string = string 18 | end 19 | 20 | def dotted_line_ocurrances 21 | string.scan(DOTTED_LINE_REGEX).size 22 | end 23 | 24 | def dashed_line_ocurrances 25 | string.scan(DASHED_LINE_REGEX).size 26 | end 27 | 28 | def underscore_ocurrances 29 | string.scan(UNDERSCORE_REGEX).size 30 | end 31 | 32 | def stray_punctuation_occurences 33 | string.scan(STRAY_PUNCTUATION_REGEX).size 34 | end 35 | 36 | def replace_dotted_line 37 | string.gsub(DOTTED_LINE_REGEX, '') 38 | end 39 | 40 | def replace_dashed_line 41 | string.gsub(DASHED_LINE_REGEX, '') 42 | end 43 | 44 | def replace_underscore 45 | string.gsub(UNDERSCORE_REGEX, '') 46 | end 47 | 48 | def replace_stray_punctuation 49 | string.gsub(STRAY_PUNCTUATION_REGEX, '') 50 | end 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /word_count_analyzer.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'word_count_analyzer/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "word_count_analyzer" 8 | spec.version = WordCountAnalyzer::VERSION 9 | spec.authors = ["Kevin S. Dias"] 10 | spec.email = ["diasks2@gmail.com"] 11 | spec.summary = %q{A word count analyzer - see what word count gray areas might be affecting your word count.} 12 | spec.description = %q{Word Count Analyzer is a Ruby gem that analyzes a string for potential areas of the text that might cause word count discrepancies depending on the tool used. It also provides comprehensive configuration options so you can easily customize how different gray areas should be counted and find the right word count for your purposes.} 13 | spec.homepage = "" 14 | spec.license = "MIT" 15 | 16 | spec.files = `git ls-files -z`.split("\x0") 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ["lib"] 20 | spec.required_ruby_version = '>= 2.1.0' 21 | 22 | spec.add_development_dependency "bundler" 23 | spec.add_development_dependency "rake", "~> 10.0" 24 | spec.add_development_dependency "rspec" 25 | spec.add_development_dependency "stackprof" 26 | spec.add_runtime_dependency "engtagger" 27 | end 28 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/hyperlink.rb: -------------------------------------------------------------------------------- 1 | require 'uri' 2 | 3 | module WordCountAnalyzer 4 | class Hyperlink 5 | NON_HYPERLINK_REGEX = /\A\w+:$/ 6 | 7 | # Rubular: http://rubular.com/r/fXa4lp0gfS 8 | HYPERLINK_REGEX = /(http|https|www)(\.|:)/ 9 | 10 | def hyperlink?(text) 11 | !(text !~ URI.regexp) && text !~ NON_HYPERLINK_REGEX && !(text !~ HYPERLINK_REGEX) 12 | end 13 | 14 | def occurrences(text) 15 | text.scan(URI.regexp).map { |link| link.compact.size > 1 ? 1 : 0 }.inject(0) { |sum, x| sum + x } 16 | end 17 | 18 | def replace(text) 19 | text.split(/\s+/).each do |token| 20 | if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">') 21 | text = text.gsub(/#{Regexp.escape(token.split('">')[0])}/, ' wslinkword ') 22 | elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) 23 | text = text.gsub(/#{Regexp.escape(token)}/, ' wslinkword ') 24 | end 25 | end 26 | text 27 | end 28 | 29 | def replace_split_at_period(text) 30 | text.split(/\s+/).each do |token| 31 | if !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) && token.include?('">') 32 | text.gsub!(/#{Regexp.escape(token.split('">')[0])}/) do |match| 33 | match.split('.').join(' ') 34 | end 35 | elsif !(token !~ URI.regexp) && token !~ NON_HYPERLINK_REGEX && !(token !~ HYPERLINK_REGEX) 36 | text.gsub!(/#{Regexp.escape(token)}/) do |match| 37 | match.split('.').join(' ') 38 | end 39 | end 40 | end 41 | text 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/numbered_list.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class NumberedList 3 | # Rubular: http://rubular.com/r/RKmRH9Y4oO 4 | NUMBERED_LIST_REGEX = /(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.\)|^\d{1,2}\.\)/ 5 | 6 | attr_reader :string 7 | def initialize(string:) 8 | @string = string 9 | end 10 | 11 | def includes_numbered_list? 12 | !(string !~ NUMBERED_LIST_REGEX) && has_at_least_two_items? 13 | end 14 | 15 | def replace 16 | new_string = string.dup 17 | list_array = string.scan(NUMBERED_LIST_REGEX).map(&:to_i) 18 | skips = 0 19 | list_array.each_with_index do |a, i| 20 | if (a + 1).eql?(list_array[i + 1]) || 21 | (a - 1).eql?(list_array[i - 1]) || 22 | (a.eql?(0) && list_array[i - 1].eql?(9)) || 23 | (a.eql?(9) && list_array[i + 1].eql?(0)) 24 | new_string.gsub!(NUMBERED_LIST_REGEX).with_index do |match, index| 25 | if i.eql?(index + (i - skips)) && match.chomp('.').eql?(a.to_s) 26 | '' 27 | else 28 | match 29 | end 30 | end 31 | else 32 | skips +=1 33 | end 34 | end 35 | new_string 36 | end 37 | 38 | def occurrences 39 | count_list_items_in_array 40 | end 41 | 42 | private 43 | 44 | def has_at_least_two_items? 45 | count_list_items_in_array >= 2 46 | end 47 | 48 | def count_list_items_in_array 49 | list_array = string.scan(NUMBERED_LIST_REGEX).map(&:to_i) 50 | counter = 0 51 | list_array.each_with_index do |a, i| 52 | next unless (a + 1).eql?(list_array[i + 1]) || 53 | (a - 1).eql?(list_array[i - 1]) || 54 | (a.eql?(0) && list_array[i - 1].eql?(9)) || 55 | (a.eql?(9) && list_array[i + 1].eql?(0)) 56 | counter += 1 57 | end 58 | counter 59 | end 60 | end 61 | end 62 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/analyzer.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class Analyzer 3 | attr_reader :text, :tagger 4 | def initialize(text:) 5 | @text = text 6 | @tagger = EngTagger.new 7 | end 8 | 9 | def analyze 10 | analysis = {} 11 | analysis['ellipsis'] = WordCountAnalyzer::Ellipsis.new.occurrences(text) 12 | contraction_count = 0 13 | hyphenated_word_count = 0 14 | WordCountAnalyzer::Xhtml.new(string: text).replace.split(/\s+/).each_with_index do |token, index| 15 | contraction_count += 1 if WordCountAnalyzer::Contraction.new(token: token, following_token: text.split(/\s+/)[index + 1], tgr: tagger, hyphen: 'single').contraction? 16 | hyphenated_word_count += 1 if WordCountAnalyzer::HyphenatedWord.new(token: token).hyphenated_word? 17 | end 18 | analysis['hyperlink'] = WordCountAnalyzer::Hyperlink.new.occurrences(text) 19 | analysis['contraction'] = contraction_count 20 | analysis['hyphenated_word'] = hyphenated_word_count 21 | analysis['date'] = WordCountAnalyzer::Date.new.occurrences(text) 22 | analysis['number'] = WordCountAnalyzer::Number.new(string: text).occurrences 23 | analysis['numbered_list'] = WordCountAnalyzer::NumberedList.new(string: text).occurrences 24 | analysis['xhtml'] = WordCountAnalyzer::Xhtml.new(string: text).occurrences 25 | analysis['forward_slash'] = WordCountAnalyzer::Slash.new(string: text).forward_slash_occurences 26 | analysis['backslash'] = WordCountAnalyzer::Slash.new(string: text).backslash_occurences 27 | analysis['dotted_line'] = WordCountAnalyzer::Punctuation.new(string: text).dotted_line_ocurrances 28 | analysis['dashed_line'] = WordCountAnalyzer::Punctuation.new(string: text).dashed_line_ocurrances 29 | analysis['underscore'] = WordCountAnalyzer::Punctuation.new(string: text).underscore_ocurrances 30 | analysis['stray_punctuation'] = WordCountAnalyzer::Punctuation.new(string: text).stray_punctuation_occurences 31 | analysis 32 | end 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /spec/word_count_analyzer/performance_spec.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding : utf-8 -*- 2 | require 'benchmark' 3 | require 'spec_helper' 4 | require 'stackprof' 5 | 6 | RSpec.describe WordCountAnalyzer::Analyzer do 7 | it 'is fast?' do 8 | benchmark do 9 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 10 | ws = WordCountAnalyzer::Analyzer.new(text: text).analyze 11 | end 12 | end 13 | 14 | it 'is analyzed' do 15 | data = StackProf.run(mode: :cpu, interval: 1000) do 16 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 17 | ws = WordCountAnalyzer::Analyzer.new(text: text).analyze 18 | end 19 | puts StackProf::Report.new(data).print_text 20 | end 21 | 22 | it 'is analyzed 2' do 23 | data = StackProf.run(mode: :cpu, interval: 1000) do 24 | token = "when'd" 25 | following_token = nil 26 | WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: EngTagger.new, hyphen: nil).contraction? 27 | end 28 | puts StackProf::Report.new(data).print_text 29 | end 30 | 31 | it 'is analyzed 3' do 32 | benchmark do 33 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 34 | ws = WordCountAnalyzer::Counter.new(forward_slash: 'count_as_multiple') 35 | 300.times do 36 | ws.count(text) 37 | end 38 | end 39 | end 40 | end 41 | 42 | def benchmark 43 | yield 44 | time = Benchmark.realtime { yield } 45 | puts "RUNTIME: #{time}" 46 | end 47 | -------------------------------------------------------------------------------- /spec/word_count_analyzer/number_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::Number do 4 | context '#includes_number?' do 5 | it 'returns true if the string includes a number #001' do 6 | string = 'It cost $10,000 dollars.' 7 | ws = WordCountAnalyzer::Number.new(string: string) 8 | expect(ws.includes_number?).to eq(true) 9 | end 10 | 11 | it 'returns true if the string includes a number #002' do 12 | string = 'It cost 500 dollars.' 13 | ws = WordCountAnalyzer::Number.new(string: string) 14 | expect(ws.includes_number?).to eq(true) 15 | end 16 | 17 | it 'returns true if the string includes a number #003' do 18 | string = 'It was only 50% of the total.' 19 | ws = WordCountAnalyzer::Number.new(string: string) 20 | expect(ws.includes_number?).to eq(true) 21 | end 22 | 23 | it 'returns true if the string includes a number #004' do 24 | string = 'It was only 50 % of the total.' 25 | ws = WordCountAnalyzer::Number.new(string: string) 26 | expect(ws.includes_number?).to eq(true) 27 | end 28 | 29 | it 'returns true if the string includes a number #005' do 30 | string = 'I was born in 1993' 31 | ws = WordCountAnalyzer::Number.new(string: string) 32 | expect(ws.includes_number?).to eq(true) 33 | end 34 | 35 | it "returns false if the string doesn't includes a number #006" do 36 | string = 'Hello world.' 37 | ws = WordCountAnalyzer::Number.new(string: string) 38 | expect(ws.includes_number?).to eq(false) 39 | end 40 | 41 | it "returns false if the string doesn't includes a number #007" do 42 | string = 'Today is 2/18/2014.' 43 | ws = WordCountAnalyzer::Number.new(string: string) 44 | expect(ws.includes_number?).to eq(false) 45 | end 46 | end 47 | 48 | context '#replace' do 49 | it 'returns the string with number and unit substituted as one token #001' do 50 | string = 'It was only 50 % of the total. 500 total $300.' 51 | ws = WordCountAnalyzer::Number.new(string: string) 52 | expect(ws.replace).to eq("It was only wsnumword % of the total. wsnumword total wsnumword ") 53 | end 54 | end 55 | 56 | context '#occurrences' do 57 | it 'returns the number of occurrences of a number in the string #001' do 58 | string = 'It was only 50 % of the total. 500 total. That costs $300 and is 50% off.' 59 | ws = WordCountAnalyzer::Number.new(string: string) 60 | expect(ws.occurrences).to eq(4) 61 | end 62 | 63 | it 'does not ignore dates #002' do 64 | string = 'It was only 50 % of the total on Wednesday, June 4 2015. 500 total. That costs $300 and is 50% off only on Apr 5th 1999.' 65 | ws = WordCountAnalyzer::Number.new(string: string) 66 | expect(ws.occurrences).to eq(7) 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /spec/word_count_analyzer/xhtml_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::Xhtml do 4 | context '#includes_xhtml?' do 5 | it 'returns true if the string includes XML or HTML #001' do 6 | string = 'Hello world' 7 | ws = WordCountAnalyzer::Xhtml.new(string: string) 8 | expect(ws.includes_xhtml?).to eq(true) 9 | end 10 | 11 | it 'returns true if the string includes XML or HTML #002' do 12 | string = 'Hello there. Another sentence Sentence here.' 13 | ws = WordCountAnalyzer::Xhtml.new(string: string) 14 | expect(ws.includes_xhtml?).to eq(true) 15 | end 16 | 17 | it "returns false if the string doesn't include XML or HTML #003" do 18 | string = 'Hello world.' 19 | ws = WordCountAnalyzer::Xhtml.new(string: string) 20 | expect(ws.includes_xhtml?).to eq(false) 21 | end 22 | end 23 | 24 | context '#replace' do 25 | it 'replaces XML or HTML with an empty string #001' do 26 | string = 'Hello world' 27 | ws = WordCountAnalyzer::Xhtml.new(string: string) 28 | expect(ws.replace).to eq(" Hello world ") 29 | end 30 | 31 | it 'replaces XML or HTML with an empty string #002' do 32 | string = 'Hello there. Another sentence Sentence here.' 33 | ws = WordCountAnalyzer::Xhtml.new(string: string) 34 | expect(ws.replace).to eq("Hello there. Another sentence Sentence here.") 35 | end 36 | end 37 | 38 | context '#count_difference_word_boundary' do 39 | it 'counts the difference in word count between with xhtml and without #001' do 40 | string = 'Hello world' 41 | ws = WordCountAnalyzer::Xhtml.new(string: string) 42 | expect(ws.count_difference_word_boundary).to eq(1) 43 | end 44 | 45 | it 'counts the difference in word count between with xhtml and without #002' do 46 | string = 'Hello there. Another sentence Sentence here.' 47 | ws = WordCountAnalyzer::Xhtml.new(string: string) 48 | expect(ws.count_difference_word_boundary).to eq(0) 49 | end 50 | 51 | it 'counts the difference in word count between with xhtml and without #003' do 52 | string = 'Hello world Hello there. Another sentence Sentence here. Hello world.' 53 | ws = WordCountAnalyzer::Xhtml.new(string: string) 54 | expect(ws.count_difference_word_boundary).to eq(1) 55 | end 56 | end 57 | 58 | context '#occurrences' do 59 | it 'counts the number of tags (1 opening set and 1 closing set of tags counts as 1)' do 60 | string = 'Hello world Hello there. Another sentence Sentence here. Hello world.' 61 | ws = WordCountAnalyzer::Xhtml.new(string: string) 62 | expect(ws.occurrences).to eq(2) 63 | end 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /spec/word_count_analyzer/hyperlink_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::Hyperlink do 4 | context '#hyperlink?(string)' do 5 | it 'returns true if the string is a hyperlink #001' do 6 | string = "http://www.example.com/this-IS-a_test/hello.html" 7 | ws = WordCountAnalyzer::Hyperlink.new 8 | expect(ws.hyperlink?(string)).to eq(true) 9 | end 10 | 11 | it 'returns true if the string is a hyperlink #002' do 12 | string = "http://www.google.co.uk" 13 | ws = WordCountAnalyzer::Hyperlink.new 14 | expect(ws.hyperlink?(string)).to eq(true) 15 | end 16 | 17 | it 'returns true if the string is a hyperlink #003' do 18 | string = "https://google.co.uk" 19 | ws = WordCountAnalyzer::Hyperlink.new 20 | expect(ws.hyperlink?(string)).to eq(true) 21 | end 22 | 23 | it 'returns false if the string is not a hyperlink #004' do 24 | string = "hello" 25 | ws = WordCountAnalyzer::Hyperlink.new 26 | expect(ws.hyperlink?(string)).to eq(false) 27 | end 28 | 29 | it 'returns false if the string is not a hyperlink #005' do 30 | string = "john@gmail.com" 31 | ws = WordCountAnalyzer::Hyperlink.new 32 | expect(ws.hyperlink?(string)).to eq(false) 33 | end 34 | 35 | it 'returns false if the string is not a hyperlink #006' do 36 | string = "date:" 37 | ws = WordCountAnalyzer::Hyperlink.new 38 | expect(ws.hyperlink?(string)).to eq(false) 39 | end 40 | 41 | it 'returns false if the string is not a hyperlink #007' do 42 | string = 'The file location is c:\Users\johndoe.' 43 | ws = WordCountAnalyzer::Hyperlink.new 44 | expect(ws.hyperlink?(string)).to eq(false) 45 | end 46 | end 47 | 48 | context '#occurrences' do 49 | it 'returns the occurrences of hyperlink tokens in a string #001' do 50 | string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk" 51 | ws = WordCountAnalyzer::Hyperlink.new 52 | expect(ws.occurrences(string)).to eq(2) 53 | end 54 | end 55 | 56 | context '#replace' do 57 | it 'replaces the hyperlinks in a string with regular tokens #001' do 58 | string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk" 59 | ws = WordCountAnalyzer::Hyperlink.new 60 | expect(ws.replace(string)).to eq("Today the date is: Jan 1. Visit wslinkword or wslinkword ") 61 | end 62 | 63 | it 'replaces the hyperlinks in a string with regular tokens #002' do 64 | string = 'The file location is c:\Users\johndoe or d:\Users\john\www' 65 | ws = WordCountAnalyzer::Hyperlink.new 66 | expect(ws.replace(string)).to eq('The file location is c:\Users\johndoe or d:\Users\john\www') 67 | end 68 | end 69 | 70 | context '#replace_split_at_period' do 71 | it 'replaces the hyperlinks in a string with regular tokens, split at periods #001' do 72 | string = "http://www.google.co.uk" 73 | ws = WordCountAnalyzer::Hyperlink.new 74 | expect(ws.replace_split_at_period(string)).to eq("http://www google co uk") 75 | end 76 | end 77 | end 78 | -------------------------------------------------------------------------------- /spec/word_count_analyzer/hyphenated_word_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::HyphenatedWord do 4 | context '#hyphenated_word?' do 5 | it 'returns true if the token is a hyphenated word #001' do 6 | token = 'devil-may-care' 7 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 8 | expect(ws.hyphenated_word?).to eq(true) 9 | end 10 | 11 | it 'returns true if the token is a hyphenated word #002' do 12 | token = '(2R)-2-methylsulfanyl-3-hydroxybutanedioate' 13 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 14 | expect(ws.hyphenated_word?).to eq(true) 15 | end 16 | 17 | it 'returns false if the token is not a hyphenated word' do 18 | token = 'hello' 19 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 20 | expect(ws.hyphenated_word?).to eq(false) 21 | end 22 | 23 | it 'returns false if the token is a hyperlink' do 24 | token = 'https://www.example-one.com' 25 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 26 | expect(ws.hyphenated_word?).to eq(false) 27 | end 28 | 29 | it 'returns false if the token is long string of dashes' do 30 | token = '------------' 31 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 32 | expect(ws.hyphenated_word?).to eq(false) 33 | end 34 | 35 | it 'returns true if the token is a hyphenated word (small em dashes)' do 36 | token = 'devil﹘may﹘care' 37 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 38 | expect(ws.hyphenated_word?).to eq(true) 39 | end 40 | end 41 | 42 | context '#count_as_multiple' do 43 | it 'returns the count of the individual words that are separated by the hyphen' do 44 | token = 'devil-may-care' 45 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 46 | expect(ws.count_as_multiple).to eq(3) 47 | end 48 | 49 | it 'handles small em dashes' do 50 | token = 'devil﹘may﹘care' 51 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 52 | expect(ws.count_as_multiple).to eq(3) 53 | end 54 | 55 | it 'returns the count of the individual words that are separated by the hyphen #002' do 56 | token = '(2R)-2-methylsulfanyl-3-hydroxybutanedioate' 57 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 58 | expect(ws.count_as_multiple).to eq(5) 59 | end 60 | end 61 | 62 | context '#replace' do 63 | it 'splits hyphenated words #001' do 64 | token = 'devil-may-care' 65 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 66 | expect(ws.replace).to eq('devil may care') 67 | end 68 | 69 | it 'splits hyphenated words #002' do 70 | token = 'devil﹘may﹘care' 71 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 72 | expect(ws.replace).to eq('devil may care') 73 | end 74 | 75 | it 'splits hyphenated words #003' do 76 | token = '(2R)-2-methylsulfanyl-3-hydroxybutanedioate' 77 | ws = WordCountAnalyzer::HyphenatedWord.new(token: token) 78 | expect(ws.replace).to eq('(2R) 2 methylsulfanyl 3 hydroxybutanedioate') 79 | end 80 | end 81 | end 82 | -------------------------------------------------------------------------------- /spec/word_count_analyzer/numbered_list_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::NumberedList do 4 | context '#includes_numbered_list?' do 5 | it 'returns true if the string includes a numbered list #001' do 6 | string = "1. List item a\n\n2. List item b\n\n3. List item c." 7 | ws = WordCountAnalyzer::NumberedList.new(string: string) 8 | expect(ws.includes_numbered_list?).to eq(true) 9 | end 10 | 11 | it 'returns false if the string does not include a numbered list #002' do 12 | string = "I have 1.00 dollar and 2 cents." 13 | ws = WordCountAnalyzer::NumberedList.new(string: string) 14 | expect(ws.includes_numbered_list?).to eq(false) 15 | end 16 | 17 | it 'returns false if the string does not include at least 2 list items #003' do 18 | string = "I have 2." 19 | ws = WordCountAnalyzer::NumberedList.new(string: string) 20 | expect(ws.includes_numbered_list?).to eq(false) 21 | end 22 | end 23 | 24 | context '#replace' do 25 | it 'replaces any numbered list numbers with an empty string' do 26 | string = "1. List item a\n\n2. List item b\n\n3. List item c." 27 | ws = WordCountAnalyzer::NumberedList.new(string: string) 28 | expect(ws.replace).to eq(" List item a\n\n List item b\n\n List item c.") 29 | end 30 | 31 | it 'replaces any numbered list numbers with an empty string' do 32 | string = "It also shouldn't have too many contractions, maybe 2. Let's add a list 1. List item a\n\n2. List item b\n\n3. List item c." 33 | ws = WordCountAnalyzer::NumberedList.new(string: string) 34 | expect(ws.replace).to eq("It also shouldn't have too many contractions, maybe 2. Let's add a list List item a\n\n List item b\n\n List item c.") 35 | end 36 | end 37 | 38 | context '#occurrences' do 39 | it 'counts the occurrences of numbered lists #001' do 40 | string = "1. List item a\n\n2. List item b\n\n3. List item c." 41 | ws = WordCountAnalyzer::NumberedList.new(string: string) 42 | expect(ws.occurrences).to eq(3) 43 | end 44 | 45 | it 'counts the occurrences of numbered lists #002' do 46 | string = "I have 2." 47 | ws = WordCountAnalyzer::NumberedList.new(string: string) 48 | expect(ws.occurrences).to eq(0) 49 | end 50 | 51 | it 'counts the occurrences of numbered lists #003' do 52 | string = "1. List item a\n\n2. List item b\n\n3. List item c. Then more text. Ok start a new list. 1. item a 2. item b." 53 | ws = WordCountAnalyzer::NumberedList.new(string: string) 54 | expect(ws.occurrences).to eq(5) 55 | end 56 | 57 | it 'counts the occurrences of numbered lists #004' do 58 | string = "1. List item a\n\n2. List item b\n\n3. List item c. Then more text. Ok start a new non-list. I have 2." 59 | ws = WordCountAnalyzer::NumberedList.new(string: string) 60 | expect(ws.occurrences).to eq(3) 61 | end 62 | 63 | it 'counts the occurrences of numbered lists #005' do 64 | string = "It also shouldn't have too many contractions, maybe 2. Let's add a list 1. item a 2. item b 3. item c." 65 | ws = WordCountAnalyzer::NumberedList.new(string: string) 66 | expect(ws.occurrences).to eq(3) 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/slash.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class Slash 3 | # Rubular: http://rubular.com/r/AqvcH29sgg 4 | FORWARD_SLASH_REGEX = /(?<=\s)(\S+\/)+\S+|(?<=\A)(\S+\/)+\S+/ 5 | 6 | # Rubular: http://rubular.com/r/tuFWtdMs4G 7 | BACKSLASH_REGEX = /\S+\\\S+/ 8 | 9 | attr_reader :string, :processed_string, :date, :xhtml, :hyperlink 10 | def initialize(string:, **args) 11 | @string = string 12 | @date = args[:date] || nil 13 | @xhtml = args[:xhtml] || nil 14 | @hyperlink = args[:hyperlink] || nil 15 | hyper = WordCountAnalyzer::Hyperlink.new 16 | if date.eql?('no_special_treatment') 17 | if xhtml.eql?('keep') 18 | if hyperlink.eql?('no_special_treatment') || hyperlink.eql?('split_at_period') 19 | @processed_string = string 20 | else 21 | @processed_string = hyper.replace(string) 22 | end 23 | else 24 | if hyperlink.eql?('no_special_treatment') || hyperlink.eql?('split_at_period') 25 | @processed_string = WordCountAnalyzer::Xhtml.new(string: string).replace 26 | else 27 | @processed_string = WordCountAnalyzer::Xhtml.new(string: hyper.replace(string)).replace 28 | end 29 | end 30 | else 31 | if xhtml.eql?('keep') 32 | if hyperlink.eql?('no_special_treatment') || hyperlink.eql?('split_at_period') 33 | @processed_string = WordCountAnalyzer::Date.new.replace(string) 34 | else 35 | @processed_string = WordCountAnalyzer::Date.new.replace(hyper.replace(string)) 36 | end 37 | else 38 | if hyperlink.eql?('no_special_treatment') || hyperlink.eql?('split_at_period') 39 | @processed_string = WordCountAnalyzer::Date.new.replace(WordCountAnalyzer::Xhtml.new(string: string).replace) 40 | else 41 | @processed_string = WordCountAnalyzer::Date.new.replace(WordCountAnalyzer::Xhtml.new(string: hyper.replace(string)).replace) 42 | end 43 | end 44 | end 45 | end 46 | 47 | def includes_forward_slash? 48 | !(processed_string !~ FORWARD_SLASH_REGEX) 49 | end 50 | 51 | def includes_backslash? 52 | !(processed_string !~ BACKSLASH_REGEX) 53 | end 54 | 55 | def forward_slash_occurences 56 | processed_string.scan(FORWARD_SLASH_REGEX).size 57 | end 58 | 59 | def replace_forward_slashes 60 | return processed_string if processed_string !~ FORWARD_SLASH_REGEX 61 | processed_string.gsub!(FORWARD_SLASH_REGEX).each do |match| 62 | match.split(/\/+/).join(' ') 63 | end 64 | processed_string 65 | end 66 | 67 | def replace_forward_slashes_except_dates 68 | return processed_string if processed_string !~ FORWARD_SLASH_REGEX 69 | except_date_string = WordCountAnalyzer::Date.new.replace_number_only_date(processed_string) 70 | except_date_string.gsub!(FORWARD_SLASH_REGEX).each do |match| 71 | match.split(/\/+/).join(' ') 72 | end 73 | except_date_string 74 | end 75 | 76 | def backslash_occurences 77 | processed_string.scan(BACKSLASH_REGEX).size 78 | end 79 | 80 | def replace_backslashes 81 | return processed_string if processed_string !~ BACKSLASH_REGEX 82 | processed_string.gsub!(BACKSLASH_REGEX).each do |match| 83 | ' word ' * match.split(/\\+/).length 84 | end 85 | processed_string 86 | end 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /spec/word_count_analyzer/punctuation_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::Punctuation do 4 | context '#dotted_line_ocurrances' do 5 | it 'returns the number of dotted line occurrences #001' do 6 | string = "Here is one …………………………………………………………………… and another ......" 7 | ws = WordCountAnalyzer::Punctuation.new(string: string) 8 | expect(ws.dotted_line_ocurrances).to eq(2) 9 | end 10 | 11 | it 'returns the number of dotted line occurrences #002' do 12 | string = "Hello world" 13 | ws = WordCountAnalyzer::Punctuation.new(string: string) 14 | expect(ws.dotted_line_ocurrances).to eq(0) 15 | end 16 | end 17 | 18 | context '#dashed_line_ocurrances' do 19 | it 'returns the number of dotted line occurrences #001' do 20 | string = "Here is one ----- and another -----" 21 | ws = WordCountAnalyzer::Punctuation.new(string: string) 22 | expect(ws.dashed_line_ocurrances).to eq(2) 23 | end 24 | 25 | it 'returns the number of dotted line occurrences #002' do 26 | string = "Hello world" 27 | ws = WordCountAnalyzer::Punctuation.new(string: string) 28 | expect(ws.dashed_line_ocurrances).to eq(0) 29 | end 30 | end 31 | 32 | context '#underscore_ocurrances' do 33 | it 'returns the number of undescore occurrences #001' do 34 | string = "Here is one ______ and another ______" 35 | ws = WordCountAnalyzer::Punctuation.new(string: string) 36 | expect(ws.underscore_ocurrances).to eq(2) 37 | end 38 | 39 | it 'returns the number of undescore occurrences #002' do 40 | string = "Hello world" 41 | ws = WordCountAnalyzer::Punctuation.new(string: string) 42 | expect(ws.underscore_ocurrances).to eq(0) 43 | end 44 | end 45 | 46 | context '#stray_punctuation_occurences' do 47 | it 'returns the number of stray punctuation occurrences #001' do 48 | string = "Hello world ? This is another - sentence ." 49 | ws = WordCountAnalyzer::Punctuation.new(string: string) 50 | expect(ws.stray_punctuation_occurences).to eq(3) 51 | end 52 | 53 | it 'returns the number of stray punctuation occurrences #002' do 54 | string = "Hello world. Great?" 55 | ws = WordCountAnalyzer::Punctuation.new(string: string) 56 | expect(ws.stray_punctuation_occurences).to eq(0) 57 | end 58 | 59 | it 'returns the number of stray punctuation occurrences #003' do 60 | string = "." 61 | ws = WordCountAnalyzer::Punctuation.new(string: string) 62 | expect(ws.stray_punctuation_occurences).to eq(1) 63 | end 64 | end 65 | 66 | context '#replace_dotted_line' do 67 | it 'replaces the dotted lines' do 68 | string = "Here is one …………………………………………………………………… and another ......" 69 | ws = WordCountAnalyzer::Punctuation.new(string: string) 70 | expect(ws.replace_dotted_line).to eq("Here is one and another ") 71 | end 72 | end 73 | 74 | context '#replace_dashed_line' do 75 | it 'replaces the dashed lines' do 76 | string = "Here is one ----- and another -----" 77 | ws = WordCountAnalyzer::Punctuation.new(string: string) 78 | expect(ws.replace_dashed_line).to eq("Here is one and another ") 79 | end 80 | end 81 | 82 | context '#replace_underscore' do 83 | it 'replaces the underscores' do 84 | string = "Here is one ______ and another ______" 85 | ws = WordCountAnalyzer::Punctuation.new(string: string) 86 | expect(ws.replace_underscore).to eq("Here is one and another ") 87 | end 88 | end 89 | 90 | context '#replace_stray_punctuation' do 91 | it 'replaces any stray punctutation' do 92 | string = "Hello world ? This is another - sentence ." 93 | ws = WordCountAnalyzer::Punctuation.new(string: string) 94 | expect(ws.replace_stray_punctuation).to eq("Hello world This is another sentence ") 95 | end 96 | end 97 | end 98 | -------------------------------------------------------------------------------- /spec/word_count_analyzer/slash_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::Slash do 4 | context '#includes_forward_slash?' do 5 | it 'returns true if the string includes a token with a forward slash #001' do 6 | string = "Using the solidus for he/she/it is often discouraged, except in this case." 7 | ws = WordCountAnalyzer::Slash.new(string: string) 8 | expect(ws.includes_forward_slash?).to eq(true) 9 | end 10 | 11 | it 'returns false if the string does not includes a token with a forward slash #002' do 12 | string = "Hello world." 13 | ws = WordCountAnalyzer::Slash.new(string: string) 14 | expect(ws.includes_forward_slash?).to eq(false) 15 | end 16 | 17 | it 'ignores hyperlinks #003' do 18 | string = "http://www.google.com/google" 19 | ws = WordCountAnalyzer::Slash.new(string: string) 20 | expect(ws.includes_forward_slash?).to eq(false) 21 | end 22 | 23 | it 'ignores dates #004' do 24 | string = "Today is 2/15/2013" 25 | ws = WordCountAnalyzer::Slash.new(string: string) 26 | expect(ws.includes_forward_slash?).to eq(false) 27 | end 28 | end 29 | 30 | context '#includes_backslash?' do 31 | it 'returns true if the string includes a token with a backslash #001' do 32 | string = 'The file location is c:\Users\johndoe.' 33 | ws = WordCountAnalyzer::Slash.new(string: string) 34 | expect(ws.includes_backslash?).to eq(true) 35 | end 36 | 37 | it 'returns false if the string does not includes a token with a backslash #002' do 38 | string = "Hello world." 39 | ws = WordCountAnalyzer::Slash.new(string: string) 40 | expect(ws.includes_backslash?).to eq(false) 41 | end 42 | end 43 | 44 | context '#forward_slash_occurences' do 45 | it 'returns the number of occurrences of tokens with a forward slash #001' do 46 | string = "Using the solidus for he/she/it is often discouraged, except in this case she/he said." 47 | ws = WordCountAnalyzer::Slash.new(string: string) 48 | expect(ws.forward_slash_occurences).to eq(2) 49 | end 50 | 51 | it 'returns the number of occurrences of tokens with a forward slash #002' do 52 | string = "Hello world." 53 | ws = WordCountAnalyzer::Slash.new(string: string) 54 | expect(ws.forward_slash_occurences).to eq(0) 55 | end 56 | end 57 | 58 | context '#backslash_occurences' do 59 | it 'returns the number of occurrences of tokens with a backslash #001' do 60 | string = 'The file location is c:\Users\johndoe or d:\Users\john\www' 61 | ws = WordCountAnalyzer::Slash.new(string: string) 62 | expect(ws.backslash_occurences).to eq(2) 63 | end 64 | 65 | it 'returns the number of occurrences of tokens with a backslash #002' do 66 | string = "Hello world." 67 | ws = WordCountAnalyzer::Slash.new(string: string) 68 | expect(ws.backslash_occurences).to eq(0) 69 | end 70 | 71 | it 'returns the number of occurrences of tokens with a backslash #003' do 72 | string = "Hello world." 73 | ws = WordCountAnalyzer::Slash.new(string: string) 74 | expect(ws.backslash_occurences).to eq(0) 75 | end 76 | end 77 | 78 | context '#replace_forward_slashes_multiple' do 79 | it 'replaces forward slashes with multiple tokens #001' do 80 | string = "he/she/it" 81 | ws = WordCountAnalyzer::Slash.new(string: string) 82 | expect(ws.replace_forward_slashes).to eq("he she it") 83 | end 84 | 85 | it 'replaces forward slashes with multiple tokens #002' do 86 | string = "hello//world" 87 | ws = WordCountAnalyzer::Slash.new(string: string) 88 | expect(ws.replace_forward_slashes).to eq("hello world") 89 | end 90 | end 91 | 92 | context '#replace_forward_slashes_except_dates' do 93 | it 'replaces forward slashes with multiple tokens #001' do 94 | string = "he/she/it 4/28/2013" 95 | ws = WordCountAnalyzer::Slash.new(string: string) 96 | expect(ws.replace_forward_slashes).to eq("he she it wsdateword ") 97 | end 98 | 99 | it 'replaces forward slashes with multiple tokens #002' do 100 | string = "hello//world 4/28/2013" 101 | ws = WordCountAnalyzer::Slash.new(string: string) 102 | expect(ws.replace_forward_slashes).to eq("hello world wsdateword ") 103 | end 104 | end 105 | end 106 | -------------------------------------------------------------------------------- /spec/word_count_analyzer/ellipsis_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::Ellipsis do 4 | context '#includes_ellipsis?(string)' do 5 | it 'returns true if the string includes an ellipsis #001' do 6 | string = 'Using an ellipsis … causes different counts.' 7 | ws = WordCountAnalyzer::Ellipsis.new 8 | expect(ws.includes_ellipsis?(string)).to eq(true) 9 | end 10 | 11 | it 'returns true if the string includes an ellipsis #002' do 12 | string = 'Using an ellipsis causes different counts…depending on the style that you use.' 13 | ws = WordCountAnalyzer::Ellipsis.new 14 | expect(ws.includes_ellipsis?(string)).to eq(true) 15 | end 16 | 17 | it 'returns true if the string includes an ellipsis #003' do 18 | string = 'Using an ellipsis causes different counts depending on the style . . . that you use.' 19 | ws = WordCountAnalyzer::Ellipsis.new 20 | expect(ws.includes_ellipsis?(string)).to eq(true) 21 | end 22 | 23 | it 'returns true if the string includes an ellipsis #004' do 24 | string = 'Using an ellipsis causes different counts depending on the style . . . . that you use.' 25 | ws = WordCountAnalyzer::Ellipsis.new 26 | expect(ws.includes_ellipsis?(string)).to eq(true) 27 | end 28 | 29 | it 'returns true if the string includes an ellipsis #005' do 30 | string = 'Using an ellipsis causes different counts depending on the style.... that you use.' 31 | ws = WordCountAnalyzer::Ellipsis.new 32 | expect(ws.includes_ellipsis?(string)).to eq(true) 33 | end 34 | 35 | it 'returns true if the string includes an ellipsis #006' do 36 | string = 'hello world ...' 37 | ws = WordCountAnalyzer::Ellipsis.new 38 | expect(ws.includes_ellipsis?(string)).to eq(true) 39 | end 40 | 41 | it 'returns true if the string includes an ellipsis #007' do 42 | string = '...' 43 | ws = WordCountAnalyzer::Ellipsis.new 44 | expect(ws.includes_ellipsis?(string)).to eq(true) 45 | end 46 | 47 | it 'returns true if the string includes an ellipsis #008' do 48 | string = '....' 49 | ws = WordCountAnalyzer::Ellipsis.new 50 | expect(ws.includes_ellipsis?(string)).to eq(true) 51 | end 52 | 53 | it 'returns true if the string includes an ellipsis #009' do 54 | string = ' . . . ' 55 | ws = WordCountAnalyzer::Ellipsis.new 56 | expect(ws.includes_ellipsis?(string)).to eq(true) 57 | end 58 | 59 | it 'returns true if the string includes an ellipsis #010' do 60 | string = ' . . . . ' 61 | ws = WordCountAnalyzer::Ellipsis.new 62 | expect(ws.includes_ellipsis?(string)).to eq(true) 63 | end 64 | 65 | it 'returns true if the string includes an ellipsis #011' do 66 | string = '…' 67 | ws = WordCountAnalyzer::Ellipsis.new 68 | expect(ws.includes_ellipsis?(string)).to eq(true) 69 | end 70 | 71 | it "returns false if the string doesn't include an ellipsis #012" do 72 | string = 'Hello world.' 73 | ws = WordCountAnalyzer::Ellipsis.new 74 | expect(ws.includes_ellipsis?(string)).to eq(false) 75 | end 76 | 77 | it "returns false if the string includes a dotted_line #0013" do 78 | string = '.....' 79 | ws = WordCountAnalyzer::Ellipsis.new 80 | expect(ws.includes_ellipsis?(string)).to eq(false) 81 | end 82 | 83 | it "returns false if the string includes a dotted_line #0014" do 84 | string = "Here is one …………………………………………………………………… and another ......" 85 | ws = WordCountAnalyzer::Ellipsis.new 86 | expect(ws.includes_ellipsis?(string)).to eq(false) 87 | end 88 | end 89 | 90 | context '#replace' do 91 | it 'returns a string with the ellipsis replaced #001' do 92 | string = 'Using an ellipsis … causes different counts…depending on the style . . . that you use. I never meant that.... She left the store. The practice was not abandoned. . . .' 93 | ws = WordCountAnalyzer::Ellipsis.new 94 | expect(ws.replace(string)).to eq("Using an ellipsis wseword causes different counts wseword depending on the style wseword that you use. I never meant that wseword She left the store. The practice was not abandoned wseword ") 95 | end 96 | end 97 | 98 | context '#occurrences' do 99 | it 'returns a string with the ellipsis replaced #001' do 100 | string = 'Using an ellipsis … causes different counts…depending on the style . . . that you use. I never meant that.... She left the store. The practice was not abandoned. . . .' 101 | ws = WordCountAnalyzer::Ellipsis.new 102 | expect(ws.occurrences(string)).to eq(5) 103 | end 104 | end 105 | end -------------------------------------------------------------------------------- /lib/word_count_analyzer/date.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class Date 3 | DOW = %w(monday tuesday wednesday thursday friday saturday sunday) 4 | DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun) 5 | MONTHS = %w(january february march april may june july august september october november december) 6 | MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec) 7 | # Rubular: http://rubular.com/r/73CZ2HU0q6 8 | DMY_MDY_REGEX = /(\d{1,2}(\/|\.|-)){2}\d{4}\.?/ 9 | 10 | # Rubular: http://rubular.com/r/GWbuWXw4t0 11 | YMD_YDM_REGEX = /\d{4}(\/|\.|-)(\d{1,2}(\/|\.|-)){2}\.?/ 12 | 13 | # Rubular: http://rubular.com/r/SRZ27XNlvR 14 | DIGIT_ONLY_YEAR_FIRST_REGEX = /[12]\d{7}\D\.?/ 15 | 16 | # Rubular: http://rubular.com/r/mpVSeaKwdY 17 | DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D\.?/ 18 | 19 | def includes_date?(text) 20 | includes_long_date?(text) || includes_number_only_date?(text) 21 | end 22 | 23 | def replace(text) 24 | counter = 0 25 | DOW_ABBR.map { |day| counter +=1 if text.include?('day') } 26 | text = redact_dates(counter, text) 27 | redact_regex(text) 28 | end 29 | 30 | def occurrences(text) 31 | replace(text).scan(/wsdateword/).size 32 | end 33 | 34 | def replace_number_only_date(text) 35 | text.gsub(DMY_MDY_REGEX, ' wsdateword ') 36 | .gsub(YMD_YDM_REGEX, ' wsdateword ') 37 | .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' wsdateword ') 38 | .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' wsdateword ') 39 | end 40 | 41 | private 42 | 43 | def redact_dates(counter, text) 44 | if counter > 0 45 | text = redact_dow_abbr(text) 46 | text = redact_dow(text) 47 | else 48 | text = redact_dow(text) 49 | text = redact_dow_abbr(text) 50 | end 51 | text 52 | end 53 | 54 | def redact_regex(text) 55 | text.gsub(DMY_MDY_REGEX, ' wsdateword ') 56 | .gsub(YMD_YDM_REGEX, ' wsdateword ') 57 | .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, ' wsdateword ') 58 | .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, ' wsdateword ') 59 | end 60 | 61 | def redact_dow(text) 62 | DOW.each do |day| 63 | MONTHS.map { |month| text = redact_date(text, day, month) } 64 | MONTH_ABBR.map { |month| text = redact_date(text, day, month) } 65 | end 66 | text 67 | end 68 | 69 | def redact_dow_abbr(text) 70 | DOW_ABBR.each do |day| 71 | MONTHS.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ') } 72 | MONTH_ABBR.map { |month| text = text.gsub(/#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ') } 73 | end 74 | text 75 | end 76 | 77 | def redact_date(text, day, month) 78 | text.gsub(/#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ') 79 | .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i, ' wsdateword ') 80 | .gsub(/\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*\.?/i, ' wsdateword ') 81 | .gsub(/\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+\.?/i, ' wsdateword ') 82 | .gsub(/#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*\.?/i, ' wsdateword ') 83 | .gsub(/\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})\.?/i, ' wsdateword ') 84 | end 85 | 86 | def includes_long_date?(text) 87 | includes_long_date_1?(text) || includes_long_date_2?(text) 88 | end 89 | 90 | def includes_long_date_1?(text) 91 | DOW.each do |day| 92 | MONTHS.map { |month| return true if check_for_matches(day, month, text) } 93 | MONTH_ABBR.map { |month| return true if check_for_matches(day, month, text) } 94 | end 95 | false 96 | end 97 | 98 | def includes_long_date_2?(text) 99 | DOW_ABBR.each do |day| 100 | MONTHS.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}/i) } 101 | MONTH_ABBR.map { |month| return true if !(text !~ /#{Regexp.escape(day)}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) } 102 | end 103 | false 104 | end 105 | 106 | def includes_number_only_date?(text) 107 | !(text !~ DMY_MDY_REGEX) || 108 | !(text !~ YMD_YDM_REGEX) || 109 | !(text !~ DIGIT_ONLY_YEAR_FIRST_REGEX) || 110 | !(text !~ DIGIT_ONLY_YEAR_LAST_REGEX) 111 | end 112 | 113 | def check_for_matches(day, month, text) 114 | !(text !~ /#{Regexp.escape(day)}(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) || 115 | !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}/i) || 116 | !(text !~ /\d{4}\.*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*/i) || 117 | !(text !~ /\d{4}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*\d+/i) || 118 | !(text !~ /#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*/i) || 119 | !(text !~ /\d{2}(\.|-|\/)*#{Regexp.escape(month)}(\.|-|\/)*(\d{4}|\d{2})/i) 120 | end 121 | end 122 | end -------------------------------------------------------------------------------- /spec/word_count_analyzer/contraction_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::Contraction do 4 | before do 5 | @tgr = EngTagger.new 6 | end 7 | context '#contraction?' do 8 | it 'returns true if the token is a contraction' do 9 | token = "when'd" 10 | following_token = nil 11 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: nil) 12 | expect(ws.contraction?).to eq(true) 13 | end 14 | 15 | it 'returns true if the token is an irregular contraction' do 16 | token = "o'clock" 17 | following_token = nil 18 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: nil) 19 | expect(ws.contraction?).to eq(true) 20 | end 21 | 22 | it 'returns false if the token is a possesive and not a contraction' do 23 | token = "Bob's" 24 | following_token = "car" 25 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: nil) 26 | expect(ws.contraction?).to eq(false) 27 | end 28 | 29 | it 'returns true if the token is a contraction' do 30 | token = "Bob's" 31 | following_token = "the" 32 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: nil) 33 | expect(ws.contraction?).to eq(true) 34 | end 35 | 36 | it 'returns true if the token is a contraction' do 37 | token = "Bob's" 38 | following_token = "open" 39 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: nil) 40 | expect(ws.contraction?).to eq(true) 41 | end 42 | 43 | it 'returns true if the token is a contraction' do 44 | token = "Don't" 45 | following_token = "count" 46 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: nil) 47 | expect(ws.contraction?).to eq(true) 48 | end 49 | end 50 | 51 | context '#expanded_count' do 52 | it 'returns the count of the contraction expanded #001' do 53 | token = "when'd" 54 | following_token = nil 55 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: nil) 56 | expect(ws.expanded_count).to eq(2) 57 | end 58 | 59 | it 'returns the count of the contraction expanded #002' do 60 | token = "o'clock" 61 | following_token = nil 62 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: nil) 63 | expect(ws.expanded_count).to eq(3) 64 | end 65 | 66 | it 'returns the count of the contraction expanded #003' do 67 | token = "Bob's" 68 | following_token = "car" 69 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: nil) 70 | expect(ws.expanded_count).to eq(1) 71 | end 72 | 73 | it 'returns the count of the contraction expanded #004' do 74 | token = "Bob's" 75 | following_token = "the" 76 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: nil) 77 | expect(ws.expanded_count).to eq(2) 78 | end 79 | 80 | it 'returns the count of the contraction expanded #005' do 81 | token = "cat-o'-nine-tails" 82 | following_token = nil 83 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: 'count_as_one') 84 | expect(ws.expanded_count).to eq(1) 85 | end 86 | 87 | it 'returns the count of the contraction expanded #006' do 88 | token = "cat-o'-nine-tails" 89 | following_token = nil 90 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr, hyphen: 'count_as_multiple') 91 | expect(ws.expanded_count).to eq(4) 92 | end 93 | end 94 | 95 | context '#replace' do 96 | it 'replaces the token with the contraction expanded #001' do 97 | token = "cat-o'-nine-tails" 98 | following_token = nil 99 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr) 100 | expect(ws.replace).to eq("cat-of-nine-tails") 101 | end 102 | 103 | it 'replaces the token with the contraction expanded #002' do 104 | token = "Bob's" 105 | following_token = "the" 106 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr) 107 | expect(ws.replace).to eq(" word word ") 108 | end 109 | 110 | it 'replaces the token with the contraction expanded #003' do 111 | token = "don't" 112 | following_token = nil 113 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr) 114 | expect(ws.replace).to eq("do not") 115 | end 116 | 117 | it 'replaces the token with the contraction expanded #004' do 118 | token = "hello" 119 | following_token = nil 120 | ws = WordCountAnalyzer::Contraction.new(token: token, following_token: following_token, tgr: @tgr) 121 | expect(ws.replace).to eq("hello") 122 | end 123 | end 124 | end 125 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/contraction.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class Contraction 3 | CONTRACTIONS = { 4 | "i'm" => "I am", 5 | "i'll" => "I will", 6 | "i'd" => "I would", 7 | "i've" => "I have", 8 | "you're" => "you are", 9 | "you'll" => "you will", 10 | "you'd" => "you would", 11 | "you've" => "you have", 12 | "he's" => "he is", 13 | "he'll" => "he will", 14 | "he'd" => "he would", 15 | "she's" => "she is", 16 | "she'll" => "she will", 17 | "she'd" => "she would", 18 | "it's" => "it is", 19 | "'tis" => "it is", 20 | "it'll" => "it will", 21 | "it'd" => "it would", 22 | "we're" => "we are", 23 | "we'll" => "we will", 24 | "we'd" => "we would", 25 | "we've" => "we have", 26 | "they're" => "they are", 27 | "they'll" => "they will", 28 | "they'd" => "they would", 29 | "they've" => "they have", 30 | "that's" => "that is", 31 | "that'll" => "that will", 32 | "that'd" => "that would", 33 | "who's" => "who is", 34 | "who'll" => "who will", 35 | "who'd" => "who would", 36 | "what's" => "what is", 37 | "what're" => "what are", 38 | "what'll" => "what will", 39 | "what'd" => "what would", 40 | "where's" => "where is", 41 | "where'll" => "where will", 42 | "where'd" => "where would", 43 | "when's" => "when is", 44 | "when'll" => "when will", 45 | "when'd" => "when would", 46 | "why's" => "why is", 47 | "why'll" => "why will", 48 | "why'd" => "why would", 49 | "how's" => "how is", 50 | "how'll" => "how will", 51 | "how'd" => "how would", 52 | "she'd've" => "she would have", 53 | "'tisn't" => "it is not", 54 | "isn't" => "is not", 55 | "aren't" => "are not", 56 | "wasn't" => "was not", 57 | "weren't" => "were not", 58 | "haven't" => "have not", 59 | "hasn't" => "has not", 60 | "hadn't" => "had not", 61 | "won't" => "will not", 62 | "wouldn't" => "would not", 63 | "don't" => "do not", 64 | "doesn't" => "does not", 65 | "didn't" => "did not", 66 | "can't" => "cannot", 67 | "couldn't" => "could not", 68 | "shouldn't" => "should not", 69 | "mightn't" => "might not", 70 | "mustn't" => "must not", 71 | "would've" => "would have", 72 | "should've" => "should have", 73 | "could've" => "could have", 74 | "might've" => "might have", 75 | "must've" => "must have", 76 | "o'" => "of", 77 | "o'clock" => "of the clock", 78 | "ma'am" => "madam", 79 | "ne'er-do-well" => "never-do-well", 80 | "cat-o'-nine-tails" => "cat-of-nine-tails", 81 | "jack-o'-lantern" => "jack-of-the-lantern", 82 | "will-o'-the-wisp" => "will-of-the-wisp", 83 | "'twas" => "it was" 84 | }.freeze 85 | 86 | attr_reader :token, :following_token, :tgr, :hyphen 87 | def initialize(token:, following_token:, tgr:, **args) 88 | @token = token 89 | @following_token = following_token 90 | @tgr = tgr 91 | @hyphen = args[:hyphen] || 'count_as_one' 92 | end 93 | 94 | def contraction? 95 | common_contraction? || 96 | (apostrophe_s_token? && 97 | following_is_not_a_noun?) 98 | end 99 | 100 | def expanded_count 101 | if self.contraction? 102 | if common_contraction? 103 | calculate_contraction_length 104 | else 105 | 2 106 | end 107 | else 108 | 1 109 | end 110 | end 111 | 112 | def replace 113 | if CONTRACTIONS.has_key?(token.downcase) 114 | CONTRACTIONS[token.downcase] 115 | elsif apostrophe_s_token? && following_is_not_a_noun? 116 | ' word word ' 117 | else 118 | token 119 | end 120 | end 121 | 122 | private 123 | 124 | def calculate_contraction_length 125 | if hyphen.eql?('count_as_one') && hyphen 126 | contraction_length 127 | else 128 | contraction_length_hyphen 129 | end 130 | end 131 | 132 | def contraction_length 133 | CONTRACTIONS[token.downcase].split(' ').length 134 | end 135 | 136 | def contraction_length_hyphen 137 | CONTRACTIONS[token.downcase].split(' ').map { |token| token.split('-') }.flatten.length 138 | end 139 | 140 | def common_contraction? 141 | CONTRACTIONS.has_key?(token.downcase) 142 | end 143 | 144 | def following_is_not_a_noun? 145 | !tgr.add_tags(following_token)[1].downcase.eql?('n') 146 | end 147 | 148 | def apostrophe_s_token? 149 | token.include?("'s") 150 | end 151 | end 152 | end -------------------------------------------------------------------------------- /spec/word_count_analyzer/date_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::Date do 4 | context '#includes_date?(string)' do 5 | it 'returns true if the string includes a date #001' do 6 | string = 'Today is Monday, April 4th, 2011, aka 04/04/2011.' 7 | ws = WordCountAnalyzer::Date.new 8 | expect(ws.includes_date?(string)).to eq(true) 9 | end 10 | 11 | it 'returns true if the string includes a date #002' do 12 | string = 'Today is Monday April 4th 2011.' 13 | ws = WordCountAnalyzer::Date.new 14 | expect(ws.includes_date?(string)).to eq(true) 15 | end 16 | 17 | it 'returns true if the string includes a date #003' do 18 | string = 'Today is April 4th, 2011.' 19 | ws = WordCountAnalyzer::Date.new 20 | expect(ws.includes_date?(string)).to eq(true) 21 | end 22 | 23 | it 'returns true if the string includes a date #004' do 24 | string = 'Today is Mon., Apr. 4, 2011.' 25 | ws = WordCountAnalyzer::Date.new 26 | expect(ws.includes_date?(string)).to eq(true) 27 | end 28 | 29 | it 'returns true if the string includes a date #005' do 30 | string = 'Today is 04/04/2011.' 31 | ws = WordCountAnalyzer::Date.new 32 | expect(ws.includes_date?(string)).to eq(true) 33 | end 34 | 35 | it 'returns true if the string includes a date #006' do 36 | string = 'Today is 04.04.2011.' 37 | ws = WordCountAnalyzer::Date.new 38 | expect(ws.includes_date?(string)).to eq(true) 39 | end 40 | 41 | it 'returns true if the string includes a date #007' do 42 | string = 'Today is 2011.04.04.' 43 | ws = WordCountAnalyzer::Date.new 44 | expect(ws.includes_date?(string)).to eq(true) 45 | end 46 | 47 | it 'returns true if the string includes a date #008' do 48 | string = 'Today is 2011/04/04.' 49 | ws = WordCountAnalyzer::Date.new 50 | expect(ws.includes_date?(string)).to eq(true) 51 | end 52 | 53 | it 'returns true if the string includes a date #009' do 54 | string = 'Today is 2011-04-04.' 55 | ws = WordCountAnalyzer::Date.new 56 | expect(ws.includes_date?(string)).to eq(true) 57 | end 58 | 59 | it 'returns true if the string includes a date #010' do 60 | string = 'Today is 04-04-2011.' 61 | ws = WordCountAnalyzer::Date.new 62 | expect(ws.includes_date?(string)).to eq(true) 63 | end 64 | 65 | it 'returns true if the string includes a date #011' do 66 | string = 'Today is 2003 November 9.' 67 | ws = WordCountAnalyzer::Date.new 68 | expect(ws.includes_date?(string)).to eq(true) 69 | end 70 | 71 | it 'returns true if the string includes a date #012' do 72 | string = 'Today is 2003Nov9.' 73 | ws = WordCountAnalyzer::Date.new 74 | expect(ws.includes_date?(string)).to eq(true) 75 | end 76 | 77 | it 'returns true if the string includes a date #013' do 78 | string = 'Today is 2003Nov09.' 79 | ws = WordCountAnalyzer::Date.new 80 | expect(ws.includes_date?(string)).to eq(true) 81 | end 82 | 83 | it 'returns true if the string includes a date #014' do 84 | string = 'Today is 2003-Nov-9.' 85 | ws = WordCountAnalyzer::Date.new 86 | expect(ws.includes_date?(string)).to eq(true) 87 | end 88 | 89 | it 'returns true if the string includes a date #015' do 90 | string = 'Today is 2003-Nov-09.' 91 | ws = WordCountAnalyzer::Date.new 92 | expect(ws.includes_date?(string)).to eq(true) 93 | end 94 | 95 | it 'returns true if the string includes a date #016' do 96 | string = 'Today is 2003-Nov-9, Sunday.' 97 | ws = WordCountAnalyzer::Date.new 98 | expect(ws.includes_date?(string)).to eq(true) 99 | end 100 | 101 | it 'returns true if the string includes a date #017' do 102 | string = 'Today is 2003. november 9.' 103 | ws = WordCountAnalyzer::Date.new 104 | expect(ws.includes_date?(string)).to eq(true) 105 | end 106 | 107 | it 'returns true if the string includes a date #018' do 108 | string = 'Today is 2003.11.9.' 109 | ws = WordCountAnalyzer::Date.new 110 | expect(ws.includes_date?(string)).to eq(true) 111 | end 112 | 113 | it 'returns true if the string includes a date #019' do 114 | string = 'Today is Monday, Apr. 4, 2011.' 115 | ws = WordCountAnalyzer::Date.new 116 | expect(ws.includes_date?(string)).to eq(true) 117 | end 118 | 119 | it 'returns true if the string includes a date #020' do 120 | string = 'Today is 2003/11/09.' 121 | ws = WordCountAnalyzer::Date.new 122 | expect(ws.includes_date?(string)).to eq(true) 123 | end 124 | 125 | it 'returns true if the string includes a date #021' do 126 | string = 'Today is 20030109.' 127 | ws = WordCountAnalyzer::Date.new 128 | expect(ws.includes_date?(string)).to eq(true) 129 | end 130 | 131 | it 'returns true if the string includes a date #022' do 132 | string = 'Today is 01092003.' 133 | ws = WordCountAnalyzer::Date.new 134 | expect(ws.includes_date?(string)).to eq(true) 135 | end 136 | 137 | it 'returns true if the string includes a date #023' do 138 | string = 'Today is Sunday, November 9, 2014.' 139 | ws = WordCountAnalyzer::Date.new 140 | expect(ws.includes_date?(string)).to eq(true) 141 | end 142 | 143 | it 'returns true if the string includes a date #024' do 144 | string = 'Today is November 9, 2014.' 145 | ws = WordCountAnalyzer::Date.new 146 | expect(ws.includes_date?(string)).to eq(true) 147 | end 148 | 149 | it 'returns true if the string includes a date #025' do 150 | string = 'Today is Nov. 9, 2014.' 151 | ws = WordCountAnalyzer::Date.new 152 | expect(ws.includes_date?(string)).to eq(true) 153 | end 154 | 155 | it 'returns true if the string includes a date #026' do 156 | string = 'Today is july 1st.' 157 | ws = WordCountAnalyzer::Date.new 158 | expect(ws.includes_date?(string)).to eq(true) 159 | end 160 | 161 | it 'returns true if the string includes a date #027' do 162 | string = 'Today is jul. 1st.' 163 | ws = WordCountAnalyzer::Date.new 164 | expect(ws.includes_date?(string)).to eq(true) 165 | end 166 | 167 | it 'returns true if the string includes a date #028' do 168 | string = 'Today is 8 November 2014.' 169 | ws = WordCountAnalyzer::Date.new 170 | expect(ws.includes_date?(string)).to eq(true) 171 | end 172 | 173 | it 'returns true if the string includes a date #029' do 174 | string = 'Today is 8. November 2014.' 175 | ws = WordCountAnalyzer::Date.new 176 | expect(ws.includes_date?(string)).to eq(true) 177 | end 178 | 179 | it 'returns true if the string includes a date #030' do 180 | string = 'Today is 08-Nov-2014.' 181 | ws = WordCountAnalyzer::Date.new 182 | expect(ws.includes_date?(string)).to eq(true) 183 | end 184 | 185 | it 'returns true if the string includes a date #031' do 186 | string = 'Today is 08Nov14.' 187 | ws = WordCountAnalyzer::Date.new 188 | expect(ws.includes_date?(string)).to eq(true) 189 | end 190 | 191 | it 'returns true if the string includes a date #032' do 192 | string = 'Today is 8th November 2014.' 193 | ws = WordCountAnalyzer::Date.new 194 | expect(ws.includes_date?(string)).to eq(true) 195 | end 196 | 197 | it 'returns true if the string includes a date #033' do 198 | string = 'Today is the 8th of November 2014.' 199 | ws = WordCountAnalyzer::Date.new 200 | expect(ws.includes_date?(string)).to eq(true) 201 | end 202 | 203 | it 'returns true if the string includes a date #034' do 204 | string = 'Today is 08/Nov/2014.' 205 | ws = WordCountAnalyzer::Date.new 206 | expect(ws.includes_date?(string)).to eq(true) 207 | end 208 | 209 | it 'returns true if the string includes a date #035' do 210 | string = 'Today is Sunday, 8 November 2014.' 211 | ws = WordCountAnalyzer::Date.new 212 | expect(ws.includes_date?(string)).to eq(true) 213 | end 214 | 215 | it 'returns true if the string includes a date #036' do 216 | string = 'Today is 8 November 2014.' 217 | ws = WordCountAnalyzer::Date.new 218 | expect(ws.includes_date?(string)).to eq(true) 219 | end 220 | 221 | it 'returns false if the string does not include a date #037' do 222 | string = 'Hello world. There is no date here - $50,000. The sun is hot.' 223 | ws = WordCountAnalyzer::Date.new 224 | expect(ws.includes_date?(string)).to eq(false) 225 | end 226 | end 227 | 228 | context '#occurrences' do 229 | it 'counts the date occurrences in a string #001' do 230 | string = 'Today is Sunday, 8 November 2014.' 231 | ws = WordCountAnalyzer::Date.new 232 | expect(ws.occurrences(string)).to eq(1) 233 | end 234 | 235 | it 'counts the date occurrences in a string #002' do 236 | string = 'Today is Sunday, 8 November 2014. Yesterday was 07/Nov/2014.' 237 | ws = WordCountAnalyzer::Date.new 238 | expect(ws.occurrences(string)).to eq(2) 239 | end 240 | end 241 | 242 | context '#replace' do 243 | it 'replaces the date occurrences in a string #001' do 244 | string = 'Today is Tues. March 3rd, 2011.' 245 | ws = WordCountAnalyzer::Date.new 246 | expect(ws.replace(string)).to eq('Today is wsdateword ') 247 | end 248 | 249 | it 'replaces the date occurrences in a string #002' do 250 | string = 'The scavenger hunt ends on Dec. 31st, 2011.' 251 | ws = WordCountAnalyzer::Date.new 252 | expect(ws.replace(string)).to eq('The scavenger hunt ends on wsdateword ') 253 | end 254 | end 255 | 256 | context '#replace_number_only_date' do 257 | it 'replaces only the number date occurrences in a string' do 258 | string = 'Today is Tues. March 3rd, 2011. 4/28/2013' 259 | ws = WordCountAnalyzer::Date.new 260 | expect(ws.replace_number_only_date(string)).to eq("Today is Tues. March 3rd, 2011. wsdateword ") 261 | end 262 | end 263 | end 264 | -------------------------------------------------------------------------------- /lib/word_count_analyzer/counter.rb: -------------------------------------------------------------------------------- 1 | module WordCountAnalyzer 2 | class Counter 3 | attr_reader :ellipsis, :hyperlink, :contraction, :hyphenated_word, :date, :number, :numbered_list, :xhtml, :forward_slash, :backslash, :dotted_line, :dashed_line, :underscore, :stray_punctuation, :equal_sign 4 | def initialize(**args) 5 | @ellipsis = args[:ellipsis] || 'ignore' 6 | @hyperlink = args[:hyperlink] || 'count_as_one' 7 | @contraction = args[:contraction] || 'count_as_one' 8 | @hyphenated_word = args[:hyphenated_word] || 'count_as_one' 9 | @date = args[:date] || 'no_special_treatment' 10 | @number = args[:number] || 'count' 11 | @numbered_list = args[:numbered_list] || 'count' 12 | @xhtml = args[:xhtml] || 'remove' 13 | @forward_slash = args[:forward_slash] || 'count_as_multiple_except_dates' 14 | @backslash = args[:backslash] || 'count_as_one' 15 | @dotted_line = args[:dotted_line] || 'ignore' 16 | @dashed_line = args[:dashed_line] || 'ignore' 17 | @underscore = args[:underscore] || 'ignore' 18 | @stray_punctuation = args[:stray_punctuation] || 'ignore' 19 | @equal_sign = 'ignore' 20 | @tgr = EngTagger.new 21 | end 22 | 23 | def count(text) 24 | word_count(text) 25 | end 26 | 27 | def pages_count(text) 28 | @ellipsis = 'ignore' 29 | @hyperlink = 'split_at_period' 30 | @contraction = 'count_as_one' 31 | @hyphenated_word = 'count_as_multiple' 32 | @date = 'no_special_treatment' 33 | @number = 'count' 34 | @numbered_list = 'ignore' 35 | @xhtml = 'keep' 36 | @forward_slash = 'count_as_multiple' 37 | @backslash = 'count_as_multiple' 38 | @dotted_line = 'ignore' 39 | @dashed_line = 'ignore' 40 | @underscore = 'ignore' 41 | @stray_punctuation = 'ignore' 42 | @equal_sign = 'break' 43 | word_count(text) 44 | end 45 | 46 | def mword_count(text) 47 | @ellipsis = 'no_special_treatment' 48 | @hyperlink = 'count_as_one' 49 | @contraction = 'count_as_one' 50 | @hyphenated_word = 'count_as_one' 51 | @date = 'no_special_treatment' 52 | @number = 'count' 53 | @numbered_list = 'count' 54 | @xhtml = 'keep' 55 | @forward_slash = 'count_as_one' 56 | @backslash = 'count_as_one' 57 | @dotted_line = 'count' 58 | @dashed_line = 'count' 59 | @underscore = 'count' 60 | @stray_punctuation = 'count' 61 | word_count(text) 62 | end 63 | 64 | private 65 | 66 | def word_count(text) 67 | processed_text = process_ellipsis(text) 68 | processed_text = process_hyperlink(processed_text) 69 | processed_text = process_contraction(processed_text, @tgr) 70 | processed_text = process_date(processed_text) 71 | processed_text = process_number_list(processed_text) 72 | processed_text = process_number(processed_text) 73 | processed_text = process_xhtml(processed_text) 74 | processed_text = process_forward_slash(processed_text) 75 | processed_text = process_backslash(processed_text) 76 | processed_text = process_hyphenated_word(processed_text) 77 | processed_text = process_dotted_line(processed_text) 78 | processed_text = process_dashed_line(processed_text) 79 | processed_text = process_underscore(processed_text) 80 | processed_text = process_stray_punctuation(processed_text) 81 | processed_text = process_equal_sign(processed_text) if @equal_sign.eql?('break') 82 | processed_text.split(/\s+/).reject(&:empty?).size 83 | end 84 | 85 | def process_ellipsis(txt) 86 | if ellipsis.eql?('ignore') 87 | WordCountAnalyzer::Ellipsis.new.replace(txt).gsub(/wseword/, '') 88 | elsif ellipsis.eql?('no_special_treatment') 89 | txt 90 | else 91 | raise 'The value you specified for ellipsis is not a valid option. Please use either `ignore` or `no_special_treatment`. The default option is `ignore`' 92 | end 93 | end 94 | 95 | def process_hyperlink(txt) 96 | case 97 | when hyperlink.eql?('count_as_one') 98 | WordCountAnalyzer::Hyperlink.new.replace(txt) 99 | when hyperlink.eql?('split_at_period') 100 | WordCountAnalyzer::Hyperlink.new.replace_split_at_period(txt) 101 | when hyperlink.eql?('no_special_treatment') 102 | txt 103 | else 104 | raise 'The value you specified for hyperlink is not a valid option. Please use either `count_as_one`, `split_at_period`, or `no_special_treatment`. The default option is `count_as_one`' 105 | end 106 | end 107 | 108 | def process_contraction(txt, tgr) 109 | if contraction.eql?('count_as_one') 110 | txt 111 | elsif contraction.eql?('count_as_multiple') 112 | array = txt.split(/\s+/) 113 | array.each_with_index.map { |token, i| WordCountAnalyzer::Contraction.new(token: token, following_token: array[i +1], tgr: tgr).replace }.join(' ') 114 | else 115 | raise 'The value you specified for contraction is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`' 116 | end 117 | end 118 | 119 | def process_hyphenated_word(txt) 120 | if hyphenated_word.eql?('count_as_one') 121 | txt 122 | elsif hyphenated_word.eql?('count_as_multiple') 123 | txt.split(/\s+/).each_with_index.map { |token, i| WordCountAnalyzer::HyphenatedWord.new(token: token).replace }.join(' ') 124 | else 125 | raise 'The value you specified for hyphenated_word is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`' 126 | end 127 | end 128 | 129 | def process_date(txt) 130 | if date.eql?('no_special_treatment') 131 | txt 132 | elsif date.eql?('count_as_one') 133 | WordCountAnalyzer::Date.new.replace(txt) 134 | else 135 | raise 'The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`' 136 | end 137 | end 138 | 139 | def process_number(txt) 140 | if number.eql?('ignore') 141 | WordCountAnalyzer::Number.new(string: txt).replace.gsub(/wsnumword/, '') 142 | elsif number.eql?('count') 143 | txt 144 | else 145 | raise 'The value you specified for number is not a valid option. Please use either `ignore` or `count`. The default option is `count`' 146 | end 147 | end 148 | 149 | def process_number_list(txt) 150 | if numbered_list.eql?('ignore') 151 | WordCountAnalyzer::NumberedList.new(string: txt).replace 152 | elsif numbered_list.eql?('count') 153 | txt 154 | else 155 | raise 'The value you specified for numbered_list is not a valid option. Please use either `ignore` or `count`. The default option is `count`' 156 | end 157 | end 158 | 159 | def process_xhtml(txt) 160 | if xhtml.eql?('remove') 161 | WordCountAnalyzer::Xhtml.new(string: txt).replace 162 | elsif xhtml.eql?('keep') 163 | txt 164 | else 165 | raise 'The value you specified for xhtml is not a valid option. Please use either `remove` or `keep`. The default option is `remove`' 166 | end 167 | end 168 | 169 | def process_forward_slash(txt) 170 | case 171 | when forward_slash.eql?('count_as_multiple') 172 | WordCountAnalyzer::Slash.new(string: txt, date: date, xhtml: xhtml, hyperlink: hyperlink).replace_forward_slashes 173 | when forward_slash.eql?('count_as_multiple_except_dates') 174 | WordCountAnalyzer::Slash.new(string: txt, date: date, xhtml: xhtml, hyperlink: hyperlink).replace_forward_slashes_except_dates 175 | when forward_slash.eql?('count_as_one') 176 | txt 177 | else 178 | raise 'The value you specified for forward_slash is not a valid option. Please use either `count_as_one`, `count_as_multiple` or `count_as_multiple_except_dates`. The default option is `count_as_multiple_except_dates`' 179 | end 180 | end 181 | 182 | def process_backslash(txt) 183 | if backslash.eql?('count_as_multiple') 184 | WordCountAnalyzer::Slash.new(string: txt, date: date, xhtml: xhtml, hyperlink: hyperlink).replace_backslashes 185 | elsif backslash.eql?('count_as_one') 186 | txt 187 | else 188 | raise 'The value you specified for backslash is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`' 189 | end 190 | end 191 | 192 | def process_dotted_line(txt) 193 | if dotted_line.eql?('ignore') 194 | WordCountAnalyzer::Punctuation.new(string: txt).replace_dotted_line 195 | elsif dotted_line.eql?('count') 196 | txt 197 | else 198 | raise 'The value you specified for dotted_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`' 199 | end 200 | end 201 | 202 | def process_dashed_line(txt) 203 | if dashed_line.eql?('ignore') 204 | WordCountAnalyzer::Punctuation.new(string: txt).replace_dashed_line 205 | elsif dashed_line.eql?('count') 206 | txt 207 | else 208 | raise 'The value you specified for dashed_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`' 209 | end 210 | end 211 | 212 | def process_underscore(txt) 213 | if underscore.eql?('ignore') 214 | WordCountAnalyzer::Punctuation.new(string: txt).replace_underscore 215 | elsif underscore.eql?('count') 216 | txt 217 | else 218 | raise 'The value you specified for underscore is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`' 219 | end 220 | end 221 | 222 | def process_stray_punctuation(txt) 223 | if stray_punctuation.eql?('ignore') 224 | WordCountAnalyzer::Punctuation.new(string: txt).replace_stray_punctuation 225 | elsif stray_punctuation.eql?('count') 226 | txt 227 | else 228 | raise 'The value you specified for stray_punctuation is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`' 229 | end 230 | end 231 | 232 | def process_equal_sign(txt) 233 | txt.split('=').join(' ').split(/>(?=[a-zA-z]+)/).join(' ') 234 | end 235 | end 236 | end 237 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Word Count Analyzer 2 | 3 | [![Gem Version](https://badge.fury.io/rb/word_count_analyzer.svg)](http://badge.fury.io/rb/word_count_analyzer) [![Build Status](https://travis-ci.org/diasks2/word_count_analyzer.png)](https://travis-ci.org/diasks2/word_count_analyzer) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/diasks2/word_count_analyzer/blob/master/LICENSE.txt) 4 | 5 | See what word count [gray areas](#gray-area-details) might be affecting your word count. 6 | 7 | Word Count Analyzer is a Ruby gem that analyzes a string for potential areas of the text that might cause word count discrepancies depending on the tool used. It also provides comprehensive configuration options so you can easily customize how different gray areas should be counted and find the right word count for your purposes. 8 | 9 | If you prioritize speed over accuracy, then I recommend not using this gem. There are most definitely faster gems for getting a word count. However, if accuracy is important, and you want control over the gray areas that affect word count, then this gem is for you. 10 | 11 | ## Install 12 | 13 | **Ruby** 14 | *Supports Ruby 2.1.0 and above* 15 | ``` 16 | gem install word_count_analyzer 17 | ``` 18 | 19 | **Ruby on Rails** 20 | Add this line to your application’s Gemfile: 21 | ```ruby 22 | gem 'word_count_analyzer' 23 | ``` 24 | 25 | ## Live Demo 26 | 27 | Try out a [live demo](https://www.tm-town.com/word-count-analyzer) of Word Count Analyzer in the browser. 28 | 29 | ## Usage 30 | 31 | ### Analyze the word count gray areas of a string 32 | 33 | Common word count gray areas include (*[more details below](#gray-area-details)*): 34 | - Ellipses 35 | - Hyperlinks 36 | - Contractions 37 | - Hyphenated Words 38 | - Dates 39 | - Numbers 40 | - Numbered Lists 41 | - XML and HTML tags 42 | - Forward slashes and backslashes 43 | - Punctuation 44 | 45 | Other gray areas not covered by this gem: 46 | - Headers 47 | - Footers 48 | - Hidden Text (*specific to Microsoft Word*) 49 | 50 | ```ruby 51 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 4. Some HTML and a hyphenated-word. Don't count stray punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 52 | WordCountAnalyzer::Analyzer.new.analyze(text) 53 | 54 | # => { 55 | # "ellipsis": 1, 56 | # "hyperlink": 2, 57 | # "contraction": 4, 58 | # "hyphenated_word": 2, 59 | # "date": 2, 60 | # "number": 1, 61 | # "numbered_list": 3, 62 | # "xhtml": 1, 63 | # "forward_slash": 1, 64 | # "backslash": 1, 65 | # "dotted_line": 1, 66 | # "dashed_line": 1, 67 | # "underscore": 1, 68 | # "stray_punctuation": 5 69 | # } 70 | ``` 71 | 72 | ### Count the words in a string 73 | 74 | ```ruby 75 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list \n\n1. item a \n\n2. item b \n\n3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 76 | 77 | WordCountAnalyzer::Counter.new.count(text) 78 | # => 64 79 | 80 | # Overrides all settings to match the way Pages handles word count. 81 | # N.B. The developers of Pages may change the algorithm at any time so this should just be as an approximation. 82 | WordCountAnalyzer::Counter.new.pages_count(text) 83 | # => 76 (or 79 if the list items are not formatted as a list) 84 | 85 | # Overrides all settings to match the way Microsoft Word and wc (Unix) handle word count. 86 | # N.B. The developers of these tools may change the algorithm at any time so this should just be as an approximation. 87 | WordCountAnalyzer::Counter.new.mword_count(text) 88 | # => 71 89 | 90 | # Highly configurable (see all options below) 91 | WordCountAnalyzer::Counter.new( 92 | ellipsis: 'no_special_treatment', 93 | hyperlink: 'no_special_treatment', 94 | contraction: 'count_as_multiple', 95 | hyphenated_word: 'count_as_multiple', 96 | date: 'count_as_one', 97 | number: 'ignore', 98 | numbered_list: 'ignore', 99 | xhtml: 'keep', 100 | forward_slash: 'count_as_multiple', 101 | backslash: 'count_as_multiple', 102 | dotted_line: 'count', 103 | dashed_line: 'count', 104 | underscore: 'count', 105 | stray_punctuation: 'count' 106 | ).count(text) 107 | 108 | # => 77 109 | ``` 110 | 111 | #### Counter `options` 112 | 113 | ##### `ellipsis` 114 | **default** = `'ignore'` 115 | - `'ignore'` 116 | Ignores all ellipses in the word count total. 117 | - `'no_special_treatment'` 118 | Ellipses will not be searched for in the string. 119 | 120 |
121 | 122 | ##### `hyperlink` 123 | **default** = `'count_as_one'` 124 | - `'count_as_one'` 125 | Counts a hyperlink as one word. 126 | - `'no_special_treatment'` 127 | Hyperlinks will not be searched for in the string. Therefore, how a hyperlink is handled in the word count will depend on other settings (mainly slashes). 128 | - `'split_at_period'` 129 | Pages will split hyperlinks at a period and count each token as a separate word. 130 | 131 |
132 | 133 | ##### `contraction` 134 | **default** = `'count_as_one'` 135 | - `'count_as_one'` 136 | Counts a contraction as one word. 137 | - `'count_as_multiple'` 138 | Splits a contraction into the words that make it up. Examples: 139 | - `don't` => `do not` (2 words) 140 | - `o'clock` => `of the clock` (3 words) 141 | 142 |
143 | 144 | ##### `hyphenated_word` 145 | **default** = `'count_as_one'` 146 | - `'count_as_one'` 147 | Counts a hyphenated word as one word. 148 | - `'count_as_multiple'` 149 | Breaks a hyphenated word at each hyphen and counts each word separately. Example: 150 | - `devil-may-care` (3 words) 151 | 152 |
153 | 154 | ##### `date` 155 | **default** = `'no_special_treatment'` 156 | - `'no_special_treatment'` 157 | Dates will not be searched for in the string. Therefore, how a date is handled in the word count will depend on other settings. 158 | - `'count_as_one'` 159 | Counts a date as one word. This is more commonly seen in translation CAT tools where a date is thought of as a *placeable* that can usually be automatically translated. Examples: 160 | - Monday, April 4th, 2011 (1 word) 161 | - April 4th, 2011 (1 word) 162 | - 04/04/2011 (1 word) 163 | - 04.04.2011 (1 word) 164 | - 2011/04/04 (1 word) 165 | - 2011-04-04 (1 word) 166 | - 2003Nov9 (1 word) 167 | - 2003 November 9 (1 word) 168 | - 2003-Nov-9 (1 word) 169 | - and others... 170 | 171 |
172 | 173 | ##### `number` 174 | **default** = `'count'` 175 | - `'count'` 176 | Counts a number as one word. 177 | - `'ignore'` 178 | Ignores any numbers in the string (with the exception of `dates` and `numbered_lists`) and does not count them towards the word count. 179 | 180 |
181 | 182 | ##### `numbered_list` 183 | **default** = `'count'` 184 | - `'count'` 185 | Counts a number in a numbered list as one word. 186 | - `'ignore'` 187 | Ignores any numbers that are part of a numbered list and does not count them towards the word count. 188 | 189 |
190 | 191 | ##### `xhtml` 192 | **default** = `'remove'` 193 | - `'remove'` 194 | Removes any XML or HTML opening and closing tags from the string. 195 | - `'keep'` 196 | Ignores any XML or HTML in the string. 197 | 198 |
199 | 200 | ##### `forward_slash` 201 | **default** = `'count_as_multiple_except_dates'` 202 | - `'count_as_multiple_except_dates'` 203 | Separates any tokens that include a forward slash (except dates) at the slash(s) and counts each token individually. Example: 204 | - she/he/it 4/25/2014 (4 words) 205 | - `'count_as_multiple'` 206 | Separates any tokens that include a forward slash at the slash(s) and counts each token individually. Whether dates, hyperlinks and xhtml are included depends on what is set for those options. Example: 207 | - she/he/it (3 words) 208 | - `'count_as_one'` 209 | Counts any tokens that include a forward slash as one word. Example: 210 | - she/he/it (1 word) 211 | 212 |
213 | 214 | ##### `backslash` 215 | **default** = `'count_as_one'` 216 | - `'count_as_one'` 217 | Counts any tokens that include a backslash as one word. Example: 218 | - c:\Users\johndoe (1 word) 219 | - `'count_as_multiple'` 220 | Separates any tokens that include a backslash at the slash(s) and counts each token individually. Example: 221 | - c:\Users\johndoe (3 words) 222 | 223 |
224 | 225 | ##### `dotted_line` 226 | **default** = `'ignore'` 227 | - `'ignore'` 228 | Ignores any dotted lines in the string and does not count them towards the word count. 229 | - `'count'` 230 | Counts a dotted line as one word. 231 | 232 |
233 | 234 | ##### `dashed_line` 235 | **default** = `'ignore'` 236 | - `'ignore'` 237 | Ignores any dashed lines in the string and does not count them towards the word count. 238 | - `'count'` 239 | Counts a dashed line as one word. 240 | 241 |
242 | 243 | ##### `underscore` 244 | **default** = `'ignore'` 245 | - `'ignore'` 246 | Ignores any series of underscores in the string and does not count them towards the word count. 247 | - `'count'` 248 | Counts a series of underscores as one word. 249 | 250 |
251 | 252 | ##### `stray_punctuation` 253 | **default** = `'ignore'` 254 | - `'ignore'` 255 | Ignores any punctuation marks surrounded on both sides by a whitespace in the string and does not count them towards the word count. 256 | - `'count'` 257 | Counts a punctuation mark surrounded on both sides by a whitespace as one word. 258 | 259 | ### Gray Area Details 260 | 261 | #### Ellipsis 262 | 263 | Checks for any occurrences of ellipses in your text. Writers tend to use different formats for ellipsis, and although there are [style guides](http://www.thepunctuationguide.com/ellipses.html), it is rare that these rules are followed. 264 | 265 | ##### Three Consecutive Periods 266 | ``` 267 | ... 268 | ``` 269 | Tool | Word Count 270 | -------------- | ---------- 271 | Microsoft Word | 1 272 | Pages | 0 273 | wc (Unix) | 1 274 | 275 | ##### Four Consecutive Periods 276 | ``` 277 | .... 278 | ``` 279 | Tool | Word Count 280 | -------------- | ---------- 281 | Microsoft Word | 1 282 | Pages | 0 283 | wc (Unix) | 1 284 | 285 | ##### Three Periods With Spaces 286 | ``` 287 | . . . 288 | ``` 289 | Tool | Word Count 290 | -------------- | ---------- 291 | Microsoft Word | 3 292 | Pages | 0 293 | wc (Unix) | 3 294 | 295 | ##### Four Periods With Spaces 296 | ``` 297 | . . . . 298 | ``` 299 | Tool | Word Count 300 | -------------- | ---------- 301 | Microsoft Word | 4 302 | Pages | 0 303 | wc (Unix) | 4 304 | 305 | ##### Horizontal Ellipsis 306 | ``` 307 | … 308 | ``` 309 | Tool | Word Count 310 | -------------- | ---------- 311 | Microsoft Word | 1 312 | Pages | 0 313 | wc (Unix) | 1 314 | 315 | #### Hyperlink 316 | 317 | ``` 318 | http://www.example.com 319 | ``` 320 | Tool | Word Count 321 | -------------- | ---------- 322 | Microsoft Word | 1 323 | Pages | 4 324 | wc (Unix) | 1 325 | 326 | #### Contraction 327 | 328 | Most tools count contractions as one word. [Some might argue](http://english.stackexchange.com/questions/80635/counting-contractions-as-one-or-two-words) a contraction is technically more than one word. 329 | 330 | ``` 331 | can't 332 | ``` 333 | Tool | Word Count 334 | -------------- | ---------- 335 | Microsoft Word | 1 336 | Pages | 1 337 | wc (Unix) | 1 338 | 339 | #### Hyphenated Word 340 | 341 | ``` 342 | devil-may-care 343 | ``` 344 | Tool | Word Count 345 | -------------- | ---------- 346 | Microsoft Word | 1 347 | Pages | 3 348 | wc (Unix) | 1 349 | 350 | #### Date 351 | 352 | Most word processing tools do not do recognize dates, but translation CAT tools tend to recognize dates as one word or [placeable](http://www.wordfast.net/wiki/Placeables). This gem checks for many date formats including those that include day or month abbreviations. A few examples are listed below (*not an exhaustive list*). 353 | 354 | ##### Date (example A) 355 | ``` 356 | Monday, April 4th, 2011 357 | ``` 358 | Tool | Word Count 359 | -------------- | ---------- 360 | Microsoft Word | 4 361 | Pages | 4 362 | wc (Unix) | 4 363 | 364 | ##### Date (example B) 365 | ``` 366 | 04/04/2011 367 | ``` 368 | Tool | Word Count 369 | -------------- | ---------- 370 | Microsoft Word | 1 371 | Pages | 3 372 | wc (Unix) | 1 373 | 374 | ##### Date (example C) 375 | ``` 376 | 04.04.2011 377 | ``` 378 | Tool | Word Count 379 | -------------- | ---------- 380 | Microsoft Word | 1 381 | Pages | 1 382 | wc (Unix) | 1 383 | 384 | #### Number 385 | 386 | ##### Simple number 387 | ``` 388 | 200 389 | ``` 390 | Tool | Word Count 391 | -------------- | ---------- 392 | Microsoft Word | 1 393 | Pages | 1 394 | wc (Unix) | 1 395 | 396 | ##### Number with preceding unit 397 | ``` 398 | $200 399 | ``` 400 | Tool | Word Count 401 | -------------- | ---------- 402 | Microsoft Word | 1 403 | Pages | 1 404 | wc (Unix) | 1 405 | 406 | 407 | ##### Number with unit following 408 | ``` 409 | 50% 410 | ``` 411 | Tool | Word Count 412 | -------------- | ---------- 413 | Microsoft Word | 1 414 | Pages | 1 415 | wc (Unix) | 1 416 | 417 | #### Numbered List 418 | 419 | ``` 420 | 1. List item a 421 | 2. List item b 422 | 3. List item c 423 | ``` 424 | Tool | Word Count 425 | -------------- | ---------- 426 | Microsoft Word | 12 427 | Pages | 9 428 | wc (Unix) | 12 429 | 430 | #### XML and HTML Tags 431 | 432 | ```html 433 | Hello world Hello 434 | ``` 435 | Tool | Word Count 436 | -------------- | ---------- 437 | Microsoft Word | 4 438 | Pages | 12 439 | wc (Unix) | 4 440 | 441 | #### Slashes 442 | 443 | ##### Forward slash 444 | ``` 445 | she/he/it 446 | ``` 447 | Tool | Word Count 448 | -------------- | ---------- 449 | Microsoft Word | 1 450 | Pages | 3 451 | wc (Unix) | 1 452 | 453 | ##### Backslash 454 | ``` 455 | c:\Users\johndoe 456 | ``` 457 | Tool | Word Count 458 | -------------- | ---------- 459 | Microsoft Word | 1 460 | Pages | 3 461 | wc (Unix) | 1 462 | 463 | #### Punctuation 464 | 465 | ##### Dotted line 466 | ``` 467 | ......... 468 | ``` 469 | Tool | Word Count 470 | -------------- | ---------- 471 | Microsoft Word | 1 472 | Pages | 0 473 | wc (Unix) | 1 474 | 475 | ``` 476 | ……………………… 477 | ``` 478 | Tool | Word Count 479 | -------------- | ---------- 480 | Microsoft Word | 1 481 | Pages | 0 482 | wc (Unix) | 1 483 | 484 | ##### Dashed line 485 | ``` 486 | ----------- 487 | ``` 488 | Tool | Word Count 489 | -------------- | ---------- 490 | Microsoft Word | 1 491 | Pages | 0 492 | wc (Unix) | 1 493 | 494 | ##### Underscore 495 | ``` 496 | ____________ 497 | ``` 498 | Tool | Word Count 499 | -------------- | ---------- 500 | Microsoft Word | 1 501 | Pages | 0 502 | wc (Unix) | 1 503 | 504 | ##### Punctuation mark surrounded by spaces 505 | ``` 506 | : 507 | ``` 508 | Tool | Word Count 509 | -------------- | ---------- 510 | Microsoft Word | 1 511 | Pages | 0 512 | wc (Unix) | 1 513 | 514 | ## Research 515 | 516 | - *[So how many words do you think it is?](http://multifarious.filkin.com/2012/11/13/wordcount)* - Paul Filkin 517 | - [Word Count](http://en.wikipedia.org/wiki/Word_count) - Wikipedia 518 | - [Words Counted Ruby Gem](https://github.com/abitdodgy/words_counted) - Mohamad El-Husseini 519 | 520 | ## TODO 521 | 522 | - Add language support for languages other than English 523 | - For most languages this is probably as simple as adding in the translations and abbreviations for months and days. 524 | - For languages that use a character count (Japanese, Chinese) there will be larger changes. For these languages need to add an option for how to handle Roman words within the text. 525 | - Improve performace for longer strings (potentially break string into smaller parts and then sum total of each) 526 | 527 | ## Contributing 528 | 529 | 1. Fork it ( https://github.com/diasks2/word_count_analyzer/fork ) 530 | 2. Create your feature branch (`git checkout -b my-new-feature`) 531 | 3. Commit your changes (`git commit -am 'Add some feature'`) 532 | 4. Push to the branch (`git push origin my-new-feature`) 533 | 5. Create a new Pull Request 534 | 535 | ## License 536 | 537 | The MIT License (MIT) 538 | 539 | Copyright (c) 2015 Kevin S. Dias 540 | 541 | Permission is hereby granted, free of charge, to any person obtaining a copy 542 | of this software and associated documentation files (the "Software"), to deal 543 | in the Software without restriction, including without limitation the rights 544 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 545 | copies of the Software, and to permit persons to whom the Software is 546 | furnished to do so, subject to the following conditions: 547 | 548 | The above copyright notice and this permission notice shall be included in 549 | all copies or substantial portions of the Software. 550 | 551 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 552 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 553 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 554 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 555 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 556 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 557 | THE SOFTWARE. 558 | -------------------------------------------------------------------------------- /spec/word_count_analyzer/counter_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WordCountAnalyzer::Counter do 4 | context 'ellipsis' do 5 | it 'handles an invalid ellipsis argument value' do 6 | text = 'hello world.' 7 | ws = WordCountAnalyzer::Counter.new(ellipsis: 'hello') 8 | expect { ws.count(text) }.to raise_error('The value you specified for ellipsis is not a valid option. Please use either `ignore` or `no_special_treatment`. The default option is `ignore`') 9 | end 10 | 11 | it 'ignores ellipses in the word count' do 12 | text = 'hello world ... what day is it.' 13 | ws = WordCountAnalyzer::Counter.new( 14 | ellipsis: 'ignore' 15 | ) 16 | expect(ws.count(text)).to eq(6) 17 | end 18 | 19 | it 'does not ignore ellipses in the word count' do 20 | text = 'hello world ... what day is it.' 21 | ws = WordCountAnalyzer::Counter.new( 22 | ellipsis: 'no_special_treatment' 23 | ) 24 | expect(ws.count(text)).to eq(7) 25 | end 26 | 27 | it 'does not ignore ellipses in the word count' do 28 | text = 'hello world... what day is it.' 29 | ws = WordCountAnalyzer::Counter.new( 30 | ellipsis: 'no_special_treatment' 31 | ) 32 | expect(ws.count(text)).to eq(6) 33 | end 34 | 35 | it 'sets ignore as the default option' do 36 | text = 'hello world ... what day is it.' 37 | ws = WordCountAnalyzer::Counter.new 38 | expect(ws.count(text)).to eq(6) 39 | end 40 | end 41 | 42 | context 'hyperlink' do 43 | it 'handles an invalid hyperlink argument value' do 44 | text = 'hello world.' 45 | ws = WordCountAnalyzer::Counter.new(hyperlink: 'hello') 46 | expect { ws.count(text) }.to raise_error('The value you specified for hyperlink is not a valid option. Please use either `count_as_one`, `split_at_period`, or `no_special_treatment`. The default option is `count_as_one`') 47 | end 48 | 49 | it 'counts a hyperlink as one word in the word count' do 50 | text = 'The site address is http://www.example.com she said.' 51 | ws = WordCountAnalyzer::Counter.new( 52 | hyperlink: 'count_as_one' 53 | ) 54 | expect(ws.count(text)).to eq(7) 55 | end 56 | 57 | it 'counts a hyperlink as one word in the word count' do 58 | text = 'The site address is http://www.example.com she said.' 59 | ws = WordCountAnalyzer::Counter.new( 60 | hyperlink: 'split_at_period', 61 | forward_slash: 'count_as_one' 62 | ) 63 | expect(ws.count(text)).to eq(9) 64 | end 65 | 66 | it 'does not search for hyperlinks' do 67 | text = 'The site address is http://www.example.com she said.' 68 | ws = WordCountAnalyzer::Counter.new( 69 | hyperlink: 'no_special_treatment' 70 | ) 71 | expect(ws.count(text)).to eq(8) 72 | end 73 | 74 | it 'sets count_as_one as the default option' do 75 | text = 'The site address is http://www.example.com she said.' 76 | ws = WordCountAnalyzer::Counter.new 77 | expect(ws.count(text)).to eq(7) 78 | end 79 | end 80 | 81 | context 'contraction' do 82 | it 'handles an invalid contraction argument value' do 83 | text = 'hello world.' 84 | ws = WordCountAnalyzer::Counter.new(contraction: 'hello') 85 | expect { ws.count(text) }.to raise_error('The value you specified for contraction is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`') 86 | end 87 | 88 | it 'counts a contraction as one word in the word count' do 89 | text = "Don't do that." 90 | ws = WordCountAnalyzer::Counter.new( 91 | contraction: 'count_as_one' 92 | ) 93 | expect(ws.count(text)).to eq(3) 94 | end 95 | 96 | it 'splits a contraction into its separate words for the word count' do 97 | text = "Don't do that." 98 | ws = WordCountAnalyzer::Counter.new( 99 | contraction: 'count_as_multiple' 100 | ) 101 | expect(ws.count(text)).to eq(4) 102 | end 103 | 104 | it 'sets count_as_one as the default option' do 105 | text = "Don't do that." 106 | ws = WordCountAnalyzer::Counter.new 107 | expect(ws.count(text)).to eq(3) 108 | end 109 | end 110 | 111 | context 'hyphenated_word' do 112 | it 'handles an invalid hyphenated_word argument value' do 113 | text = 'hello world.' 114 | ws = WordCountAnalyzer::Counter.new(hyphenated_word: 'hello') 115 | expect { ws.count(text) }.to raise_error('The value you specified for hyphenated_word is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`') 116 | end 117 | 118 | it 'counts a hyphenated word as one word in the word count' do 119 | text = 'He has a devil-may-care attitude.' 120 | ws = WordCountAnalyzer::Counter.new( 121 | hyphenated_word: 'count_as_one' 122 | ) 123 | expect(ws.count(text)).to eq(5) 124 | end 125 | 126 | it 'splits a hyphenated word into its separate words for the word count' do 127 | text = 'He has a devil-may-care attitude.' 128 | ws = WordCountAnalyzer::Counter.new( 129 | hyphenated_word: 'count_as_multiple' 130 | ) 131 | expect(ws.count(text)).to eq(7) 132 | end 133 | 134 | it 'sets count_as_one as the default option' do 135 | text = 'He has a devil-may-care attitude.' 136 | ws = WordCountAnalyzer::Counter.new 137 | expect(ws.count(text)).to eq(5) 138 | end 139 | end 140 | 141 | context 'date' do 142 | it 'handles an invalid date argument value' do 143 | text = 'hello world.' 144 | ws = WordCountAnalyzer::Counter.new(date: 'hello') 145 | expect { ws.count(text) }.to raise_error('The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`') 146 | end 147 | 148 | it 'ignores date placeables' do 149 | text = 'Today is Tues. March 3rd, 2011.' 150 | ws = WordCountAnalyzer::Counter.new( 151 | date: 'no_special_treatment' 152 | ) 153 | expect(ws.count(text)).to eq(6) 154 | end 155 | 156 | it 'counts a date placeable as one word in the word count' do 157 | text = 'Today is Tues. March 3rd, 2011.' 158 | ws = WordCountAnalyzer::Counter.new( 159 | date: 'count_as_one' 160 | ) 161 | expect(ws.count(text)).to eq(3) 162 | end 163 | 164 | it 'sets count_as_one as the default option' do 165 | text = 'Today is Tues. March 3rd, 2011.' 166 | ws = WordCountAnalyzer::Counter.new 167 | expect(ws.count(text)).to eq(6) 168 | end 169 | end 170 | 171 | context 'number' do 172 | it 'handles an invalid number argument value' do 173 | text = 'hello world.' 174 | ws = WordCountAnalyzer::Counter.new(number: 'hello') 175 | expect { ws.count(text) }.to raise_error('The value you specified for number is not a valid option. Please use either `ignore` or `count`. The default option is `count`') 176 | end 177 | 178 | it 'counts a number as a word' do 179 | text = 'There is $300 in the safe. The password is 1234.' 180 | ws = WordCountAnalyzer::Counter.new( 181 | number: 'count' 182 | ) 183 | expect(ws.count(text)).to eq(10) 184 | end 185 | 186 | it 'ignores numbers in the word count' do 187 | text = 'There is $300 in the safe. The password is 1234.' 188 | ws = WordCountAnalyzer::Counter.new( 189 | number: 'ignore' 190 | ) 191 | expect(ws.count(text)).to eq(8) 192 | end 193 | 194 | it 'sets count as the default option' do 195 | text = 'There is $300 in the safe. The password is 1234.' 196 | ws = WordCountAnalyzer::Counter.new 197 | expect(ws.count(text)).to eq(10) 198 | end 199 | end 200 | 201 | context 'number_list' do 202 | it 'handles an invalid number argument value' do 203 | text = 'hello world.' 204 | ws = WordCountAnalyzer::Counter.new(numbered_list: 'hello') 205 | expect { ws.count(text) }.to raise_error('The value you specified for numbered_list is not a valid option. Please use either `ignore` or `count`. The default option is `count`') 206 | end 207 | 208 | it 'counts a numbered list number as a word' do 209 | text = "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c." 210 | ws = WordCountAnalyzer::Counter.new( 211 | numbered_list: 'count' 212 | ) 213 | expect(ws.count(text)).to eq(17) 214 | end 215 | 216 | it 'ignores numbered list numbers' do 217 | text = "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c." 218 | ws = WordCountAnalyzer::Counter.new( 219 | numbered_list: 'ignore' 220 | ) 221 | expect(ws.count(text)).to eq(14) 222 | end 223 | 224 | it 'sets count as the default option' do 225 | text = "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c." 226 | ws = WordCountAnalyzer::Counter.new 227 | expect(ws.count(text)).to eq(17) 228 | end 229 | end 230 | 231 | context 'xhtml' do 232 | it 'handles an invalid number argument value' do 233 | text = 'hello world.' 234 | ws = WordCountAnalyzer::Counter.new(xhtml: 'hello') 235 | expect { ws.count(text) }.to raise_error('The value you specified for xhtml is not a valid option. Please use either `remove` or `keep`. The default option is `remove`') 236 | end 237 | 238 | it 'removes all xhtml from the text' do 239 | text = "Hello world" 240 | ws = WordCountAnalyzer::Counter.new( 241 | xhtml: 'remove' 242 | ) 243 | expect(ws.count(text)).to eq(2) 244 | end 245 | 246 | it 'keeps xhtml in the text' do 247 | text = "Hello world" 248 | ws = WordCountAnalyzer::Counter.new( 249 | xhtml: 'keep', 250 | forward_slash: 'count_as_one' 251 | ) 252 | expect(ws.count(text)).to eq(3) 253 | end 254 | 255 | it 'keeps xhtml in the text' do 256 | text = "Hello world" 257 | ws = WordCountAnalyzer::Counter.new( 258 | xhtml: 'keep' 259 | ) 260 | expect(ws.count(text)).to eq(4) 261 | end 262 | 263 | it 'sets remove as the default option' do 264 | text = "Hello world" 265 | ws = WordCountAnalyzer::Counter.new 266 | expect(ws.count(text)).to eq(2) 267 | end 268 | end 269 | 270 | context 'forward_slash' do 271 | it 'handles an invalid number argument value' do 272 | text = 'hello world.' 273 | ws = WordCountAnalyzer::Counter.new(forward_slash: 'hello') 274 | expect { ws.count(text) }.to raise_error('The value you specified for forward_slash is not a valid option. Please use either `count_as_one`, `count_as_multiple` or `count_as_multiple_except_dates`. The default option is `count_as_multiple_except_dates`') 275 | end 276 | 277 | it 'counts a forward slash as multiple words (except dates) #001' do 278 | text = "She/he/it said hello. 4/22/2013." 279 | ws = WordCountAnalyzer::Counter.new( 280 | forward_slash: 'count_as_multiple_except_dates' 281 | ) 282 | expect(ws.count(text)).to eq(6) 283 | end 284 | 285 | it 'counts a forward slash as multiple words #002' do 286 | text = "She/he/it said hello. 4/22/2013." 287 | ws = WordCountAnalyzer::Counter.new( 288 | forward_slash: 'count_as_multiple' 289 | ) 290 | expect(ws.count(text)).to eq(8) 291 | end 292 | 293 | it 'counts a forward slash as multiple words #003' do 294 | text = "She/he/it said hello. 4/22/2013." 295 | ws = WordCountAnalyzer::Counter.new( 296 | forward_slash: 'count_as_multiple', 297 | date: 'count_as_one' 298 | ) 299 | expect(ws.count(text)).to eq(6) 300 | end 301 | 302 | it 'counts a forward slash as one word' do 303 | text = "She/he/it said hello." 304 | ws = WordCountAnalyzer::Counter.new( 305 | forward_slash: 'count_as_one' 306 | ) 307 | expect(ws.count(text)).to eq(3) 308 | end 309 | 310 | it 'sets count_as_multiple_except_dates as the default option' do 311 | text = "She/he/it said hello. 4/22/2013." 312 | ws = WordCountAnalyzer::Counter.new 313 | expect(ws.count(text)).to eq(6) 314 | end 315 | end 316 | 317 | context 'backslash' do 318 | it 'handles an invalid number argument value' do 319 | text = 'hello world.' 320 | ws = WordCountAnalyzer::Counter.new(backslash: 'hello') 321 | expect { ws.count(text) }.to raise_error('The value you specified for backslash is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`') 322 | end 323 | 324 | it 'counts a token with backslashes as one word' do 325 | text = 'The file location is c:\Users\johndoe' 326 | ws = WordCountAnalyzer::Counter.new( 327 | backslash: 'count_as_one' 328 | ) 329 | expect(ws.count(text)).to eq(5) 330 | end 331 | 332 | it 'counts a token with backslashes as multiple words' do 333 | text = 'The file location is c:\Users\johndoe' 334 | ws = WordCountAnalyzer::Counter.new( 335 | backslash: 'count_as_multiple' 336 | ) 337 | expect(ws.count(text)).to eq(7) 338 | end 339 | 340 | it 'sets count_as_one as the default option' do 341 | text = 'The file location is c:\Users\johndoe' 342 | ws = WordCountAnalyzer::Counter.new 343 | expect(ws.count(text)).to eq(5) 344 | end 345 | end 346 | 347 | context 'dotted_line' do 348 | it 'handles an invalid number argument value' do 349 | text = 'hello world.' 350 | ws = WordCountAnalyzer::Counter.new(dotted_line: 'hello') 351 | expect { ws.count(text) }.to raise_error('The value you specified for dotted_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`') 352 | end 353 | 354 | it 'ignores continuous strings of dots in the word count' do 355 | text = 'Here is one …………………………………………………………………… and another ......' 356 | ws = WordCountAnalyzer::Counter.new( 357 | dotted_line: 'ignore' 358 | ) 359 | expect(ws.count(text)).to eq(5) 360 | end 361 | 362 | it 'counts a continuous string of dots as a word' do 363 | text = 'Here is one …………………………………………………………………… and another ......' 364 | ws = WordCountAnalyzer::Counter.new( 365 | dotted_line: 'count' 366 | ) 367 | expect(ws.count(text)).to eq(7) 368 | end 369 | 370 | it 'sets ignore as the default option' do 371 | text = 'Here is one …………………………………………………………………… and another ......' 372 | ws = WordCountAnalyzer::Counter.new 373 | expect(ws.count(text)).to eq(5) 374 | end 375 | end 376 | 377 | context 'dashed_line' do 378 | it 'handles an invalid number argument value' do 379 | text = 'hello world.' 380 | ws = WordCountAnalyzer::Counter.new(dashed_line: 'hello') 381 | expect { ws.count(text) }.to raise_error('The value you specified for dashed_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`') 382 | end 383 | 384 | it 'ignores continuous strings of dashes in the word count' do 385 | text = 'Here is one ----- and another -----' 386 | ws = WordCountAnalyzer::Counter.new( 387 | dashed_line: 'ignore' 388 | ) 389 | expect(ws.count(text)).to eq(5) 390 | end 391 | 392 | it 'counts a continuous string of dashes as a word' do 393 | text = 'Here is one ----- and another -----' 394 | ws = WordCountAnalyzer::Counter.new( 395 | dashed_line: 'count' 396 | ) 397 | expect(ws.count(text)).to eq(7) 398 | end 399 | 400 | it 'sets ignore as the default option' do 401 | text = 'Here is one ----- and another -----' 402 | ws = WordCountAnalyzer::Counter.new 403 | expect(ws.count(text)).to eq(5) 404 | end 405 | end 406 | 407 | context 'underscore' do 408 | it 'handles an invalid number argument value' do 409 | text = 'hello world.' 410 | ws = WordCountAnalyzer::Counter.new(underscore: 'hello') 411 | expect { ws.count(text) }.to raise_error('The value you specified for underscore is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`') 412 | end 413 | 414 | it 'ignores continuous strings of underscores in the word count' do 415 | text = "Here is one ______ and another ______" 416 | ws = WordCountAnalyzer::Counter.new( 417 | underscore: 'ignore' 418 | ) 419 | expect(ws.count(text)).to eq(5) 420 | end 421 | 422 | it 'counts a continuous string of underscores as a word' do 423 | text = 'Here is one ______ and another ______' 424 | ws = WordCountAnalyzer::Counter.new( 425 | underscore: 'count' 426 | ) 427 | expect(ws.count(text)).to eq(7) 428 | end 429 | 430 | it 'sets ignore as the default option' do 431 | text = 'Here is one ______ and another ______' 432 | ws = WordCountAnalyzer::Counter.new 433 | expect(ws.count(text)).to eq(5) 434 | end 435 | end 436 | 437 | context 'stray_punctuation' do 438 | it 'handles an invalid number argument value' do 439 | text = 'hello world.' 440 | ws = WordCountAnalyzer::Counter.new(stray_punctuation: 'hello') 441 | expect { ws.count(text) }.to raise_error('The value you specified for stray_punctuation is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`') 442 | end 443 | 444 | it 'ignores continuous strings of underscores in the word count' do 445 | text = 'Hello world ? This is another - sentence .' 446 | ws = WordCountAnalyzer::Counter.new( 447 | stray_punctuation: 'ignore' 448 | ) 449 | expect(ws.count(text)).to eq(6) 450 | end 451 | 452 | it 'counts a continuous string of underscores as a word' do 453 | text = 'Hello world ? This is another - sentence .' 454 | ws = WordCountAnalyzer::Counter.new( 455 | stray_punctuation: 'count' 456 | ) 457 | expect(ws.count(text)).to eq(9) 458 | end 459 | 460 | it 'sets ignore as the default option' do 461 | text = 'Hello world ? This is another - sentence .' 462 | ws = WordCountAnalyzer::Counter.new 463 | expect(ws.count(text)).to eq(6) 464 | end 465 | end 466 | 467 | it 'counts the words in a string #001' do 468 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 469 | ws = WordCountAnalyzer::Counter.new( 470 | ellipsis: 'ignore', 471 | hyperlink: 'count_as_one', 472 | contraction: 'count_as_one', 473 | hyphenated_word: 'count_as_one', 474 | date: 'no_special_treatment', 475 | number: 'count', 476 | numbered_list: 'count', 477 | xhtml: 'remove', 478 | forward_slash: 'count_as_one', 479 | backslash: 'count_as_one', 480 | dotted_line: 'ignore', 481 | dashed_line: 'ignore', 482 | underscore: 'ignore', 483 | stray_punctuation: 'ignore' 484 | ) 485 | expect(ws.count(text)).to eq(62) 486 | end 487 | 488 | it 'counts the words in a string #002' do 489 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 490 | ws = WordCountAnalyzer::Counter.new( 491 | ellipsis: 'no_special_treatment', 492 | hyperlink: 'no_special_treatment', 493 | contraction: 'count_as_multiple', 494 | hyphenated_word: 'count_as_multiple', 495 | date: 'count_as_one', 496 | number: 'ignore', 497 | numbered_list: 'ignore', 498 | xhtml: 'keep', 499 | forward_slash: 'count_as_multiple', 500 | backslash: 'count_as_multiple', 501 | dotted_line: 'count', 502 | dashed_line: 'count', 503 | underscore: 'count', 504 | stray_punctuation: 'count' 505 | ) 506 | expect(ws.count(text)).to eq(77) 507 | end 508 | 509 | it 'counts the words in a string #003' do 510 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 511 | ws = WordCountAnalyzer::Counter.new 512 | expect(ws.count(text)).to eq(64) 513 | end 514 | 515 | it 'counts the words in a string #004' do 516 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 517 | ws = WordCountAnalyzer::Counter.new(forward_slash: 'count_as_multiple') 518 | expect(ws.count(text)).to eq(66) 519 | end 520 | 521 | it 'counts the words in a string #005' do 522 | text = "Hello world... 11/22/2013" 523 | ws = WordCountAnalyzer::Counter.new 524 | expect(ws.count(text)).to eq(3) 525 | end 526 | 527 | context 'Pages Word Count' do 528 | it 'reverse engineers Pages word count #001' do 529 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list \n\n1. item a \n\n2. item b \n\n3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 530 | ws = WordCountAnalyzer::Counter.new( 531 | ellipsis: 'no_special_treatment', 532 | hyperlink: 'split_at_period', 533 | contraction: 'count_as_one', 534 | hyphenated_word: 'count_as_multiple', 535 | date: 'no_special_treatment', 536 | number: 'count', 537 | numbered_list: 'ignore', 538 | xhtml: 'keep', 539 | forward_slash: 'count_as_multiple', 540 | backslash: 'count_as_multiple', 541 | dotted_line: 'ignore', 542 | dashed_line: 'ignore', 543 | underscore: 'ignore', 544 | stray_punctuation: 'ignore' 545 | ) 546 | expect(ws.count(text)).to eq(76) 547 | end 548 | 549 | it 'reverse engineers Pages word count #002' do 550 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 551 | ws = WordCountAnalyzer::Counter.new 552 | expect(ws.pages_count(text)).to eq(76) 553 | end 554 | 555 | it 'reverse engineers Pages word count #003' do 556 | text = "..." 557 | ws = WordCountAnalyzer::Counter.new 558 | expect(ws.pages_count(text)).to eq(0) 559 | end 560 | 561 | it 'reverse engineers Pages word count #004' do 562 | text = "1. List item a\n\n2. List item b\n\n3. List item c" 563 | ws = WordCountAnalyzer::Counter.new 564 | expect(ws.pages_count(text)).to eq(9) 565 | end 566 | 567 | it 'reverse engineers Pages word count #005' do 568 | text = "Hello world Hello" 569 | ws = WordCountAnalyzer::Counter.new 570 | expect(ws.pages_count(text)).to eq(12) 571 | end 572 | end 573 | 574 | context 'Microsoft Word Count' do 575 | it 'reverse engineers the Microsoft Word / wc (Unix) word count #001' do 576 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 577 | ws = WordCountAnalyzer::Counter.new( 578 | ellipsis: 'no_special_treatment', 579 | hyperlink: 'count_as_one', 580 | contraction: 'count_as_one', 581 | hyphenated_word: 'count_as_one', 582 | date: 'no_special_treatment', 583 | number: 'count', 584 | numbered_list: 'count', 585 | xhtml: 'keep', 586 | forward_slash: 'count_as_one', 587 | backslash: 'count_as_one', 588 | dotted_line: 'count', 589 | dashed_line: 'count', 590 | underscore: 'count', 591 | stray_punctuation: 'count' 592 | ) 593 | expect(ws.count(text)).to eq(71) 594 | end 595 | 596 | it 'reverse engineers the Microsoft Word / wc (Unix) word count #002' do 597 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }" 598 | ws = WordCountAnalyzer::Counter.new 599 | expect(ws.mword_count(text)).to eq(71) 600 | end 601 | end 602 | 603 | context 'example sentences' do 604 | it 'String with common words (no edge cases) #001' do 605 | ws = WordCountAnalyzer::Counter.new 606 | expect(ws.count('This sentence contains nothing crazy.')).to eq(5) 607 | end 608 | 609 | it 'String with a number #002' do 610 | ws = WordCountAnalyzer::Counter.new 611 | expect(ws.count('This sentence contains 1 number.')).to eq(5) 612 | end 613 | 614 | it 'String with a date #003' do 615 | ws = WordCountAnalyzer::Counter.new 616 | expect(ws.count('Today is Monday, April 4th, 2011.')).to eq(6) 617 | end 618 | 619 | it 'String #004' do 620 | ws = WordCountAnalyzer::Counter.new 621 | expect(ws.count('hello world ...')).to eq(2) 622 | end 623 | 624 | it 'does not split on unicode chars' do 625 | ws = WordCountAnalyzer::Counter.new 626 | expect(ws.count('São Paulo')).to eq(2) 627 | end 628 | 629 | it 'should not count HTML tags' do 630 | ws = WordCountAnalyzer::Counter.new 631 | expect(ws.count("the brown fox jumped over the lazy dog")).to eq(8) 632 | end 633 | 634 | it 'should handle special characters' do 635 | ws = WordCountAnalyzer::Counter.new 636 | expect(ws.count("the \"brown\" fox 'jumped' | over \\ the / lazy dog")).to eq(8) 637 | end 638 | end 639 | end 640 | --------------------------------------------------------------------------------