├── .rspec
├── spec
├── spec_helper.rb
└── word_count_analyzer
│ ├── analyzer_spec.rb
│ ├── performance_spec.rb
│ ├── number_spec.rb
│ ├── xhtml_spec.rb
│ ├── hyperlink_spec.rb
│ ├── hyphenated_word_spec.rb
│ ├── numbered_list_spec.rb
│ ├── punctuation_spec.rb
│ ├── slash_spec.rb
│ ├── ellipsis_spec.rb
│ ├── contraction_spec.rb
│ ├── date_spec.rb
│ └── counter_spec.rb
├── .travis.yml
├── lib
├── word_count_analyzer
│ ├── version.rb
│ ├── number.rb
│ ├── hyphenated_word.rb
│ ├── xhtml.rb
│ ├── ellipsis.rb
│ ├── punctuation.rb
│ ├── hyperlink.rb
│ ├── numbered_list.rb
│ ├── analyzer.rb
│ ├── slash.rb
│ ├── date.rb
│ ├── contraction.rb
│ └── counter.rb
└── word_count_analyzer.rb
├── Gemfile
├── Rakefile
├── .gitignore
├── LICENSE.txt
├── word_count_analyzer.gemspec
└── README.md
/.rspec:
--------------------------------------------------------------------------------
1 | --color
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | require 'word_count_analyzer'
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | rvm:
3 | - "2.1.0"
4 | - "2.1.5"
5 | - "2.2.0"
6 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/version.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Release number of the gem; word_count_analyzer.gemspec reads this
  # constant for `spec.version`.
  VERSION = '1.0.1'
end
4 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | # Specify your gem's dependencies in word_count_analyzer.gemspec
4 | gemspec
5 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require "bundler/gem_tasks"
2 | require 'rspec/core/rake_task'
3 | RSpec::Core::RakeTask.new(:spec)
4 | task :default => :spec
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.bundle/
2 | /.yardoc
3 | /Gemfile.lock
4 | /_yardoc/
5 | /coverage/
6 | /doc/
7 | /pkg/
8 | /spec/reports/
9 | /tmp/
10 | .idea/
11 | *.bundle
12 | *.so
13 | *.o
14 | *.a
15 | mkmf.log
16 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer.rb:
--------------------------------------------------------------------------------
1 | require "word_count_analyzer/version"
2 | require "word_count_analyzer/analyzer"
3 | require "word_count_analyzer/counter"
4 | require "word_count_analyzer/contraction"
5 | require "word_count_analyzer/hyperlink"
6 | require "word_count_analyzer/hyphenated_word"
7 | require "word_count_analyzer/date"
8 | require "word_count_analyzer/ellipsis"
9 | require "word_count_analyzer/numbered_list"
10 | require "word_count_analyzer/xhtml"
11 | require "word_count_analyzer/number"
12 | require "word_count_analyzer/slash"
13 | require "word_count_analyzer/punctuation"
14 | require "engtagger"
15 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/number.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Finds numeric tokens in a string.
  #
  # A numeric token is a run of digits (commas and periods allowed between
  # digits), optionally preceded by a single non-digit character such as "$",
  # that starts at the beginning of the string or right after whitespace and
  # is terminated by whitespace, a trailing period or the end of the string.
  class Number
    # Rubular: http://rubular.com/r/OGj82uEu8d
    NUMBER_REGEX = /(?<=\A)\D?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$)|(?<=\s)\D?\d+((,|\.)*\d)*(\D?\s|\s|\.?\s|\.$|$)/

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when at least one numeric token is present
    def includes_number?
      !!(string =~ NUMBER_REGEX)
    end

    # @return [String] a copy of the text in which every numeric token
    #   (including the terminator it consumed) is replaced by ' wsnumword '
    def replace
      string.gsub(NUMBER_REGEX, ' wsnumword ')
    end

    # Counts the numeric tokens themselves rather than counting the
    # placeholder after substitution, so text that already contains the
    # literal word "wsnumword" is not over-counted.
    #
    # @return [Integer] number of numeric tokens in the text
    def occurrences
      string.scan(NUMBER_REGEX).size
    end
  end
end
24 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/hyphenated_word.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Wraps a single token and answers questions about hyphenation.
  # Standalone dashed lines ("--", "------", ...) are stripped from the token
  # on construction so they are never mistaken for hyphens.
  class HyphenatedWord
    # Rubular: http://rubular.com/r/RjZ7qi0uFf
    DASHED_LINE_REGEX = /\s-{2,}(\s|$)|\A-{2,}(\s|$)/

    attr_reader :token

    # @param token [String] the raw token; stored with dashed lines removed
    def initialize(token:)
      @token = token.gsub(DASHED_LINE_REGEX, '')
    end

    # @return [Boolean] true when the token contains "-" or "﹘" and is not
    #   recognised as a hyperlink by WordCountAnalyzer::Hyperlink
    def hyphenated_word?
      return false unless ['-', '﹘'].any? { |dash| token.include?(dash) }

      !WordCountAnalyzer::Hyperlink.new.hyperlink?(token)
    end

    # @return [Integer] how many pieces the token splits into
    def count_as_multiple
      pieces.length
    end

    # @return [String] the pieces of the token joined by single spaces
    def replace
      pieces.join(' ')
    end

    private

    # Splits on "-", "﹘" and also "," (the comma is part of the character
    # class). NOTE(review): the comma may be unintentional; confirm before
    # changing, since callers' counts depend on it.
    def pieces
      token.split(/[﹘,-]/)
    end
  end
end
24 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/xhtml.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Detects and strips XML/HTML tags (anything of the form <...> or </...>).
  class Xhtml
    # Rubular: http://rubular.com/r/ENrVFMdJ8v
    XHTML_REGEX = /<\/?[^>]*>/

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when the text contains at least one tag
    def includes_xhtml?
      !!(string =~ XHTML_REGEX)
    end

    # @return [String] a copy of the text with every tag replaced by a
    #   single space
    def replace
      string.gsub(XHTML_REGEX, ' ')
    end

    # @return [Integer] whitespace-delimited token count of the raw text
    #   minus the token count of the tag-free (and stripped) text
    def count_difference_word_boundary
      string.split(/\s+/).size - replace.strip.split(/\s+/).size
    end

    # Counts tags directly instead of counting a substituted placeholder, so
    # text that already contains the literal word "wsxhtmlword" is not
    # over-counted. The tag total is halved (integer division) so that one
    # opening plus one closing tag counts as a single occurrence.
    #
    # @return [Integer] number of tag pairs
    def occurrences
      string.scan(XHTML_REGEX).size / 2
    end
  end
end
27 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015 Kevin S. Dias
2 |
3 | MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/ellipsis.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Recognises the various spellings of an ellipsis ("...", "....",
  # ". . . .", " . . . ", "…") and can swap them for a placeholder word.
  class Ellipsis
    # Rubular: http://rubular.com/r/mfdtSeuIf2
    FOUR_CONSECUTIVE_REGEX = /(?<=[^\.]|\A)\.{3}\.(?=[^\.]|$)/

    # Rubular: http://rubular.com/r/YBG1dIHTRu
    THREE_SPACE_REGEX = /(\s\.){3}\s/

    # Rubular: http://rubular.com/r/2VvZ8wRbd8
    FOUR_SPACE_REGEX = /(?<=[a-z]|\A)(\.\s){3}\.(\z|$|\n)/

    OTHER_THREE_PERIOD_REGEX = /(?<=[^\.]|\A)\.{3}(?=([^\.]|$))/

    UNICODE_ELLIPSIS = /(?<=[^…]|\A)…{1}(?=[^…]|$)/

    # @param text [String]
    # @return [Boolean] true when any ellipsis pattern matches the text
    def includes_ellipsis?(text)
      patterns.any? { |pattern| !(text !~ pattern) }
    end

    # @param text [String]
    # @return [String] a copy of the text where each ellipsis is replaced by
    #   ' wseword '; patterns are applied one after another, in order
    def replace(text)
      patterns.inject(text) { |result, pattern| result.gsub(pattern, ' wseword ') }
    end

    # @param text [String]
    # @return [Integer] number of standalone 'wseword' tokens present after
    #   substitution
    def occurrences(text)
      replace(text).split(' ').count { |token| token.strip.eql?('wseword') }
    end

    private

    # Substitution order matters: the four-period form must be consumed
    # before the generic three-period form gets a chance to run.
    def patterns
      [
        FOUR_CONSECUTIVE_REGEX,
        THREE_SPACE_REGEX,
        FOUR_SPACE_REGEX,
        OTHER_THREE_PERIOD_REGEX,
        UNICODE_ELLIPSIS
      ]
    end
  end
end
--------------------------------------------------------------------------------
/spec/word_count_analyzer/analyzer_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe WordCountAnalyzer::Analyzer do
  context '#analysis' do
    # NOTE(review): the expectation below includes one xhtml occurrence, but
    # no <...> tag is visible in this copy of the sample text. Confirm the
    # string against the upstream fixture.
    it 'should analyze the gray areas #001' do
      sample = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
      expected_counts = {
        "ellipsis" => 1,
        "hyperlink" => 2,
        "contraction" => 4,
        "hyphenated_word" => 2,
        "date" => 2,
        "number" => 5,
        "numbered_list" => 3,
        "xhtml" => 1,
        "forward_slash" => 1,
        "backslash" => 1,
        "dotted_line" => 1,
        "dashed_line" => 1,
        "underscore" => 1,
        "stray_punctuation" => 5
      }
      expect(described_class.new(text: sample).analyze).to eq(expected_counts)
    end

    it 'should analyze the gray areas #002' do
      expected_counts = {
        "ellipsis" => 1,
        "hyperlink" => 0,
        "contraction" => 0,
        "hyphenated_word" => 0,
        "date" => 0,
        "number" => 0,
        "numbered_list" => 0,
        "xhtml" => 0,
        "forward_slash" => 0,
        "backslash" => 0,
        "dotted_line" => 0,
        "dashed_line" => 0,
        "underscore" => 0,
        "stray_punctuation" => 0
      }
      expect(described_class.new(text: "hello world ...").analyze).to eq(expected_counts)
    end
  end
end
18 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/punctuation.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Counts and removes punctuation-only artefacts: dotted lines, dashed
  # lines, underscore lines and stray single punctuation marks.
  class Punctuation
    # Rubular: http://rubular.com/r/ZVBsZVkiqC
    DOTTED_LINE_REGEX = /…{2,}|\.{5,}/

    # Rubular: http://rubular.com/r/RjZ7qi0uFf
    DASHED_LINE_REGEX = /(?<=\s)-{2,}(\s|$)|\A-{2,}(?=(\s|$))/

    # Rubular: http://rubular.com/r/hNofimZwdh
    UNDERSCORE_REGEX = /(?<=\s)_{2,}(\s|$)|\A_{2,}(?=(\s|$))/

    # Rubular: http://rubular.com/r/FexKxGUuIe
    STRAY_PUNCTUATION_REGEX = /(?<=\s|\A)[[:punct:]](?=(\s|$))|(?<=\s|\A)\|(?=(\s|$))/

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Integer] number of dotted lines (2+ "…" or 5+ ".")
    def dotted_line_ocurrances
      matches_of(DOTTED_LINE_REGEX)
    end

    # @return [Integer] number of standalone runs of 2+ hyphens
    def dashed_line_ocurrances
      matches_of(DASHED_LINE_REGEX)
    end

    # @return [Integer] number of standalone runs of 2+ underscores
    def underscore_ocurrances
      matches_of(UNDERSCORE_REGEX)
    end

    # @return [Integer] number of single punctuation characters that stand
    #   alone between whitespace / string boundaries
    def stray_punctuation_occurences
      matches_of(STRAY_PUNCTUATION_REGEX)
    end

    # @return [String] the text with dotted lines deleted
    def replace_dotted_line
      without(DOTTED_LINE_REGEX)
    end

    # @return [String] the text with dashed lines deleted
    def replace_dashed_line
      without(DASHED_LINE_REGEX)
    end

    # @return [String] the text with underscore lines deleted
    def replace_underscore
      without(UNDERSCORE_REGEX)
    end

    # @return [String] the text with stray punctuation deleted
    def replace_stray_punctuation
      without(STRAY_PUNCTUATION_REGEX)
    end

    private

    # Number of non-overlapping matches of +pattern+ in the text.
    def matches_of(pattern)
      string.scan(pattern).size
    end

    # Copy of the text with every match of +pattern+ removed.
    def without(pattern)
      string.gsub(pattern, '')
    end
  end
end
53 |
--------------------------------------------------------------------------------
/word_count_analyzer.gemspec:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | lib = File.expand_path('../lib', __FILE__)
3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4 | require 'word_count_analyzer/version'
5 |
# Gem metadata. The version comes from lib/word_count_analyzer/version.rb and
# the file list from whatever git currently tracks.
Gem::Specification.new do |gem|
  gem.name          = 'word_count_analyzer'
  gem.version       = WordCountAnalyzer::VERSION
  gem.authors       = ['Kevin S. Dias']
  gem.email         = ['diasks2@gmail.com']
  gem.summary       = 'A word count analyzer - see what word count gray areas might be affecting your word count.'
  gem.description   = 'Word Count Analyzer is a Ruby gem that analyzes a string for potential areas of the text that might cause word count discrepancies depending on the tool used. It also provides comprehensive configuration options so you can easily customize how different gray areas should be counted and find the right word count for your purposes.'
  gem.homepage      = ''
  gem.license       = 'MIT'

  # NUL-separated output keeps file names containing spaces intact.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']
  gem.required_ruby_version = '>= 2.1.0'

  # Development-only tooling.
  gem.add_development_dependency 'bundler'
  gem.add_development_dependency 'rake', '~> 10.0'
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'stackprof'

  # Needed at runtime: Analyzer instantiates EngTagger.
  gem.add_runtime_dependency 'engtagger'
end
28 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/hyperlink.rb:
--------------------------------------------------------------------------------
1 | require 'uri'
2 |
module WordCountAnalyzer
  # Recognises hyperlink tokens and rewrites them, either as a placeholder
  # word or split into pieces at every period.
  class Hyperlink
    # A bare word followed by a colon at the end of a line (e.g. "date:").
    # Such a token is never treated as a hyperlink.
    NON_HYPERLINK_REGEX = /\A\w+:$/

    # Rubular: http://rubular.com/r/fXa4lp0gfS
    HYPERLINK_REGEX = /(http|https|www)(\.|:)/

    # @param text [String] usually a single whitespace-delimited token
    # @return [Boolean] true when the text matches URI.regexp, is not a bare
    #   "word:" token, and contains "http", "https" or "www" followed by
    #   "." or ":"
    def hyperlink?(text)
      !(text !~ URI.regexp) && text !~ NON_HYPERLINK_REGEX && !(text !~ HYPERLINK_REGEX)
    end

    # @param text [String]
    # @return [Integer] number of URI.regexp matches that filled more than
    #   one capture group
    def occurrences(text)
      text.scan(URI.regexp).count { |captures| captures.compact.size > 1 }
    end

    # @param text [String]
    # @return [String] a copy of the text with each hyperlink replaced by
    #   ' wslinkword '
    def replace(text)
      link_targets(text).inject(text) do |result, target|
        result.gsub(target, ' wslinkword ')
      end
    end

    # Builds a new string instead of editing +text+ in place, so the
    # caller's string is left untouched and frozen strings are accepted,
    # matching what #replace does.
    #
    # @param text [String]
    # @return [String] a copy of the text where every period inside a
    #   hyperlink is replaced by a space
    def replace_split_at_period(text)
      link_targets(text).inject(text) do |result, target|
        result.gsub(target) { |match| match.split('.').join(' ') }
      end
    end

    private

    # The literal substrings to rewrite: every whitespace-delimited token
    # that is a hyperlink. When the token contains '">' only the part in
    # front of it is kept.
    def link_targets(text)
      text.split(/\s+/).select { |token| hyperlink?(token) }.map do |token|
        token.include?('">') ? token.split('">')[0] : token
      end
    end
  end
end
45 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/numbered_list.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Detects numbered-list markers ("1.", "2.", ... or "1.)") and decides
  # which of them really belong to a list by looking at the neighbouring
  # markers.
  class NumberedList
    # Rubular: http://rubular.com/r/RKmRH9Y4oO
    NUMBERED_LIST_REGEX = /(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.\)|^\d{1,2}\.\)/

    attr_reader :string

    # @param string [String] the text to inspect
    def initialize(string:)
      @string = string
    end

    # @return [Boolean] true when the text has at least two list markers
    #   that are in sequence with a neighbour
    def includes_numbered_list?
      !(string !~ NUMBERED_LIST_REGEX) && has_at_least_two_items?
    end

    # @return [String] a copy of the text with every in-sequence list marker
    #   removed; markers that are not part of a sequence are kept
    def replace
      new_string = string.dup
      numbers = marker_numbers
      # Markers judged not to be list items stay in new_string, so the next
      # marker to delete always sits at position `skips` among the matches
      # that are still present.
      skips = 0
      numbers.each_index do |i|
        unless list_item?(numbers, i)
          skips += 1
          next
        end
        wanted = numbers[i].to_s
        new_string.gsub!(NUMBERED_LIST_REGEX).with_index do |match, index|
          index.eql?(skips) && match.chomp('.').eql?(wanted) ? '' : match
        end
      end
      new_string
    end

    # @return [Integer] number of markers that are in sequence with a
    #   neighbouring marker
    def occurrences
      count_list_items_in_array
    end

    private

    def has_at_least_two_items?
      count_list_items_in_array >= 2
    end

    def count_list_items_in_array
      numbers = marker_numbers
      numbers.each_index.count { |i| list_item?(numbers, i) }
    end

    # Integer value of every marker found in the text, in order.
    def marker_numbers
      string.scan(NUMBERED_LIST_REGEX).map(&:to_i)
    end

    # True when the marker at +i+ continues a sequence: the next marker is
    # one higher, the previous marker is one lower, or the pair is 9
    # followed by 0. The first marker has no predecessor; without the
    # explicit guard, index -1 would wrap around to the LAST marker and could
    # wrongly qualify the first one.
    def list_item?(numbers, i)
      current = numbers[i]
      previous = i.zero? ? nil : numbers[i - 1]
      following = numbers[i + 1]

      (current + 1).eql?(following) ||
        (current - 1).eql?(previous) ||
        (current.eql?(0) && previous.eql?(9)) ||
        (current.eql?(9) && following.eql?(0))
    end
  end
end
62 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/analyzer.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Produces a per-category tally of the "gray areas" found in a text.
  class Analyzer
    attr_reader :text, :tagger

    # @param text [String] the text to analyse
    def initialize(text:)
      @text = text
      @tagger = EngTagger.new
    end

    # @return [Hash{String => Integer}] occurrence count for each gray area,
    #   keyed by 'ellipsis', 'hyperlink', 'contraction', 'hyphenated_word',
    #   'date', 'number', 'numbered_list', 'xhtml', 'forward_slash',
    #   'backslash', 'dotted_line', 'dashed_line', 'underscore' and
    #   'stray_punctuation'
    def analyze
      analysis = {}
      analysis['ellipsis'] = WordCountAnalyzer::Ellipsis.new.occurrences(text)

      # Split the raw text once up front; it used to be re-split for every
      # token inside the loop.
      # NOTE(review): the loop walks the tag-stripped tokens while the
      # following token is looked up in the raw tokens, so the two indexes
      # can drift apart when a tag contains whitespace. Left as is; confirm
      # whether that is intended.
      raw_tokens = text.split(/\s+/)
      contraction_count = 0
      hyphenated_word_count = 0
      WordCountAnalyzer::Xhtml.new(string: text).replace.split(/\s+/).each_with_index do |token, index|
        contraction = WordCountAnalyzer::Contraction.new(
          token: token,
          following_token: raw_tokens[index + 1],
          tgr: tagger,
          hyphen: 'single'
        )
        contraction_count += 1 if contraction.contraction?
        hyphenated_word_count += 1 if WordCountAnalyzer::HyphenatedWord.new(token: token).hyphenated_word?
      end

      analysis['hyperlink'] = WordCountAnalyzer::Hyperlink.new.occurrences(text)
      analysis['contraction'] = contraction_count
      analysis['hyphenated_word'] = hyphenated_word_count
      analysis['date'] = WordCountAnalyzer::Date.new.occurrences(text)
      analysis['number'] = WordCountAnalyzer::Number.new(string: text).occurrences
      analysis['numbered_list'] = WordCountAnalyzer::NumberedList.new(string: text).occurrences
      analysis['xhtml'] = WordCountAnalyzer::Xhtml.new(string: text).occurrences
      analysis['forward_slash'] = WordCountAnalyzer::Slash.new(string: text).forward_slash_occurences
      analysis['backslash'] = WordCountAnalyzer::Slash.new(string: text).backslash_occurences

      # Punctuation only reads its string, so one instance serves all four
      # counters.
      punctuation = WordCountAnalyzer::Punctuation.new(string: text)
      analysis['dotted_line'] = punctuation.dotted_line_ocurrances
      analysis['dashed_line'] = punctuation.dashed_line_ocurrances
      analysis['underscore'] = punctuation.underscore_ocurrances
      analysis['stray_punctuation'] = punctuation.stray_punctuation_occurences
      analysis
    end
  end
end
35 |
--------------------------------------------------------------------------------
/spec/word_count_analyzer/performance_spec.rb:
--------------------------------------------------------------------------------
1 | # -*- encoding : utf-8 -*-
2 | require 'benchmark'
3 | require 'spec_helper'
4 | require 'stackprof'
5 |
RSpec.describe WordCountAnalyzer::Analyzer do
  # Sample paragraph shared by the timing and profiling examples below.
  let(:text) do
    "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
  end

  # Times one full analysis of the sample text.
  it 'is fast?' do
    benchmark { WordCountAnalyzer::Analyzer.new(text: text).analyze }
  end

  # CPU-profiles one full analysis and prints the StackProf report.
  it 'is analyzed' do
    profile = StackProf.run(mode: :cpu, interval: 1000) do
      WordCountAnalyzer::Analyzer.new(text: text).analyze
    end
    puts StackProf::Report.new(profile).print_text
  end

  # CPU-profiles a single contraction check.
  it 'is analyzed 2' do
    profile = StackProf.run(mode: :cpu, interval: 1000) do
      WordCountAnalyzer::Contraction.new(token: "when'd", following_token: nil, tgr: EngTagger.new, hyphen: nil).contraction?
    end
    puts StackProf::Report.new(profile).print_text
  end

  # Times 300 counts of the sample text with a single Counter instance.
  it 'is analyzed 3' do
    benchmark do
      counter = WordCountAnalyzer::Counter.new(forward_slash: 'count_as_multiple')
      300.times { counter.count(text) }
    end
  end
end
41 |
# Runs the given block twice: once untimed (presumably as a warm-up), then
# once under Benchmark.realtime, and prints the elapsed seconds of the
# second run.
def benchmark
  yield
  elapsed = Benchmark.realtime { yield }
  puts "RUNTIME: #{elapsed}"
end
47 |
--------------------------------------------------------------------------------
/spec/word_count_analyzer/number_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe WordCountAnalyzer::Number do
  context '#includes_number?' do
    # Each row: example description, input string, expected result.
    [
      ['returns true if the string includes a number #001', 'It cost $10,000 dollars.', true],
      ['returns true if the string includes a number #002', 'It cost 500 dollars.', true],
      ['returns true if the string includes a number #003', 'It was only 50% of the total.', true],
      ['returns true if the string includes a number #004', 'It was only 50 % of the total.', true],
      ['returns true if the string includes a number #005', 'I was born in 1993', true],
      ["returns false if the string doesn't includes a number #006", 'Hello world.', false],
      ["returns false if the string doesn't includes a number #007", 'Today is 2/18/2014.', false]
    ].each do |description, string, expected|
      it description do
        expect(described_class.new(string: string).includes_number?).to eq(expected)
      end
    end
  end

  context '#replace' do
    it 'returns the string with number and unit substituted as one token #001' do
      number = described_class.new(string: 'It was only 50 % of the total. 500 total $300.')
      expect(number.replace).to eq("It was only wsnumword % of the total. wsnumword total wsnumword ")
    end
  end

  context '#occurrences' do
    # Each row: example description, input string, expected count.
    [
      ['returns the number of occurrences of a number in the string #001',
       'It was only 50 % of the total. 500 total. That costs $300 and is 50% off.', 4],
      ['does not ignore dates #002',
       'It was only 50 % of the total on Wednesday, June 4 2015. 500 total. That costs $300 and is 50% off only on Apr 5th 1999.', 7]
    ].each do |description, string, expected|
      it description do
        expect(described_class.new(string: string).occurrences).to eq(expected)
      end
    end
  end
end
70 |
--------------------------------------------------------------------------------
/spec/word_count_analyzer/xhtml_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# NOTE(review): several inputs below are expected to contain XML/HTML, yet no
# <...> tags are visible in this copy of the strings. Confirm the fixtures
# against the upstream file.
RSpec.describe WordCountAnalyzer::Xhtml do
  context '#includes_xhtml?' do
    # Each row: example description, input string, expected result.
    [
      ['returns true if the string includes XML or HTML #001', 'Hello world', true],
      ['returns true if the string includes XML or HTML #002', 'Hello there. Another sentence Sentence here.', true],
      ["returns false if the string doesn't include XML or HTML #003", 'Hello world.', false]
    ].each do |description, string, expected|
      it description do
        expect(described_class.new(string: string).includes_xhtml?).to eq(expected)
      end
    end
  end

  context '#replace' do
    # Each row: example description, input string, expected output.
    [
      ['replaces XML or HTML with an empty string #001', 'Hello world', " Hello world "],
      ['replaces XML or HTML with an empty string #002', 'Hello there. Another sentence Sentence here.',
       "Hello there. Another sentence Sentence here."]
    ].each do |description, string, expected|
      it description do
        expect(described_class.new(string: string).replace).to eq(expected)
      end
    end
  end

  context '#count_difference_word_boundary' do
    # Each row: example description, input string, expected difference.
    [
      ['counts the difference in word count between with xhtml and without #001', 'Hello world', 1],
      ['counts the difference in word count between with xhtml and without #002',
       'Hello there. Another sentence Sentence here.', 0],
      ['counts the difference in word count between with xhtml and without #003',
       'Hello world Hello there. Another sentence Sentence here. Hello world.', 1]
    ].each do |description, string, expected|
      it description do
        expect(described_class.new(string: string).count_difference_word_boundary).to eq(expected)
      end
    end
  end

  context '#occurrences' do
    it 'counts the number of tags (1 opening set and 1 closing set of tags counts as 1)' do
      xhtml = described_class.new(string: 'Hello world Hello there. Another sentence Sentence here. Hello world.')
      expect(xhtml.occurrences).to eq(2)
    end
  end
end
66 |
--------------------------------------------------------------------------------
/spec/word_count_analyzer/hyperlink_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe WordCountAnalyzer::Hyperlink do
  # One shared, stateless instance per example.
  let(:hyperlink) { described_class.new }

  context '#hyperlink?(string)' do
    # Each row: example description, input string, expected result.
    [
      ['returns true if the string is a hyperlink #001', "http://www.example.com/this-IS-a_test/hello.html", true],
      ['returns true if the string is a hyperlink #002', "http://www.google.co.uk", true],
      ['returns true if the string is a hyperlink #003', "https://google.co.uk", true],
      ['returns false if the string is not a hyperlink #004', "hello", false],
      ['returns false if the string is not a hyperlink #005', "john@gmail.com", false],
      ['returns false if the string is not a hyperlink #006', "date:", false],
      ['returns false if the string is not a hyperlink #007', 'The file location is c:\Users\johndoe.', false]
    ].each do |description, string, expected|
      it description do
        expect(hyperlink.hyperlink?(string)).to eq(expected)
      end
    end
  end

  context '#occurrences' do
    it 'returns the occurrences of hyperlink tokens in a string #001' do
      sentence = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk"
      expect(hyperlink.occurrences(sentence)).to eq(2)
    end
  end

  context '#replace' do
    # Each row: example description, input string, expected output.
    [
      ['replaces the hyperlinks in a string with regular tokens #001',
       "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk",
       "Today the date is: Jan 1. Visit wslinkword or wslinkword "],
      ['replaces the hyperlinks in a string with regular tokens #002',
       'The file location is c:\Users\johndoe or d:\Users\john\www',
       'The file location is c:\Users\johndoe or d:\Users\john\www']
    ].each do |description, string, expected|
      it description do
        expect(hyperlink.replace(string)).to eq(expected)
      end
    end
  end

  context '#replace_split_at_period' do
    it 'replaces the hyperlinks in a string with regular tokens, split at periods #001' do
      expect(hyperlink.replace_split_at_period("http://www.google.co.uk")).to eq("http://www google co uk")
    end
  end
end
78 |
--------------------------------------------------------------------------------
/spec/word_count_analyzer/hyphenated_word_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe WordCountAnalyzer::HyphenatedWord do
  context '#hyphenated_word?' do
    # Each row: example description, token, expected result.
    [
      ['returns true if the token is a hyphenated word #001', 'devil-may-care', true],
      ['returns true if the token is a hyphenated word #002', '(2R)-2-methylsulfanyl-3-hydroxybutanedioate', true],
      ['returns false if the token is not a hyphenated word', 'hello', false],
      ['returns false if the token is a hyperlink', 'https://www.example-one.com', false],
      ['returns false if the token is long string of dashes', '------------', false],
      ['returns true if the token is a hyphenated word (small em dashes)', 'devil﹘may﹘care', true]
    ].each do |description, token, expected|
      it description do
        expect(described_class.new(token: token).hyphenated_word?).to eq(expected)
      end
    end
  end

  context '#count_as_multiple' do
    # Each row: example description, token, expected number of parts.
    [
      ['returns the count of the individual words that are separated by the hyphen', 'devil-may-care', 3],
      ['handles small em dashes', 'devil﹘may﹘care', 3],
      ['returns the count of the individual words that are separated by the hyphen #002',
       '(2R)-2-methylsulfanyl-3-hydroxybutanedioate', 5]
    ].each do |description, token, expected|
      it description do
        expect(described_class.new(token: token).count_as_multiple).to eq(expected)
      end
    end
  end

  context '#replace' do
    # Each row: example description, token, expected space-separated form.
    [
      ['splits hyphenated words #001', 'devil-may-care', 'devil may care'],
      ['splits hyphenated words #002', 'devil﹘may﹘care', 'devil may care'],
      ['splits hyphenated words #003', '(2R)-2-methylsulfanyl-3-hydroxybutanedioate',
       '(2R) 2 methylsulfanyl 3 hydroxybutanedioate']
    ].each do |description, token, expected|
      it description do
        expect(described_class.new(token: token).replace).to eq(expected)
      end
    end
  end
end
82 |
--------------------------------------------------------------------------------
/spec/word_count_analyzer/numbered_list_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe WordCountAnalyzer::NumberedList do
  # Builds the object under test for the given text.
  def numbered_list(string)
    described_class.new(string: string)
  end

  # A three-item list that is reused by several examples.
  let(:three_item_list) { "1. List item a\n\n2. List item b\n\n3. List item c." }

  context '#includes_numbered_list?' do
    it 'returns true if the string includes a numbered list #001' do
      expect(numbered_list(three_item_list).includes_numbered_list?).to eq(true)
    end

    it 'returns false if the string does not include a numbered list #002' do
      expect(numbered_list("I have 1.00 dollar and 2 cents.").includes_numbered_list?).to eq(false)
    end

    it 'returns false if the string does not include at least 2 list items #003' do
      expect(numbered_list("I have 2.").includes_numbered_list?).to eq(false)
    end
  end

  context '#replace' do
    # The two examples below used to share one description, which made a
    # failure report ambiguous; they are numbered like the rest of the file.
    it 'replaces any numbered list numbers with an empty string #001' do
      expect(numbered_list(three_item_list).replace).to eq(" List item a\n\n List item b\n\n List item c.")
    end

    it 'replaces any numbered list numbers with an empty string #002' do
      string = "It also shouldn't have too many contractions, maybe 2. Let's add a list 1. List item a\n\n2. List item b\n\n3. List item c."
      expect(numbered_list(string).replace).to eq("It also shouldn't have too many contractions, maybe 2. Let's add a list List item a\n\n List item b\n\n List item c.")
    end
  end

  context '#occurrences' do
    it 'counts the occurrences of numbered lists #001' do
      expect(numbered_list(three_item_list).occurrences).to eq(3)
    end

    it 'counts the occurrences of numbered lists #002' do
      expect(numbered_list("I have 2.").occurrences).to eq(0)
    end

    it 'counts the occurrences of numbered lists #003' do
      string = "1. List item a\n\n2. List item b\n\n3. List item c. Then more text. Ok start a new list. 1. item a 2. item b."
      expect(numbered_list(string).occurrences).to eq(5)
    end

    it 'counts the occurrences of numbered lists #004' do
      string = "1. List item a\n\n2. List item b\n\n3. List item c. Then more text. Ok start a new non-list. I have 2."
      expect(numbered_list(string).occurrences).to eq(3)
    end

    it 'counts the occurrences of numbered lists #005' do
      string = "It also shouldn't have too many contractions, maybe 2. Let's add a list 1. item a 2. item b 3. item c."
      expect(numbered_list(string).occurrences).to eq(3)
    end
  end
end
70 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/slash.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Detects, counts and rewrites whitespace-delimited tokens that contain
  # forward slashes (he/she/it) or backslashes (c:\Users\johndoe).
  #
  # The text is pre-processed once in the constructor: depending on the
  # options, hyperlinks, XHTML and dates are replaced first so that the
  # slashes they contain are not reported.
  class Slash
    # Rubular: http://rubular.com/r/AqvcH29sgg
    FORWARD_SLASH_REGEX = /(?<=\s)(\S+\/)+\S+|(?<=\A)(\S+\/)+\S+/

    # Rubular: http://rubular.com/r/tuFWtdMs4G
    BACKSLASH_REGEX = /\S+\\\S+/

    # Values of the +hyperlink+ option for which hyperlinks are left in place.
    HYPERLINK_PASSTHROUGH = %w(no_special_treatment split_at_period).freeze
    private_constant :HYPERLINK_PASSTHROUGH

    attr_reader :string, :processed_string, :date, :xhtml, :hyperlink

    # @param string [String] the text to analyze (never modified)
    # @param args [Hash] optional settings:
    #   :date      - 'no_special_treatment' keeps dates, anything else replaces them
    #   :xhtml     - 'keep' keeps markup, anything else replaces it
    #   :hyperlink - 'no_special_treatment' or 'split_at_period' keeps
    #                hyperlinks, anything else replaces them
    def initialize(string:, **args)
      @string = string
      @date = args[:date] || nil
      @xhtml = args[:xhtml] || nil
      @hyperlink = args[:hyperlink] || nil
      @processed_string = preprocess(string)
    end

    # @return [Boolean] whether any token contains a forward slash
    def includes_forward_slash?
      !(processed_string =~ FORWARD_SLASH_REGEX).nil?
    end

    # @return [Boolean] whether any token contains a backslash
    def includes_backslash?
      !(processed_string =~ BACKSLASH_REGEX).nil?
    end

    # @return [Integer] number of tokens containing a forward slash
    def forward_slash_occurences
      processed_string.scan(FORWARD_SLASH_REGEX).size
    end

    # Splits every forward-slash token into space separated words.
    #
    # The result is also stored as the new +processed_string+, but it is a
    # fresh String: the text handed to the constructor is left untouched
    # (an in-place +gsub!+ would modify it, or raise if it is frozen, when
    # no pre-processing step produced a copy).
    #
    # @return [String] the rewritten text
    def replace_forward_slashes
      return processed_string if processed_string !~ FORWARD_SLASH_REGEX
      @processed_string = split_forward_slash_tokens(processed_string)
    end

    # Same as #replace_forward_slashes, but number-only dates are turned into
    # the date placeholder first so they are not split. Does not change
    # +processed_string+.
    #
    # @return [String] the rewritten text
    def replace_forward_slashes_except_dates
      return processed_string if processed_string !~ FORWARD_SLASH_REGEX
      without_dates = WordCountAnalyzer::Date.new.replace_number_only_date(processed_string)
      split_forward_slash_tokens(without_dates)
    end

    # @return [Integer] number of tokens containing a backslash
    def backslash_occurences
      processed_string.scan(BACKSLASH_REGEX).size
    end

    # Replaces every backslash token with one ' word ' placeholder per
    # backslash-separated segment. The result becomes the new
    # +processed_string+; the constructor argument is left untouched.
    #
    # @return [String] the rewritten text
    def replace_backslashes
      return processed_string if processed_string !~ BACKSLASH_REGEX
      @processed_string = processed_string.gsub(BACKSLASH_REGEX) do |match|
        ' word ' * match.split(/\\+/).length
      end
    end

    private

    # Applies the hyperlink, XHTML and date replacements (in that order) that
    # the options ask for. When every step is disabled the original string
    # object is returned as is.
    def preprocess(text)
      text = WordCountAnalyzer::Hyperlink.new.replace(text) unless HYPERLINK_PASSTHROUGH.include?(hyperlink)
      text = WordCountAnalyzer::Xhtml.new(string: text).replace unless xhtml.eql?('keep')
      text = WordCountAnalyzer::Date.new.replace(text) unless date.eql?('no_special_treatment')
      text
    end

    # Returns a copy of +text+ in which each forward-slash token has its
    # slash runs replaced by single spaces.
    def split_forward_slash_tokens(text)
      text.gsub(FORWARD_SLASH_REGEX) { |match| match.split(/\/+/).join(' ') }
    end
  end
end
89 |
--------------------------------------------------------------------------------
/spec/word_count_analyzer/punctuation_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe WordCountAnalyzer::Punctuation do
  # Builds the object under test for the given text.
  def punctuation(string)
    described_class.new(string: string)
  end

  # Fixtures shared by the counting and the replacing examples.
  let(:dotted_lines) { "Here is one …………………………………………………………………… and another ......" }
  let(:dashed_lines) { "Here is one ----- and another -----" }
  let(:underscores) { "Here is one ______ and another ______" }
  let(:stray_punctuation) { "Hello world ? This is another - sentence ." }

  # NOTE: the context names below mirror the method names of the class under
  # test, including their spelling.
  context '#dotted_line_ocurrances' do
    it 'returns the number of dotted line occurrences #001' do
      expect(punctuation(dotted_lines).dotted_line_ocurrances).to eq(2)
    end

    it 'returns the number of dotted line occurrences #002' do
      expect(punctuation("Hello world").dotted_line_ocurrances).to eq(0)
    end
  end

  context '#dashed_line_ocurrances' do
    it 'returns the number of dashed line occurrences #001' do
      expect(punctuation(dashed_lines).dashed_line_ocurrances).to eq(2)
    end

    it 'returns the number of dashed line occurrences #002' do
      expect(punctuation("Hello world").dashed_line_ocurrances).to eq(0)
    end
  end

  context '#underscore_ocurrances' do
    it 'returns the number of underscore occurrences #001' do
      expect(punctuation(underscores).underscore_ocurrances).to eq(2)
    end

    it 'returns the number of underscore occurrences #002' do
      expect(punctuation("Hello world").underscore_ocurrances).to eq(0)
    end
  end

  context '#stray_punctuation_occurences' do
    it 'returns the number of stray punctuation occurrences #001' do
      expect(punctuation(stray_punctuation).stray_punctuation_occurences).to eq(3)
    end

    it 'returns the number of stray punctuation occurrences #002' do
      expect(punctuation("Hello world. Great?").stray_punctuation_occurences).to eq(0)
    end

    it 'returns the number of stray punctuation occurrences #003' do
      expect(punctuation(".").stray_punctuation_occurences).to eq(1)
    end
  end

  context '#replace_dotted_line' do
    it 'replaces the dotted lines' do
      expect(punctuation(dotted_lines).replace_dotted_line).to eq("Here is one and another ")
    end
  end

  context '#replace_dashed_line' do
    it 'replaces the dashed lines' do
      expect(punctuation(dashed_lines).replace_dashed_line).to eq("Here is one and another ")
    end
  end

  context '#replace_underscore' do
    it 'replaces the underscores' do
      expect(punctuation(underscores).replace_underscore).to eq("Here is one and another ")
    end
  end

  context '#replace_stray_punctuation' do
    it 'replaces any stray punctuation' do
      expect(punctuation(stray_punctuation).replace_stray_punctuation).to eq("Hello world This is another sentence ")
    end
  end
end
98 |
--------------------------------------------------------------------------------
/spec/word_count_analyzer/slash_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe WordCountAnalyzer::Slash do
  # Builds the object under test with its default options.
  def slash(string)
    described_class.new(string: string)
  end

  context '#includes_forward_slash?' do
    it 'returns true if the string includes a token with a forward slash #001' do
      string = "Using the solidus for he/she/it is often discouraged, except in this case."
      expect(slash(string).includes_forward_slash?).to eq(true)
    end

    it 'returns false if the string does not includes a token with a forward slash #002' do
      expect(slash("Hello world.").includes_forward_slash?).to eq(false)
    end

    it 'ignores hyperlinks #003' do
      expect(slash("http://www.google.com/google").includes_forward_slash?).to eq(false)
    end

    it 'ignores dates #004' do
      expect(slash("Today is 2/15/2013").includes_forward_slash?).to eq(false)
    end
  end

  context '#includes_backslash?' do
    it 'returns true if the string includes a token with a backslash #001' do
      expect(slash('The file location is c:\Users\johndoe.').includes_backslash?).to eq(true)
    end

    it 'returns false if the string does not includes a token with a backslash #002' do
      expect(slash("Hello world.").includes_backslash?).to eq(false)
    end
  end

  context '#forward_slash_occurences' do
    it 'returns the number of occurrences of tokens with a forward slash #001' do
      string = "Using the solidus for he/she/it is often discouraged, except in this case she/he said."
      expect(slash(string).forward_slash_occurences).to eq(2)
    end

    it 'returns the number of occurrences of tokens with a forward slash #002' do
      expect(slash("Hello world.").forward_slash_occurences).to eq(0)
    end
  end

  context '#backslash_occurences' do
    it 'returns the number of occurrences of tokens with a backslash #001' do
      string = 'The file location is c:\Users\johndoe or d:\Users\john\www'
      expect(slash(string).backslash_occurences).to eq(2)
    end

    it 'returns the number of occurrences of tokens with a backslash #002' do
      expect(slash("Hello world.").backslash_occurences).to eq(0)
    end

    it 'returns the number of occurrences of tokens with a backslash #003' do
      expect(slash("Hello world.").backslash_occurences).to eq(0)
    end
  end

  # Named after the method it exercises (there is no
  # #replace_forward_slashes_multiple method).
  context '#replace_forward_slashes' do
    it 'replaces forward slashes with multiple tokens #001' do
      expect(slash("he/she/it").replace_forward_slashes).to eq("he she it")
    end

    it 'replaces forward slashes with multiple tokens #002' do
      expect(slash("hello//world").replace_forward_slashes).to eq("hello world")
    end
  end

  # These examples previously called #replace_forward_slashes, so the method
  # named by the context was never run.
  context '#replace_forward_slashes_except_dates' do
    it 'replaces forward slashes with multiple tokens #001' do
      expect(slash("he/she/it 4/28/2013").replace_forward_slashes_except_dates).to eq("he she it wsdateword ")
    end

    it 'replaces forward slashes with multiple tokens #002' do
      expect(slash("hello//world 4/28/2013").replace_forward_slashes_except_dates).to eq("hello world wsdateword ")
    end
  end
end
106 |
--------------------------------------------------------------------------------
/spec/word_count_analyzer/ellipsis_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe WordCountAnalyzer::Ellipsis do
  # A fresh analyzer per example; the text is passed to each method call.
  let(:ellipsis) { described_class.new }

  # Text containing five ellipses in different styles, shared by the
  # #replace and #occurrences examples.
  let(:mixed_styles) do
    'Using an ellipsis … causes different counts…depending on the style . . . that you use. I never meant that.... She left the store. The practice was not abandoned. . . .'
  end

  context '#includes_ellipsis?(string)' do
    # One example is generated per entry, in this order.
    {
      '#001' => 'Using an ellipsis … causes different counts.',
      '#002' => 'Using an ellipsis causes different counts…depending on the style that you use.',
      '#003' => 'Using an ellipsis causes different counts depending on the style . . . that you use.',
      '#004' => 'Using an ellipsis causes different counts depending on the style . . . . that you use.',
      '#005' => 'Using an ellipsis causes different counts depending on the style.... that you use.',
      '#006' => 'hello world ...',
      '#007' => '...',
      '#008' => '....',
      '#009' => ' . . . ',
      '#010' => ' . . . . ',
      '#011' => '…'
    }.each do |label, string|
      it "returns true if the string includes an ellipsis #{label}" do
        expect(ellipsis.includes_ellipsis?(string)).to eq(true)
      end
    end

    it "returns false if the string doesn't include an ellipsis #012" do
      expect(ellipsis.includes_ellipsis?('Hello world.')).to eq(false)
    end

    it "returns false if the string includes a dotted_line #013" do
      expect(ellipsis.includes_ellipsis?('.....')).to eq(false)
    end

    it "returns false if the string includes a dotted_line #014" do
      string = "Here is one …………………………………………………………………… and another ......"
      expect(ellipsis.includes_ellipsis?(string)).to eq(false)
    end
  end

  context '#replace' do
    it 'returns a string with the ellipsis replaced #001' do
      expect(ellipsis.replace(mixed_styles)).to eq("Using an ellipsis wseword causes different counts wseword depending on the style wseword that you use. I never meant that wseword She left the store. The practice was not abandoned wseword ")
    end
  end

  context '#occurrences' do
    it 'returns the number of ellipsis occurrences #001' do
      expect(ellipsis.occurrences(mixed_styles)).to eq(5)
    end
  end
end
--------------------------------------------------------------------------------
/lib/word_count_analyzer/date.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Finds dates in free text and replaces each one with the placeholder
  # ' wsdateword ', so that a date can be counted as a single unit.
  #
  # Two families of dates are handled: "long" dates built from weekday and
  # month names (e.g. "Monday, April 4, 2011") and number-only dates
  # (e.g. "2/15/2013").
  class Date
    DOW = %w(monday tuesday wednesday thursday friday saturday sunday).freeze
    DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun).freeze
    MONTHS = %w(january february march april may june july august september october november december).freeze
    MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec).freeze

    # Rubular: http://rubular.com/r/73CZ2HU0q6
    DMY_MDY_REGEX = /(\d{1,2}(\/|\.|-)){2}\d{4}\.?/

    # Rubular: http://rubular.com/r/GWbuWXw4t0
    YMD_YDM_REGEX = /\d{4}(\/|\.|-)(\d{1,2}(\/|\.|-)){2}\.?/

    # Rubular: http://rubular.com/r/SRZ27XNlvR
    DIGIT_ONLY_YEAR_FIRST_REGEX = /[12]\d{7}\D\.?/

    # Rubular: http://rubular.com/r/mpVSeaKwdY
    DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D\.?/

    # Text that takes the place of every recognised date.
    PLACEHOLDER = ' wsdateword '.freeze

    # Number-only date patterns, in the order in which they are applied.
    NUMBER_ONLY_PATTERNS = [
      DMY_MDY_REGEX,
      YMD_YDM_REGEX,
      DIGIT_ONLY_YEAR_FIRST_REGEX,
      DIGIT_ONLY_YEAR_LAST_REGEX
    ].freeze

    # Builds the six long-date patterns for one weekday/month pair, in the
    # order in which they are applied. Only the first one uses the weekday.
    def self.long_date_patterns(day, month)
      d = Regexp.escape(day)
      m = Regexp.escape(month)
      [
        /#{d}(,)*\s#{m}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i,
        /#{m}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i,
        /\d{4}\.*\s#{m}\s\d+(rd|th|st)*\.?/i,
        /\d{4}(\.|-|\/)*#{m}(\.|-|\/)*\d+\.?/i,
        /#{m}(\.)*\s\d+(rd|th|st)*\.?/i,
        /\d{2}(\.|-|\/)*#{m}(\.|-|\/)*(\d{4}|\d{2})\.?/i
      ]
    end

    # Builds the patterns for one abbreviated weekday: first combined with
    # every full month name, then with every abbreviated month name (which
    # may be followed by periods).
    def self.abbreviated_day_patterns(day)
      d = Regexp.escape(day)
      with_full_months = MONTHS.map do |month|
        /#{d}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i
      end
      with_short_months = MONTH_ABBR.map do |month|
        /#{d}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+(rd|th|st)*(,)*\s\d{4}\.?/i
      end
      with_full_months + with_short_months
    end
    private_class_method :long_date_patterns, :abbreviated_day_patterns

    # All patterns are compiled once when the class is loaded instead of on
    # every call. The order is: each weekday, each month (full names, then
    # abbreviations), each of the six patterns.
    #
    # NOTE(review): the weekday-independent patterns already fire during the
    # first weekday's pass, so a leading weekday other than DOW.first appears
    # to be left in the text ("Tuesday, April 4, 2011" keeps "Tuesday,").
    # Confirm whether that is intended before changing the order.
    LONG_DATE_PATTERNS = DOW.flat_map do |day|
      (MONTHS + MONTH_ABBR).flat_map { |month| long_date_patterns(day, month) }
    end.freeze

    ABBREVIATED_DAY_PATTERNS = DOW_ABBR.flat_map { |day| abbreviated_day_patterns(day) }.freeze

    private_constant :PLACEHOLDER, :NUMBER_ONLY_PATTERNS,
                     :LONG_DATE_PATTERNS, :ABBREVIATED_DAY_PATTERNS

    # @param text [String]
    # @return [Boolean] whether the text contains a long or number-only date
    def includes_date?(text)
      includes_long_date?(text) || includes_number_only_date?(text)
    end

    # Replaces every recognised date (long dates first, then number-only
    # dates) with the placeholder.
    #
    # @param text [String]
    # @return [String] a new string; +text+ is not modified
    def replace(text)
      redact_regex(redact_dates(text))
    end

    # @param text [String]
    # @return [Integer] number of placeholders left by #replace
    def occurrences(text)
      replace(text).scan(/wsdateword/).size
    end

    # Replaces number-only dates with the placeholder and leaves long dates
    # alone.
    #
    # @param text [String]
    # @return [String] a new string; +text+ is not modified
    def replace_number_only_date(text)
      substitute(text, NUMBER_ONLY_PATTERNS)
    end

    private

    # Redacts long dates. When the text contains the (case-sensitive)
    # substring 'day', the abbreviated-weekday patterns run before the
    # full-weekday ones; otherwise the order is reversed.
    def redact_dates(text)
      if text.include?('day')
        redact_dow(redact_dow_abbr(text))
      else
        redact_dow_abbr(redact_dow(text))
      end
    end

    def redact_regex(text)
      replace_number_only_date(text)
    end

    def redact_dow(text)
      substitute(text, LONG_DATE_PATTERNS)
    end

    def redact_dow_abbr(text)
      substitute(text, ABBREVIATED_DAY_PATTERNS)
    end

    # Applies the patterns one after the other; each substitution works on
    # the result of the previous one.
    def substitute(text, patterns)
      patterns.reduce(text) { |result, pattern| result.gsub(pattern, PLACEHOLDER) }
    end

    def includes_long_date?(text)
      matches_any?(text, LONG_DATE_PATTERNS) || matches_any?(text, ABBREVIATED_DAY_PATTERNS)
    end

    def includes_number_only_date?(text)
      matches_any?(text, NUMBER_ONLY_PATTERNS)
    end

    # The optional trailing period in the patterns cannot change whether a
    # match exists, so the redaction patterns double as detection patterns.
    def matches_any?(text, patterns)
      patterns.any? { |pattern| pattern =~ text }
    end
  end
end
--------------------------------------------------------------------------------
/spec/word_count_analyzer/contraction_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe WordCountAnalyzer::Contraction do
  before do
    @tgr = EngTagger.new
  end

  # Builds the object under test. Extra keyword options (e.g. hyphen:) are
  # forwarded unchanged.
  def contraction(token, following_token, **options)
    described_class.new(token: token, following_token: following_token, tgr: @tgr, **options)
  end

  context '#contraction?' do
    # The "returns true" examples used to share one description; they are
    # numbered so a failure identifies the case.
    it 'returns true if the token is a contraction #001' do
      expect(contraction("when'd", nil, hyphen: nil).contraction?).to eq(true)
    end

    it 'returns true if the token is an irregular contraction' do
      expect(contraction("o'clock", nil, hyphen: nil).contraction?).to eq(true)
    end

    it 'returns false if the token is a possessive and not a contraction' do
      expect(contraction("Bob's", "car", hyphen: nil).contraction?).to eq(false)
    end

    it 'returns true if the token is a contraction #002' do
      expect(contraction("Bob's", "the", hyphen: nil).contraction?).to eq(true)
    end

    it 'returns true if the token is a contraction #003' do
      expect(contraction("Bob's", "open", hyphen: nil).contraction?).to eq(true)
    end

    it 'returns true if the token is a contraction #004' do
      expect(contraction("Don't", "count", hyphen: nil).contraction?).to eq(true)
    end
  end

  context '#expanded_count' do
    it 'returns the count of the contraction expanded #001' do
      expect(contraction("when'd", nil, hyphen: nil).expanded_count).to eq(2)
    end

    it 'returns the count of the contraction expanded #002' do
      expect(contraction("o'clock", nil, hyphen: nil).expanded_count).to eq(3)
    end

    it 'returns the count of the contraction expanded #003' do
      expect(contraction("Bob's", "car", hyphen: nil).expanded_count).to eq(1)
    end

    it 'returns the count of the contraction expanded #004' do
      expect(contraction("Bob's", "the", hyphen: nil).expanded_count).to eq(2)
    end

    it 'returns the count of the contraction expanded #005' do
      expect(contraction("cat-o'-nine-tails", nil, hyphen: 'count_as_one').expanded_count).to eq(1)
    end

    it 'returns the count of the contraction expanded #006' do
      expect(contraction("cat-o'-nine-tails", nil, hyphen: 'count_as_multiple').expanded_count).to eq(4)
    end
  end

  context '#replace' do
    it 'replaces the token with the contraction expanded #001' do
      expect(contraction("cat-o'-nine-tails", nil).replace).to eq("cat-of-nine-tails")
    end

    it 'replaces the token with the contraction expanded #002' do
      expect(contraction("Bob's", "the").replace).to eq(" word word ")
    end

    it 'replaces the token with the contraction expanded #003' do
      expect(contraction("don't", nil).replace).to eq("do not")
    end

    it 'replaces the token with the contraction expanded #004' do
      expect(contraction("hello", nil).replace).to eq("hello")
    end
  end
end
125 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/contraction.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Decides whether a single whitespace-delimited token is an English
  # contraction and, if so, how many words it expands to.
  #
  # Two kinds of contraction are recognised:
  #
  # * "common" contractions - tokens whose downcased form is a key of
  #   CONTRACTIONS (e.g. "don't" => "do not");
  # * "'s" contractions - any other token containing "'s" that is followed
  #   by a word the part-of-speech tagger does not tag as a noun
  #   (e.g. "John's running" as opposed to the possessive "John's car").
  class Contraction
    # Downcased contraction => expanded form.
    CONTRACTIONS = {
      "i'm" => "I am",
      "i'll" => "I will",
      "i'd" => "I would",
      "i've" => "I have",
      "you're" => "you are",
      "you'll" => "you will",
      "you'd" => "you would",
      "you've" => "you have",
      "he's" => "he is",
      "he'll" => "he will",
      "he'd" => "he would",
      "she's" => "she is",
      "she'll" => "she will",
      "she'd" => "she would",
      "it's" => "it is",
      "'tis" => "it is",
      "it'll" => "it will",
      "it'd" => "it would",
      "we're" => "we are",
      "we'll" => "we will",
      "we'd" => "we would",
      "we've" => "we have",
      "they're" => "they are",
      "they'll" => "they will",
      "they'd" => "they would",
      "they've" => "they have",
      "that's" => "that is",
      "that'll" => "that will",
      "that'd" => "that would",
      "who's" => "who is",
      "who'll" => "who will",
      "who'd" => "who would",
      "what's" => "what is",
      "what're" => "what are",
      "what'll" => "what will",
      "what'd" => "what would",
      "where's" => "where is",
      "where'll" => "where will",
      "where'd" => "where would",
      "when's" => "when is",
      "when'll" => "when will",
      "when'd" => "when would",
      "why's" => "why is",
      "why'll" => "why will",
      "why'd" => "why would",
      "how's" => "how is",
      "how'll" => "how will",
      "how'd" => "how would",
      "she'd've" => "she would have",
      "'tisn't" => "it is not",
      "isn't" => "is not",
      "aren't" => "are not",
      "wasn't" => "was not",
      "weren't" => "were not",
      "haven't" => "have not",
      "hasn't" => "has not",
      "hadn't" => "had not",
      "won't" => "will not",
      "wouldn't" => "would not",
      "don't" => "do not",
      "doesn't" => "does not",
      "didn't" => "did not",
      "can't" => "cannot",
      "couldn't" => "could not",
      "shouldn't" => "should not",
      "mightn't" => "might not",
      "mustn't" => "must not",
      "would've" => "would have",
      "should've" => "should have",
      "could've" => "could have",
      "might've" => "might have",
      "must've" => "must have",
      "o'" => "of",
      "o'clock" => "of the clock",
      "ma'am" => "madam",
      "ne'er-do-well" => "never-do-well",
      "cat-o'-nine-tails" => "cat-of-nine-tails",
      "jack-o'-lantern" => "jack-of-the-lantern",
      "will-o'-the-wisp" => "will-of-the-wisp",
      "'twas" => "it was"
    }.freeze

    attr_reader :token, :following_token, :tgr, :hyphen

    # @param token [String] the token to inspect
    # @param following_token [String, nil] the token that comes next in the
    #   text, or nil when +token+ is the last one
    # @param tgr [#add_tags] part-of-speech tagger; only +add_tags+ is called
    # @param args [Hash] optional +:hyphen+ setting; 'count_as_one' (default)
    #   counts a hyphenated expansion as one word, any other value splits the
    #   expansion at hyphens when counting
    def initialize(token:, following_token:, tgr:, **args)
      @token = token
      @following_token = following_token
      @tgr = tgr
      @hyphen = args[:hyphen] || 'count_as_one'
    end

    # @return [Boolean] true when the token is a common contraction or an
    #   "'s" contraction
    def contraction?
      common_contraction? || apostrophe_s_contraction?
    end

    # @return [Integer] number of words the token counts as once expanded:
    #   the length of the expansion for common contractions, 2 for an "'s"
    #   contraction, 1 for anything else
    def expanded_count
      return calculate_contraction_length if common_contraction?

      apostrophe_s_contraction? ? 2 : 1
    end

    # @return [String] the expansion of a common contraction, the two-word
    #   placeholder ' word word ' for an "'s" contraction, or the token
    #   itself when it is not a contraction
    def replace
      return expansion if common_contraction?
      return ' word word ' if apostrophe_s_contraction?

      token
    end

    private

    # Expanded form of a common contraction (nil for any other token).
    def expansion
      CONTRACTIONS[token.downcase]
    end

    def calculate_contraction_length
      hyphen.eql?('count_as_one') ? contraction_length : contraction_length_hyphen
    end

    # Word count of the expansion, splitting on spaces only.
    def contraction_length
      expansion.split(' ').length
    end

    # Word count of the expansion, splitting on spaces and on hyphens.
    def contraction_length_hyphen
      expansion.split(' ').flat_map { |word| word.split('-') }.length
    end

    def common_contraction?
      CONTRACTIONS.key?(token.downcase)
    end

    # True for a token containing "'s" that should be read as a contraction
    # ("is"/"has") rather than a possessive.
    def apostrophe_s_contraction?
      return false unless apostrophe_s_token?
      # When nothing follows the token there is no complement for an
      # "is"/"has" reading, so a trailing "'s" is treated as a possessive.
      # This also avoids asking the tagger about a nil token.
      return false if following_token.nil?

      following_is_not_a_noun?
    end

    # Looks at the second character of the tagger output for the following
    # token and compares it with 'n'.
    # NOTE(review): presumably the output has the form "<tag>word</tag>", so
    # that character is the first letter of the tag - confirm against the
    # tagger in use.
    def following_is_not_a_noun?
      tagged = tgr.add_tags(following_token)
      # If the tagger yields nothing, the token cannot be shown to be a noun.
      return true if tagged.nil?

      !tagged[1].to_s.downcase.eql?('n')
    end

    def apostrophe_s_token?
      token.include?("'s")
    end
  end
end
--------------------------------------------------------------------------------
/spec/word_count_analyzer/date_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe WordCountAnalyzer::Date do
  context '#includes_date?(string)' do
    # Every sentence below contains a date in a different format. One example
    # is generated per sentence, numbered #001..#036 in list order.
    [
      'Today is Monday, April 4th, 2011, aka 04/04/2011.',
      'Today is Monday April 4th 2011.',
      'Today is April 4th, 2011.',
      'Today is Mon., Apr. 4, 2011.',
      'Today is 04/04/2011.',
      'Today is 04.04.2011.',
      'Today is 2011.04.04.',
      'Today is 2011/04/04.',
      'Today is 2011-04-04.',
      'Today is 04-04-2011.',
      'Today is 2003 November 9.',
      'Today is 2003Nov9.',
      'Today is 2003Nov09.',
      'Today is 2003-Nov-9.',
      'Today is 2003-Nov-09.',
      'Today is 2003-Nov-9, Sunday.',
      'Today is 2003. november 9.',
      'Today is 2003.11.9.',
      'Today is Monday, Apr. 4, 2011.',
      'Today is 2003/11/09.',
      'Today is 20030109.',
      'Today is 01092003.',
      'Today is Sunday, November 9, 2014.',
      'Today is November 9, 2014.',
      'Today is Nov. 9, 2014.',
      'Today is july 1st.',
      'Today is jul. 1st.',
      'Today is 8 November 2014.',
      'Today is 8. November 2014.',
      'Today is 08-Nov-2014.',
      'Today is 08Nov14.',
      'Today is 8th November 2014.',
      'Today is the 8th of November 2014.',
      'Today is 08/Nov/2014.',
      'Today is Sunday, 8 November 2014.',
      'Today is 8 November 2014.'
    ].each_with_index do |sentence, index|
      it format('returns true if the string includes a date #%03d', index + 1) do
        analyzer = WordCountAnalyzer::Date.new
        expect(analyzer.includes_date?(sentence)).to eq(true)
      end
    end

    it 'returns false if the string does not include a date #037' do
      sentence = 'Hello world. There is no date here - $50,000. The sun is hot.'
      analyzer = WordCountAnalyzer::Date.new
      expect(analyzer.includes_date?(sentence)).to eq(false)
    end
  end

  context '#occurrences' do
    # sentence => expected number of dates found in it
    {
      'Today is Sunday, 8 November 2014.' => 1,
      'Today is Sunday, 8 November 2014. Yesterday was 07/Nov/2014.' => 2
    }.each_with_index do |(sentence, expected), index|
      it format('counts the date occurrences in a string #%03d', index + 1) do
        analyzer = WordCountAnalyzer::Date.new
        expect(analyzer.occurrences(sentence)).to eq(expected)
      end
    end
  end

  context '#replace' do
    # sentence => expected text once each date is swapped for the placeholder
    {
      'Today is Tues. March 3rd, 2011.' => 'Today is wsdateword ',
      'The scavenger hunt ends on Dec. 31st, 2011.' => 'The scavenger hunt ends on wsdateword '
    }.each_with_index do |(sentence, expected), index|
      it format('replaces the date occurrences in a string #%03d', index + 1) do
        analyzer = WordCountAnalyzer::Date.new
        expect(analyzer.replace(sentence)).to eq(expected)
      end
    end
  end

  context '#replace_number_only_date' do
    it 'replaces only the number date occurrences in a string' do
      sentence = 'Today is Tues. March 3rd, 2011. 4/28/2013'
      analyzer = WordCountAnalyzer::Date.new
      expect(analyzer.replace_number_only_date(sentence)).to eq('Today is Tues. March 3rd, 2011. wsdateword ')
    end
  end
end
264 |
--------------------------------------------------------------------------------
/lib/word_count_analyzer/counter.rb:
--------------------------------------------------------------------------------
module WordCountAnalyzer
  # Counts the words in a string, with one setting per word count "gray
  # area" (ellipses, hyperlinks, contractions, dates, ...).
  #
  # Each setting is a String; passing a value that is not one of the options
  # named in the corresponding error message makes the count raise a
  # RuntimeError.
  #
  # NOTE: #pages_count and #mword_count overwrite the settings stored on the
  # instance, and the new values remain in effect for later calls on the same
  # instance (including #count).
  class Counter
    attr_reader :ellipsis, :hyperlink, :contraction, :hyphenated_word, :date, :number, :numbered_list, :xhtml, :forward_slash, :backslash, :dotted_line, :dashed_line, :underscore, :stray_punctuation, :equal_sign

    # @param args [Hash] settings keyed by gray-area name; any setting that
    #   is missing (or nil/false) falls back to the default shown below.
    #   +equal_sign+ is not configurable here and starts as 'ignore'.
    def initialize(**args)
      @ellipsis = args[:ellipsis] || 'ignore'
      @hyperlink = args[:hyperlink] || 'count_as_one'
      @contraction = args[:contraction] || 'count_as_one'
      @hyphenated_word = args[:hyphenated_word] || 'count_as_one'
      @date = args[:date] || 'no_special_treatment'
      @number = args[:number] || 'count'
      @numbered_list = args[:numbered_list] || 'count'
      @xhtml = args[:xhtml] || 'remove'
      @forward_slash = args[:forward_slash] || 'count_as_multiple_except_dates'
      @backslash = args[:backslash] || 'count_as_one'
      @dotted_line = args[:dotted_line] || 'ignore'
      @dashed_line = args[:dashed_line] || 'ignore'
      @underscore = args[:underscore] || 'ignore'
      @stray_punctuation = args[:stray_punctuation] || 'ignore'
      @equal_sign = 'ignore'
      @tgr = EngTagger.new
    end

    # Counts the words in +text+ using the instance's current settings.
    #
    # @param text [String]
    # @return [Integer]
    def count(text)
      word_count(text)
    end

    # Replaces every setting with the preset intended to approximate the word
    # count of Pages, then counts the words in +text+.
    #
    # @param text [String]
    # @return [Integer]
    def pages_count(text)
      @ellipsis = 'ignore'
      @hyperlink = 'split_at_period'
      @contraction = 'count_as_one'
      @hyphenated_word = 'count_as_multiple'
      @date = 'no_special_treatment'
      @number = 'count'
      @numbered_list = 'ignore'
      @xhtml = 'keep'
      @forward_slash = 'count_as_multiple'
      @backslash = 'count_as_multiple'
      @dotted_line = 'ignore'
      @dashed_line = 'ignore'
      @underscore = 'ignore'
      @stray_punctuation = 'ignore'
      @equal_sign = 'break'
      word_count(text)
    end

    # Replaces every setting with the preset intended to approximate the word
    # count of Microsoft Word and wc (Unix), then counts the words in +text+.
    #
    # @param text [String]
    # @return [Integer]
    def mword_count(text)
      @ellipsis = 'no_special_treatment'
      @hyperlink = 'count_as_one'
      @contraction = 'count_as_one'
      @hyphenated_word = 'count_as_one'
      @date = 'no_special_treatment'
      @number = 'count'
      @numbered_list = 'count'
      @xhtml = 'keep'
      @forward_slash = 'count_as_one'
      @backslash = 'count_as_one'
      @dotted_line = 'count'
      @dashed_line = 'count'
      @underscore = 'count'
      @stray_punctuation = 'count'
      # Reset explicitly: a previous #pages_count on this instance leaves
      # 'break' behind, which would otherwise keep splitting words at "=".
      @equal_sign = 'ignore'
      word_count(text)
    end

    private

    # Runs the text through every gray-area processor in a fixed order and
    # counts the whitespace-separated tokens that remain.
    def word_count(text)
      processed_text = process_ellipsis(text)
      processed_text = process_hyperlink(processed_text)
      processed_text = process_contraction(processed_text, @tgr)
      processed_text = process_date(processed_text)
      processed_text = process_number_list(processed_text)
      processed_text = process_number(processed_text)
      processed_text = process_xhtml(processed_text)
      processed_text = process_forward_slash(processed_text)
      processed_text = process_backslash(processed_text)
      processed_text = process_hyphenated_word(processed_text)
      processed_text = process_dotted_line(processed_text)
      processed_text = process_dashed_line(processed_text)
      processed_text = process_underscore(processed_text)
      processed_text = process_stray_punctuation(processed_text)
      processed_text = process_equal_sign(processed_text) if @equal_sign.eql?('break')
      # A leading run of whitespace yields an empty first token; drop it.
      processed_text.split(/\s+/).reject(&:empty?).size
    end

    def process_ellipsis(txt)
      case ellipsis
      when 'ignore'
        # The replacer marks each ellipsis with a placeholder, which is then
        # deleted so it adds nothing to the count.
        WordCountAnalyzer::Ellipsis.new.replace(txt).gsub(/wseword/, '')
      when 'no_special_treatment'
        txt
      else
        raise 'The value you specified for ellipsis is not a valid option. Please use either `ignore` or `no_special_treatment`. The default option is `ignore`'
      end
    end

    def process_hyperlink(txt)
      case hyperlink
      when 'count_as_one'
        WordCountAnalyzer::Hyperlink.new.replace(txt)
      when 'split_at_period'
        WordCountAnalyzer::Hyperlink.new.replace_split_at_period(txt)
      when 'no_special_treatment'
        txt
      else
        raise 'The value you specified for hyperlink is not a valid option. Please use either `count_as_one`, `split_at_period`, or `no_special_treatment`. The default option is `count_as_one`'
      end
    end

    def process_contraction(txt, tgr)
      case contraction
      when 'count_as_one'
        txt
      when 'count_as_multiple'
        tokens = txt.split(/\s+/)
        expanded = tokens.each_with_index.map do |token, i|
          # tokens[i + 1] is nil for the last token.
          WordCountAnalyzer::Contraction.new(token: token, following_token: tokens[i + 1], tgr: tgr).replace
        end
        expanded.join(' ')
      else
        raise 'The value you specified for contraction is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`'
      end
    end

    def process_hyphenated_word(txt)
      case hyphenated_word
      when 'count_as_one'
        txt
      when 'count_as_multiple'
        txt.split(/\s+/).map { |token| WordCountAnalyzer::HyphenatedWord.new(token: token).replace }.join(' ')
      else
        raise 'The value you specified for hyphenated_word is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`'
      end
    end

    def process_date(txt)
      case date
      when 'no_special_treatment'
        txt
      when 'count_as_one'
        WordCountAnalyzer::Date.new.replace(txt)
      else
        raise 'The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`'
      end
    end

    def process_number(txt)
      case number
      when 'ignore'
        # Numbers are marked with a placeholder, which is then deleted.
        WordCountAnalyzer::Number.new(string: txt).replace.gsub(/wsnumword/, '')
      when 'count'
        txt
      else
        raise 'The value you specified for number is not a valid option. Please use either `ignore` or `count`. The default option is `count`'
      end
    end

    def process_number_list(txt)
      case numbered_list
      when 'ignore'
        WordCountAnalyzer::NumberedList.new(string: txt).replace
      when 'count'
        txt
      else
        raise 'The value you specified for numbered_list is not a valid option. Please use either `ignore` or `count`. The default option is `count`'
      end
    end

    def process_xhtml(txt)
      case xhtml
      when 'remove'
        WordCountAnalyzer::Xhtml.new(string: txt).replace
      when 'keep'
        txt
      else
        raise 'The value you specified for xhtml is not a valid option. Please use either `remove` or `keep`. The default option is `remove`'
      end
    end

    def process_forward_slash(txt)
      case forward_slash
      when 'count_as_multiple'
        WordCountAnalyzer::Slash.new(string: txt, date: date, xhtml: xhtml, hyperlink: hyperlink).replace_forward_slashes
      when 'count_as_multiple_except_dates'
        WordCountAnalyzer::Slash.new(string: txt, date: date, xhtml: xhtml, hyperlink: hyperlink).replace_forward_slashes_except_dates
      when 'count_as_one'
        txt
      else
        raise 'The value you specified for forward_slash is not a valid option. Please use either `count_as_one`, `count_as_multiple` or `count_as_multiple_except_dates`. The default option is `count_as_multiple_except_dates`'
      end
    end

    def process_backslash(txt)
      case backslash
      when 'count_as_multiple'
        WordCountAnalyzer::Slash.new(string: txt, date: date, xhtml: xhtml, hyperlink: hyperlink).replace_backslashes
      when 'count_as_one'
        txt
      else
        raise 'The value you specified for backslash is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`'
      end
    end

    def process_dotted_line(txt)
      case dotted_line
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_dotted_line
      when 'count'
        txt
      else
        raise 'The value you specified for dotted_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end

    def process_dashed_line(txt)
      case dashed_line
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_dashed_line
      when 'count'
        txt
      else
        raise 'The value you specified for dashed_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end

    def process_underscore(txt)
      case underscore
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_underscore
      when 'count'
        txt
      else
        raise 'The value you specified for underscore is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end

    def process_stray_punctuation(txt)
      case stray_punctuation
      when 'ignore'
        WordCountAnalyzer::Punctuation.new(string: txt).replace_stray_punctuation
      when 'count'
        txt
      else
        raise 'The value you specified for stray_punctuation is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`'
      end
    end

    # Turns every "=" into a space, and every ">" that is directly followed by
    # an ASCII letter into a space, so the pieces count as separate words.
    def process_equal_sign(txt)
      # The class is A-Z on purpose: "A-z" would also match [ \ ] ^ _ and `.
      txt.split('=').join(' ').split(/>(?=[a-zA-Z])/).join(' ')
    end
  end
end
237 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Word Count Analyzer
2 |
3 | [](http://badge.fury.io/rb/word_count_analyzer) [](https://travis-ci.org/diasks2/word_count_analyzer) [](https://github.com/diasks2/word_count_analyzer/blob/master/LICENSE.txt)
4 |
5 | See what word count [gray areas](#gray-area-details) might be affecting your word count.
6 |
7 | Word Count Analyzer is a Ruby gem that analyzes a string for potential areas of the text that might cause word count discrepancies depending on the tool used. It also provides comprehensive configuration options so you can easily customize how different gray areas should be counted and find the right word count for your purposes.
8 |
9 | If you prioritize speed over accuracy, then I recommend not using this gem. There are most definitely faster gems for getting a word count. However, if accuracy is important, and you want control over the gray areas that affect word count, then this gem is for you.
10 |
11 | ## Install
12 |
13 | **Ruby**
14 | *Supports Ruby 2.1.0 and above*
15 | ```
16 | gem install word_count_analyzer
17 | ```
18 |
19 | **Ruby on Rails**
20 | Add this line to your application’s Gemfile:
21 | ```ruby
22 | gem 'word_count_analyzer'
23 | ```
24 |
25 | ## Live Demo
26 |
27 | Try out a [live demo](https://www.tm-town.com/word-count-analyzer) of Word Count Analyzer in the browser.
28 |
29 | ## Usage
30 |
31 | ### Analyze the word count gray areas of a string
32 |
33 | Common word count gray areas include (*[more details below](#gray-area-details)*):
34 | - Ellipses
35 | - Hyperlinks
36 | - Contractions
37 | - Hyphenated Words
38 | - Dates
39 | - Numbers
40 | - Numbered Lists
41 | - XML and HTML tags
42 | - Forward slashes and backslashes
43 | - Punctuation
44 |
45 | Other gray areas not covered by this gem:
46 | - Headers
47 | - Footers
48 | - Hidden Text (*specific to Microsoft Word*)
49 |
50 | ```ruby
51 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 4. Some HTML and a hyphenated-word. Don't count stray punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
52 | WordCountAnalyzer::Analyzer.new.analyze(text)
53 |
54 | # => {
55 | # "ellipsis": 1,
56 | # "hyperlink": 2,
57 | # "contraction": 4,
58 | # "hyphenated_word": 2,
59 | # "date": 2,
60 | # "number": 1,
61 | # "numbered_list": 3,
62 | # "xhtml": 1,
63 | # "forward_slash": 1,
64 | # "backslash": 1,
65 | # "dotted_line": 1,
66 | # "dashed_line": 1,
67 | # "underscore": 1,
68 | # "stray_punctuation": 5
69 | # }
70 | ```
71 |
72 | ### Count the words in a string
73 |
74 | ```ruby
75 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list \n\n1. item a \n\n2. item b \n\n3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
76 |
77 | WordCountAnalyzer::Counter.new.count(text)
78 | # => 64
79 |
80 | # Overrides all settings to match the way Pages handles word count.
81 | # N.B. The developers of Pages may change the algorithm at any time, so this should be treated only as an approximation.
82 | WordCountAnalyzer::Counter.new.pages_count(text)
83 | # => 76 (or 79 if the list items are not formatted as a list)
84 |
85 | # Overrides all settings to match the way Microsoft Word and wc (Unix) handle word count.
86 | # N.B. The developers of these tools may change the algorithm at any time, so this should be treated only as an approximation.
87 | WordCountAnalyzer::Counter.new.mword_count(text)
88 | # => 71
89 |
90 | # Highly configurable (see all options below)
91 | WordCountAnalyzer::Counter.new(
92 | ellipsis: 'no_special_treatment',
93 | hyperlink: 'no_special_treatment',
94 | contraction: 'count_as_multiple',
95 | hyphenated_word: 'count_as_multiple',
96 | date: 'count_as_one',
97 | number: 'ignore',
98 | numbered_list: 'ignore',
99 | xhtml: 'keep',
100 | forward_slash: 'count_as_multiple',
101 | backslash: 'count_as_multiple',
102 | dotted_line: 'count',
103 | dashed_line: 'count',
104 | underscore: 'count',
105 | stray_punctuation: 'count'
106 | ).count(text)
107 |
108 | # => 77
109 | ```
110 |
111 | #### Counter `options`
112 |
113 | ##### `ellipsis`
114 | **default** = `'ignore'`
115 | - `'ignore'`
116 | Ignores all ellipses in the word count total.
117 | - `'no_special_treatment'`
118 | Ellipses will not be searched for in the string.
119 |
120 |
121 |
122 | ##### `hyperlink`
123 | **default** = `'count_as_one'`
124 | - `'count_as_one'`
125 | Counts a hyperlink as one word.
126 | - `'no_special_treatment'`
127 | Hyperlinks will not be searched for in the string. Therefore, how a hyperlink is handled in the word count will depend on other settings (mainly slashes).
128 | - `'split_at_period'`
129 | Pages will split hyperlinks at a period and count each token as a separate word.
130 |
131 |
132 |
133 | ##### `contraction`
134 | **default** = `'count_as_one'`
135 | - `'count_as_one'`
136 | Counts a contraction as one word.
137 | - `'count_as_multiple'`
138 | Splits a contraction into the words that make it up. Examples:
139 | - `don't` => `do not` (2 words)
140 | - `o'clock` => `of the clock` (3 words)
141 |
142 |
143 |
144 | ##### `hyphenated_word`
145 | **default** = `'count_as_one'`
146 | - `'count_as_one'`
147 | Counts a hyphenated word as one word.
148 | - `'count_as_multiple'`
149 | Breaks a hyphenated word at each hyphen and counts each word separately. Example:
150 | - `devil-may-care` (3 words)
151 |
152 |
153 |
154 | ##### `date`
155 | **default** = `'no_special_treatment'`
156 | - `'no_special_treatment'`
157 | Dates will not be searched for in the string. Therefore, how a date is handled in the word count will depend on other settings.
158 | - `'count_as_one'`
159 | Counts a date as one word. This is more commonly seen in translation CAT tools where a date is thought of as a *placeable* that can usually be automatically translated. Examples:
160 | - Monday, April 4th, 2011 (1 word)
161 | - April 4th, 2011 (1 word)
162 | - 04/04/2011 (1 word)
163 | - 04.04.2011 (1 word)
164 | - 2011/04/04 (1 word)
165 | - 2011-04-04 (1 word)
166 | - 2003Nov9 (1 word)
167 | - 2003 November 9 (1 word)
168 | - 2003-Nov-9 (1 word)
169 | - and others...
170 |
171 |
172 |
173 | ##### `number`
174 | **default** = `'count'`
175 | - `'count'`
176 | Counts a number as one word.
177 | - `'ignore'`
178 | Ignores any numbers in the string (with the exception of `dates` and `numbered_lists`) and does not count them towards the word count.
179 |
180 |
181 |
182 | ##### `numbered_list`
183 | **default** = `'count'`
184 | - `'count'`
185 | Counts a number in a numbered list as one word.
186 | - `'ignore'`
187 | Ignores any numbers that are part of a numbered list and does not count them towards the word count.
188 |
189 |
190 |
191 | ##### `xhtml`
192 | **default** = `'remove'`
193 | - `'remove'`
194 | Removes any XML or HTML opening and closing tags from the string.
195 | - `'keep'`
196 | XML and HTML will not be searched for in the string. Any tags are left in place, so how they are counted will depend on other settings.
197 |
198 |
199 |
200 | ##### `forward_slash`
201 | **default** = `'count_as_multiple_except_dates'`
202 | - `'count_as_multiple_except_dates'`
203 | Separates any tokens that include a forward slash (except dates) at the slash(s) and counts each token individually. Example:
204 | - she/he/it 4/25/2014 (4 words)
205 | - `'count_as_multiple'`
206 | Separates any tokens that include a forward slash at the slash(s) and counts each token individually. Whether dates, hyperlinks and xhtml are included depends on what is set for those options. Example:
207 | - she/he/it (3 words)
208 | - `'count_as_one'`
209 | Counts any tokens that include a forward slash as one word. Example:
210 | - she/he/it (1 word)
211 |
212 |
213 |
214 | ##### `backslash`
215 | **default** = `'count_as_one'`
216 | - `'count_as_one'`
217 | Counts any tokens that include a backslash as one word. Example:
218 | - c:\Users\johndoe (1 word)
219 | - `'count_as_multiple'`
220 | Separates any tokens that include a backslash at the slash(s) and counts each token individually. Example:
221 | - c:\Users\johndoe (3 words)
222 |
223 |
224 |
225 | ##### `dotted_line`
226 | **default** = `'ignore'`
227 | - `'ignore'`
228 | Ignores any dotted lines in the string and does not count them towards the word count.
229 | - `'count'`
230 | Counts a dotted line as one word.
231 |
232 |
233 |
234 | ##### `dashed_line`
235 | **default** = `'ignore'`
236 | - `'ignore'`
237 | Ignores any dashed lines in the string and does not count them towards the word count.
238 | - `'count'`
239 | Counts a dashed line as one word.
240 |
241 |
242 |
243 | ##### `underscore`
244 | **default** = `'ignore'`
245 | - `'ignore'`
246 | Ignores any series of underscores in the string and does not count them towards the word count.
247 | - `'count'`
248 | Counts a series of underscores as one word.
249 |
250 |
251 |
252 | ##### `stray_punctuation`
253 | **default** = `'ignore'`
254 | - `'ignore'`
255 | Ignores any punctuation marks surrounded on both sides by whitespace in the string and does not count them towards the word count.
256 | - `'count'`
257 | Counts a punctuation mark surrounded on both sides by whitespace as one word.
258 |
259 | ### Gray Area Details
260 |
261 | #### Ellipsis
262 |
263 | Checks for any occurrences of ellipses in your text. Writers tend to use different formats for ellipsis, and although there are [style guides](http://www.thepunctuationguide.com/ellipses.html), it is rare that these rules are followed.
264 |
265 | ##### Three Consecutive Periods
266 | ```
267 | ...
268 | ```
269 | Tool | Word Count
270 | -------------- | ----------
271 | Microsoft Word | 1
272 | Pages | 0
273 | wc (Unix) | 1
274 |
275 | ##### Four Consecutive Periods
276 | ```
277 | ....
278 | ```
279 | Tool | Word Count
280 | -------------- | ----------
281 | Microsoft Word | 1
282 | Pages | 0
283 | wc (Unix) | 1
284 |
285 | ##### Three Periods With Spaces
286 | ```
287 | . . .
288 | ```
289 | Tool | Word Count
290 | -------------- | ----------
291 | Microsoft Word | 3
292 | Pages | 0
293 | wc (Unix) | 3
294 |
295 | ##### Four Periods With Spaces
296 | ```
297 | . . . .
298 | ```
299 | Tool | Word Count
300 | -------------- | ----------
301 | Microsoft Word | 4
302 | Pages | 0
303 | wc (Unix) | 4
304 |
305 | ##### Horizontal Ellipsis
306 | ```
307 | …
308 | ```
309 | Tool | Word Count
310 | -------------- | ----------
311 | Microsoft Word | 1
312 | Pages | 0
313 | wc (Unix) | 1
314 |
315 | #### Hyperlink
316 |
317 | ```
318 | http://www.example.com
319 | ```
320 | Tool | Word Count
321 | -------------- | ----------
322 | Microsoft Word | 1
323 | Pages | 4
324 | wc (Unix) | 1
325 |
326 | #### Contraction
327 |
328 | Most tools count contractions as one word. [Some might argue](http://english.stackexchange.com/questions/80635/counting-contractions-as-one-or-two-words) a contraction is technically more than one word.
329 |
330 | ```
331 | can't
332 | ```
333 | Tool | Word Count
334 | -------------- | ----------
335 | Microsoft Word | 1
336 | Pages | 1
337 | wc (Unix) | 1
338 |
339 | #### Hyphenated Word
340 |
341 | ```
342 | devil-may-care
343 | ```
344 | Tool | Word Count
345 | -------------- | ----------
346 | Microsoft Word | 1
347 | Pages | 3
348 | wc (Unix) | 1
349 |
350 | #### Date
351 |
352 | Most word processing tools do not recognize dates, but translation CAT tools tend to recognize dates as one word or [placeable](http://www.wordfast.net/wiki/Placeables). This gem checks for many date formats including those that include day or month abbreviations. A few examples are listed below (*not an exhaustive list*).
353 |
354 | ##### Date (example A)
355 | ```
356 | Monday, April 4th, 2011
357 | ```
358 | Tool | Word Count
359 | -------------- | ----------
360 | Microsoft Word | 4
361 | Pages | 4
362 | wc (Unix) | 4
363 |
364 | ##### Date (example B)
365 | ```
366 | 04/04/2011
367 | ```
368 | Tool | Word Count
369 | -------------- | ----------
370 | Microsoft Word | 1
371 | Pages | 3
372 | wc (Unix) | 1
373 |
374 | ##### Date (example C)
375 | ```
376 | 04.04.2011
377 | ```
378 | Tool | Word Count
379 | -------------- | ----------
380 | Microsoft Word | 1
381 | Pages | 1
382 | wc (Unix) | 1
383 |
384 | #### Number
385 |
386 | ##### Simple number
387 | ```
388 | 200
389 | ```
390 | Tool | Word Count
391 | -------------- | ----------
392 | Microsoft Word | 1
393 | Pages | 1
394 | wc (Unix) | 1
395 |
396 | ##### Number with preceding unit
397 | ```
398 | $200
399 | ```
400 | Tool | Word Count
401 | -------------- | ----------
402 | Microsoft Word | 1
403 | Pages | 1
404 | wc (Unix) | 1
405 |
406 |
407 | ##### Number with unit following
408 | ```
409 | 50%
410 | ```
411 | Tool | Word Count
412 | -------------- | ----------
413 | Microsoft Word | 1
414 | Pages | 1
415 | wc (Unix) | 1
416 |
417 | #### Numbered List
418 |
419 | ```
420 | 1. List item a
421 | 2. List item b
422 | 3. List item c
423 | ```
424 | Tool | Word Count
425 | -------------- | ----------
426 | Microsoft Word | 12
427 | Pages | 9
428 | wc (Unix) | 12
429 |
430 | #### XML and HTML Tags
431 |
432 | ```html
433 | Hello world Hello
434 | ```
435 | Tool | Word Count
436 | -------------- | ----------
437 | Microsoft Word | 4
438 | Pages | 12
439 | wc (Unix) | 4
440 |
441 | #### Slashes
442 |
443 | ##### Forward slash
444 | ```
445 | she/he/it
446 | ```
447 | Tool | Word Count
448 | -------------- | ----------
449 | Microsoft Word | 1
450 | Pages | 3
451 | wc (Unix) | 1
452 |
453 | ##### Backslash
454 | ```
455 | c:\Users\johndoe
456 | ```
457 | Tool | Word Count
458 | -------------- | ----------
459 | Microsoft Word | 1
460 | Pages | 3
461 | wc (Unix) | 1
462 |
463 | #### Punctuation
464 |
465 | ##### Dotted line
466 | ```
467 | .........
468 | ```
469 | Tool | Word Count
470 | -------------- | ----------
471 | Microsoft Word | 1
472 | Pages | 0
473 | wc (Unix) | 1
474 |
475 | ```
476 | ………………………
477 | ```
478 | Tool | Word Count
479 | -------------- | ----------
480 | Microsoft Word | 1
481 | Pages | 0
482 | wc (Unix) | 1
483 |
484 | ##### Dashed line
485 | ```
486 | -----------
487 | ```
488 | Tool | Word Count
489 | -------------- | ----------
490 | Microsoft Word | 1
491 | Pages | 0
492 | wc (Unix) | 1
493 |
494 | ##### Underscore
495 | ```
496 | ____________
497 | ```
498 | Tool | Word Count
499 | -------------- | ----------
500 | Microsoft Word | 1
501 | Pages | 0
502 | wc (Unix) | 1
503 |
504 | ##### Punctuation mark surrounded by spaces
505 | ```
506 | :
507 | ```
508 | Tool | Word Count
509 | -------------- | ----------
510 | Microsoft Word | 1
511 | Pages | 0
512 | wc (Unix) | 1
513 |
514 | ## Research
515 |
516 | - *[So how many words do you think it is?](http://multifarious.filkin.com/2012/11/13/wordcount)* - Paul Filkin
517 | - [Word Count](http://en.wikipedia.org/wiki/Word_count) - Wikipedia
518 | - [Words Counted Ruby Gem](https://github.com/abitdodgy/words_counted) - Mohamad El-Husseini
519 |
520 | ## TODO
521 |
522 | - Add language support for languages other than English
523 | - For most languages this is probably as simple as adding in the translations and abbreviations for months and days.
524 | - For languages that use a character count (Japanese, Chinese) there will be larger changes. These languages also need an option for how to handle Roman words within the text.
525 | - Improve performance for longer strings (potentially break string into smaller parts and then sum total of each)
526 |
527 | ## Contributing
528 |
529 | 1. Fork it ( https://github.com/diasks2/word_count_analyzer/fork )
530 | 2. Create your feature branch (`git checkout -b my-new-feature`)
531 | 3. Commit your changes (`git commit -am 'Add some feature'`)
532 | 4. Push to the branch (`git push origin my-new-feature`)
533 | 5. Create a new Pull Request
534 |
535 | ## License
536 |
537 | The MIT License (MIT)
538 |
539 | Copyright (c) 2015 Kevin S. Dias
540 |
541 | Permission is hereby granted, free of charge, to any person obtaining a copy
542 | of this software and associated documentation files (the "Software"), to deal
543 | in the Software without restriction, including without limitation the rights
544 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
545 | copies of the Software, and to permit persons to whom the Software is
546 | furnished to do so, subject to the following conditions:
547 |
548 | The above copyright notice and this permission notice shall be included in
549 | all copies or substantial portions of the Software.
550 |
551 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
552 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
553 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
554 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
555 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
556 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
557 | THE SOFTWARE.
558 |
--------------------------------------------------------------------------------
/spec/word_count_analyzer/counter_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WordCountAnalyzer::Counter do
4 | context 'ellipsis' do
5 | it 'handles an invalid ellipsis argument value' do
6 | text = 'hello world.'
7 | ws = WordCountAnalyzer::Counter.new(ellipsis: 'hello')
8 | expect { ws.count(text) }.to raise_error('The value you specified for ellipsis is not a valid option. Please use either `ignore` or `no_special_treatment`. The default option is `ignore`')
9 | end
10 |
11 | it 'ignores ellipses in the word count' do
12 | text = 'hello world ... what day is it.'
13 | ws = WordCountAnalyzer::Counter.new(
14 | ellipsis: 'ignore'
15 | )
16 | expect(ws.count(text)).to eq(6)
17 | end
18 |
19 | it 'does not ignore ellipses in the word count' do
20 | text = 'hello world ... what day is it.'
21 | ws = WordCountAnalyzer::Counter.new(
22 | ellipsis: 'no_special_treatment'
23 | )
24 | expect(ws.count(text)).to eq(7)
25 | end
26 |
27 | it 'does not ignore ellipses in the word count' do
28 | text = 'hello world... what day is it.'
29 | ws = WordCountAnalyzer::Counter.new(
30 | ellipsis: 'no_special_treatment'
31 | )
32 | expect(ws.count(text)).to eq(6)
33 | end
34 |
35 | it 'sets ignore as the default option' do
36 | text = 'hello world ... what day is it.'
37 | ws = WordCountAnalyzer::Counter.new
38 | expect(ws.count(text)).to eq(6)
39 | end
40 | end
41 |
42 | context 'hyperlink' do
43 | it 'handles an invalid hyperlink argument value' do
44 | text = 'hello world.'
45 | ws = WordCountAnalyzer::Counter.new(hyperlink: 'hello')
46 | expect { ws.count(text) }.to raise_error('The value you specified for hyperlink is not a valid option. Please use either `count_as_one`, `split_at_period`, or `no_special_treatment`. The default option is `count_as_one`')
47 | end
48 |
49 | it 'counts a hyperlink as one word in the word count' do
50 | text = 'The site address is http://www.example.com she said.'
51 | ws = WordCountAnalyzer::Counter.new(
52 | hyperlink: 'count_as_one'
53 | )
54 | expect(ws.count(text)).to eq(7)
55 | end
56 |
57 | it 'counts a hyperlink as one word in the word count' do
58 | text = 'The site address is http://www.example.com she said.'
59 | ws = WordCountAnalyzer::Counter.new(
60 | hyperlink: 'split_at_period',
61 | forward_slash: 'count_as_one'
62 | )
63 | expect(ws.count(text)).to eq(9)
64 | end
65 |
66 | it 'does not search for hyperlinks' do
67 | text = 'The site address is http://www.example.com she said.'
68 | ws = WordCountAnalyzer::Counter.new(
69 | hyperlink: 'no_special_treatment'
70 | )
71 | expect(ws.count(text)).to eq(8)
72 | end
73 |
74 | it 'sets count_as_one as the default option' do
75 | text = 'The site address is http://www.example.com she said.'
76 | ws = WordCountAnalyzer::Counter.new
77 | expect(ws.count(text)).to eq(7)
78 | end
79 | end
80 |
81 | context 'contraction' do
82 | it 'handles an invalid contraction argument value' do
83 | text = 'hello world.'
84 | ws = WordCountAnalyzer::Counter.new(contraction: 'hello')
85 | expect { ws.count(text) }.to raise_error('The value you specified for contraction is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
86 | end
87 |
88 | it 'counts a contraction as one word in the word count' do
89 | text = "Don't do that."
90 | ws = WordCountAnalyzer::Counter.new(
91 | contraction: 'count_as_one'
92 | )
93 | expect(ws.count(text)).to eq(3)
94 | end
95 |
96 | it 'splits a contraction into its separate words for the word count' do
97 | text = "Don't do that."
98 | ws = WordCountAnalyzer::Counter.new(
99 | contraction: 'count_as_multiple'
100 | )
101 | expect(ws.count(text)).to eq(4)
102 | end
103 |
104 | it 'sets count_as_one as the default option' do
105 | text = "Don't do that."
106 | ws = WordCountAnalyzer::Counter.new
107 | expect(ws.count(text)).to eq(3)
108 | end
109 | end
110 |
111 | context 'hyphenated_word' do
112 | it 'handles an invalid hyphenated_word argument value' do
113 | text = 'hello world.'
114 | ws = WordCountAnalyzer::Counter.new(hyphenated_word: 'hello')
115 | expect { ws.count(text) }.to raise_error('The value you specified for hyphenated_word is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
116 | end
117 |
118 | it 'counts a hyphenated word as one word in the word count' do
119 | text = 'He has a devil-may-care attitude.'
120 | ws = WordCountAnalyzer::Counter.new(
121 | hyphenated_word: 'count_as_one'
122 | )
123 | expect(ws.count(text)).to eq(5)
124 | end
125 |
126 | it 'splits a hyphenated word into its separate words for the word count' do
127 | text = 'He has a devil-may-care attitude.'
128 | ws = WordCountAnalyzer::Counter.new(
129 | hyphenated_word: 'count_as_multiple'
130 | )
131 | expect(ws.count(text)).to eq(7)
132 | end
133 |
134 | it 'sets count_as_one as the default option' do
135 | text = 'He has a devil-may-care attitude.'
136 | ws = WordCountAnalyzer::Counter.new
137 | expect(ws.count(text)).to eq(5)
138 | end
139 | end
140 |
141 | context 'date' do
142 | it 'handles an invalid date argument value' do
143 | text = 'hello world.'
144 | ws = WordCountAnalyzer::Counter.new(date: 'hello')
145 | expect { ws.count(text) }.to raise_error('The value you specified for date is not a valid option. Please use either `count_as_one` or `no_special_treatment`. The default option is `no_special_treatment`')
146 | end
147 |
148 | it 'ignores date placeables' do
149 | text = 'Today is Tues. March 3rd, 2011.'
150 | ws = WordCountAnalyzer::Counter.new(
151 | date: 'no_special_treatment'
152 | )
153 | expect(ws.count(text)).to eq(6)
154 | end
155 |
156 | it 'counts a date placeable as one word in the word count' do
157 | text = 'Today is Tues. March 3rd, 2011.'
158 | ws = WordCountAnalyzer::Counter.new(
159 | date: 'count_as_one'
160 | )
161 | expect(ws.count(text)).to eq(3)
162 | end
163 |
164 | it 'sets count_as_one as the default option' do
165 | text = 'Today is Tues. March 3rd, 2011.'
166 | ws = WordCountAnalyzer::Counter.new
167 | expect(ws.count(text)).to eq(6)
168 | end
169 | end
170 |
171 | context 'number' do
172 | it 'handles an invalid number argument value' do
173 | text = 'hello world.'
174 | ws = WordCountAnalyzer::Counter.new(number: 'hello')
175 | expect { ws.count(text) }.to raise_error('The value you specified for number is not a valid option. Please use either `ignore` or `count`. The default option is `count`')
176 | end
177 |
178 | it 'counts a number as a word' do
179 | text = 'There is $300 in the safe. The password is 1234.'
180 | ws = WordCountAnalyzer::Counter.new(
181 | number: 'count'
182 | )
183 | expect(ws.count(text)).to eq(10)
184 | end
185 |
186 | it 'ignores numbers in the word count' do
187 | text = 'There is $300 in the safe. The password is 1234.'
188 | ws = WordCountAnalyzer::Counter.new(
189 | number: 'ignore'
190 | )
191 | expect(ws.count(text)).to eq(8)
192 | end
193 |
194 | it 'sets count as the default option' do
195 | text = 'There is $300 in the safe. The password is 1234.'
196 | ws = WordCountAnalyzer::Counter.new
197 | expect(ws.count(text)).to eq(10)
198 | end
199 | end
200 |
201 | context 'number_list' do
202 | it 'handles an invalid number argument value' do
203 | text = 'hello world.'
204 | ws = WordCountAnalyzer::Counter.new(numbered_list: 'hello')
205 | expect { ws.count(text) }.to raise_error('The value you specified for numbered_list is not a valid option. Please use either `ignore` or `count`. The default option is `count`')
206 | end
207 |
208 | it 'counts a numbered list number as a word' do
209 | text = "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c."
210 | ws = WordCountAnalyzer::Counter.new(
211 | numbered_list: 'count'
212 | )
213 | expect(ws.count(text)).to eq(17)
214 | end
215 |
216 | it 'ignores numbered list numbers' do
217 | text = "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c."
218 | ws = WordCountAnalyzer::Counter.new(
219 | numbered_list: 'ignore'
220 | )
221 | expect(ws.count(text)).to eq(14)
222 | end
223 |
224 | it 'sets count as the default option' do
225 | text = "Number 2. Add a list 1. List item a\n\n2. List item b\n\n3. List item c."
226 | ws = WordCountAnalyzer::Counter.new
227 | expect(ws.count(text)).to eq(17)
228 | end
229 | end
230 |
231 | context 'xhtml' do
232 | it 'handles an invalid number argument value' do
233 | text = 'hello world.'
234 | ws = WordCountAnalyzer::Counter.new(xhtml: 'hello')
235 | expect { ws.count(text) }.to raise_error('The value you specified for xhtml is not a valid option. Please use either `remove` or `keep`. The default option is `remove`')
236 | end
237 |
238 | it 'removes all xhtml from the text' do
239 | text = "Hello world"
240 | ws = WordCountAnalyzer::Counter.new(
241 | xhtml: 'remove'
242 | )
243 | expect(ws.count(text)).to eq(2)
244 | end
245 |
246 | it 'keeps xhtml in the text' do
247 | text = "Hello world"
248 | ws = WordCountAnalyzer::Counter.new(
249 | xhtml: 'keep',
250 | forward_slash: 'count_as_one'
251 | )
252 | expect(ws.count(text)).to eq(3)
253 | end
254 |
255 | it 'keeps xhtml in the text' do
256 | text = "Hello world"
257 | ws = WordCountAnalyzer::Counter.new(
258 | xhtml: 'keep'
259 | )
260 | expect(ws.count(text)).to eq(4)
261 | end
262 |
263 | it 'sets remove as the default option' do
264 | text = "Hello world"
265 | ws = WordCountAnalyzer::Counter.new
266 | expect(ws.count(text)).to eq(2)
267 | end
268 | end
269 |
270 | context 'forward_slash' do
271 | it 'handles an invalid number argument value' do
272 | text = 'hello world.'
273 | ws = WordCountAnalyzer::Counter.new(forward_slash: 'hello')
274 | expect { ws.count(text) }.to raise_error('The value you specified for forward_slash is not a valid option. Please use either `count_as_one`, `count_as_multiple` or `count_as_multiple_except_dates`. The default option is `count_as_multiple_except_dates`')
275 | end
276 |
277 | it 'counts a forward slash as multiple words (except dates) #001' do
278 | text = "She/he/it said hello. 4/22/2013."
279 | ws = WordCountAnalyzer::Counter.new(
280 | forward_slash: 'count_as_multiple_except_dates'
281 | )
282 | expect(ws.count(text)).to eq(6)
283 | end
284 |
285 | it 'counts a forward slash as multiple words #002' do
286 | text = "She/he/it said hello. 4/22/2013."
287 | ws = WordCountAnalyzer::Counter.new(
288 | forward_slash: 'count_as_multiple'
289 | )
290 | expect(ws.count(text)).to eq(8)
291 | end
292 |
293 | it 'counts a forward slash as multiple words #003' do
294 | text = "She/he/it said hello. 4/22/2013."
295 | ws = WordCountAnalyzer::Counter.new(
296 | forward_slash: 'count_as_multiple',
297 | date: 'count_as_one'
298 | )
299 | expect(ws.count(text)).to eq(6)
300 | end
301 |
302 | it 'counts a forward slash as one word' do
303 | text = "She/he/it said hello."
304 | ws = WordCountAnalyzer::Counter.new(
305 | forward_slash: 'count_as_one'
306 | )
307 | expect(ws.count(text)).to eq(3)
308 | end
309 |
310 | it 'sets count_as_multiple_except_dates as the default option' do
311 | text = "She/he/it said hello. 4/22/2013."
312 | ws = WordCountAnalyzer::Counter.new
313 | expect(ws.count(text)).to eq(6)
314 | end
315 | end
316 |
317 | context 'backslash' do
318 | it 'handles an invalid number argument value' do
319 | text = 'hello world.'
320 | ws = WordCountAnalyzer::Counter.new(backslash: 'hello')
321 | expect { ws.count(text) }.to raise_error('The value you specified for backslash is not a valid option. Please use either `count_as_one` or `count_as_multiple`. The default option is `count_as_one`')
322 | end
323 |
324 | it 'counts a token with backslashes as one word' do
325 | text = 'The file location is c:\Users\johndoe'
326 | ws = WordCountAnalyzer::Counter.new(
327 | backslash: 'count_as_one'
328 | )
329 | expect(ws.count(text)).to eq(5)
330 | end
331 |
332 | it 'counts a token with backslashes as multiple words' do
333 | text = 'The file location is c:\Users\johndoe'
334 | ws = WordCountAnalyzer::Counter.new(
335 | backslash: 'count_as_multiple'
336 | )
337 | expect(ws.count(text)).to eq(7)
338 | end
339 |
340 | it 'sets count_as_one as the default option' do
341 | text = 'The file location is c:\Users\johndoe'
342 | ws = WordCountAnalyzer::Counter.new
343 | expect(ws.count(text)).to eq(5)
344 | end
345 | end
346 |
347 | context 'dotted_line' do
348 | it 'handles an invalid number argument value' do
349 | text = 'hello world.'
350 | ws = WordCountAnalyzer::Counter.new(dotted_line: 'hello')
351 | expect { ws.count(text) }.to raise_error('The value you specified for dotted_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
352 | end
353 |
354 | it 'ignores continuous strings of dots in the word count' do
355 | text = 'Here is one …………………………………………………………………… and another ......'
356 | ws = WordCountAnalyzer::Counter.new(
357 | dotted_line: 'ignore'
358 | )
359 | expect(ws.count(text)).to eq(5)
360 | end
361 |
362 | it 'counts a continuous string of dots as a word' do
363 | text = 'Here is one …………………………………………………………………… and another ......'
364 | ws = WordCountAnalyzer::Counter.new(
365 | dotted_line: 'count'
366 | )
367 | expect(ws.count(text)).to eq(7)
368 | end
369 |
370 | it 'sets ignore as the default option' do
371 | text = 'Here is one …………………………………………………………………… and another ......'
372 | ws = WordCountAnalyzer::Counter.new
373 | expect(ws.count(text)).to eq(5)
374 | end
375 | end
376 |
377 | context 'dashed_line' do
378 | it 'handles an invalid number argument value' do
379 | text = 'hello world.'
380 | ws = WordCountAnalyzer::Counter.new(dashed_line: 'hello')
381 | expect { ws.count(text) }.to raise_error('The value you specified for dashed_line is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
382 | end
383 |
384 | it 'ignores continuous strings of dashes in the word count' do
385 | text = 'Here is one ----- and another -----'
386 | ws = WordCountAnalyzer::Counter.new(
387 | dashed_line: 'ignore'
388 | )
389 | expect(ws.count(text)).to eq(5)
390 | end
391 |
392 | it 'counts a continuous string of dashes as a word' do
393 | text = 'Here is one ----- and another -----'
394 | ws = WordCountAnalyzer::Counter.new(
395 | dashed_line: 'count'
396 | )
397 | expect(ws.count(text)).to eq(7)
398 | end
399 |
400 | it 'sets ignore as the default option' do
401 | text = 'Here is one ----- and another -----'
402 | ws = WordCountAnalyzer::Counter.new
403 | expect(ws.count(text)).to eq(5)
404 | end
405 | end
406 |
407 | context 'underscore' do
408 | it 'handles an invalid number argument value' do
409 | text = 'hello world.'
410 | ws = WordCountAnalyzer::Counter.new(underscore: 'hello')
411 | expect { ws.count(text) }.to raise_error('The value you specified for underscore is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
412 | end
413 |
414 | it 'ignores continuous strings of underscores in the word count' do
415 | text = "Here is one ______ and another ______"
416 | ws = WordCountAnalyzer::Counter.new(
417 | underscore: 'ignore'
418 | )
419 | expect(ws.count(text)).to eq(5)
420 | end
421 |
422 | it 'counts a continuous string of underscores as a word' do
423 | text = 'Here is one ______ and another ______'
424 | ws = WordCountAnalyzer::Counter.new(
425 | underscore: 'count'
426 | )
427 | expect(ws.count(text)).to eq(7)
428 | end
429 |
430 | it 'sets ignore as the default option' do
431 | text = 'Here is one ______ and another ______'
432 | ws = WordCountAnalyzer::Counter.new
433 | expect(ws.count(text)).to eq(5)
434 | end
435 | end
436 |
437 | context 'stray_punctuation' do
438 | it 'handles an invalid number argument value' do
439 | text = 'hello world.'
440 | ws = WordCountAnalyzer::Counter.new(stray_punctuation: 'hello')
441 | expect { ws.count(text) }.to raise_error('The value you specified for stray_punctuation is not a valid option. Please use either `ignore` or `count`. The default option is `ignore`')
442 | end
443 |
444 | it 'ignores continuous strings of underscores in the word count' do
445 | text = 'Hello world ? This is another - sentence .'
446 | ws = WordCountAnalyzer::Counter.new(
447 | stray_punctuation: 'ignore'
448 | )
449 | expect(ws.count(text)).to eq(6)
450 | end
451 |
452 | it 'counts a continuous string of underscores as a word' do
453 | text = 'Hello world ? This is another - sentence .'
454 | ws = WordCountAnalyzer::Counter.new(
455 | stray_punctuation: 'count'
456 | )
457 | expect(ws.count(text)).to eq(9)
458 | end
459 |
460 | it 'sets ignore as the default option' do
461 | text = 'Hello world ? This is another - sentence .'
462 | ws = WordCountAnalyzer::Counter.new
463 | expect(ws.count(text)).to eq(6)
464 | end
465 | end
466 |
467 | it 'counts the words in a string #001' do
468 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
469 | ws = WordCountAnalyzer::Counter.new(
470 | ellipsis: 'ignore',
471 | hyperlink: 'count_as_one',
472 | contraction: 'count_as_one',
473 | hyphenated_word: 'count_as_one',
474 | date: 'no_special_treatment',
475 | number: 'count',
476 | numbered_list: 'count',
477 | xhtml: 'remove',
478 | forward_slash: 'count_as_one',
479 | backslash: 'count_as_one',
480 | dotted_line: 'ignore',
481 | dashed_line: 'ignore',
482 | underscore: 'ignore',
483 | stray_punctuation: 'ignore'
484 | )
485 | expect(ws.count(text)).to eq(62)
486 | end
487 |
488 | it 'counts the words in a string #002' do
489 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
490 | ws = WordCountAnalyzer::Counter.new(
491 | ellipsis: 'no_special_treatment',
492 | hyperlink: 'no_special_treatment',
493 | contraction: 'count_as_multiple',
494 | hyphenated_word: 'count_as_multiple',
495 | date: 'count_as_one',
496 | number: 'ignore',
497 | numbered_list: 'ignore',
498 | xhtml: 'keep',
499 | forward_slash: 'count_as_multiple',
500 | backslash: 'count_as_multiple',
501 | dotted_line: 'count',
502 | dashed_line: 'count',
503 | underscore: 'count',
504 | stray_punctuation: 'count'
505 | )
506 | expect(ws.count(text)).to eq(77)
507 | end
508 |
509 | it 'counts the words in a string #003' do
510 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
511 | ws = WordCountAnalyzer::Counter.new
512 | expect(ws.count(text)).to eq(64)
513 | end
514 |
515 | it 'counts the words in a string #004' do
516 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
517 | ws = WordCountAnalyzer::Counter.new(forward_slash: 'count_as_multiple')
518 | expect(ws.count(text)).to eq(66)
519 | end
520 |
521 | it 'counts the words in a string #005' do
522 | text = "Hello world... 11/22/2013"
523 | ws = WordCountAnalyzer::Counter.new
524 | expect(ws.count(text)).to eq(3)
525 | end
526 |
527 | context 'Pages Word Count' do
528 | it 'reverse engineers Pages word count #001' do
529 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list \n\n1. item a \n\n2. item b \n\n3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
530 | ws = WordCountAnalyzer::Counter.new(
531 | ellipsis: 'no_special_treatment',
532 | hyperlink: 'split_at_period',
533 | contraction: 'count_as_one',
534 | hyphenated_word: 'count_as_multiple',
535 | date: 'no_special_treatment',
536 | number: 'count',
537 | numbered_list: 'ignore',
538 | xhtml: 'keep',
539 | forward_slash: 'count_as_multiple',
540 | backslash: 'count_as_multiple',
541 | dotted_line: 'ignore',
542 | dashed_line: 'ignore',
543 | underscore: 'ignore',
544 | stray_punctuation: 'ignore'
545 | )
546 | expect(ws.count(text)).to eq(76)
547 | end
548 |
549 | it 'reverse engineers Pages word count #002' do
550 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
551 | ws = WordCountAnalyzer::Counter.new
552 | expect(ws.pages_count(text)).to eq(76)
553 | end
554 |
555 | it 'reverse engineers Pages word count #003' do
556 | text = "..."
557 | ws = WordCountAnalyzer::Counter.new
558 | expect(ws.pages_count(text)).to eq(0)
559 | end
560 |
561 | it 'reverse engineers Pages word count #004' do
562 | text = "1. List item a\n\n2. List item b\n\n3. List item c"
563 | ws = WordCountAnalyzer::Counter.new
564 | expect(ws.pages_count(text)).to eq(9)
565 | end
566 |
567 | it 'reverse engineers Pages word count #005' do
568 | text = "Hello world Hello"
569 | ws = WordCountAnalyzer::Counter.new
570 | expect(ws.pages_count(text)).to eq(12)
571 | end
572 | end
573 |
574 | context 'Microsoft Word Count' do
575 | it 'reverse engineers the Microsoft Word / wc (Unix) word count #001' do
576 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
577 | ws = WordCountAnalyzer::Counter.new(
578 | ellipsis: 'no_special_treatment',
579 | hyperlink: 'count_as_one',
580 | contraction: 'count_as_one',
581 | hyphenated_word: 'count_as_one',
582 | date: 'no_special_treatment',
583 | number: 'count',
584 | numbered_list: 'count',
585 | xhtml: 'keep',
586 | forward_slash: 'count_as_one',
587 | backslash: 'count_as_one',
588 | dotted_line: 'count',
589 | dashed_line: 'count',
590 | underscore: 'count',
591 | stray_punctuation: 'count'
592 | )
593 | expect(ws.count(text)).to eq(71)
594 | end
595 |
596 | it 'reverse engineers the Microsoft Word / wc (Unix) word count #002' do
597 | text = "This string has a date: Monday, November 3rd, 2011. I was thinking... it also shouldn't have too many contractions, maybe 2. Some HTML and a hyphenated-word. Don't count punctuation ? ? ? Please visit the ____________ ------------ ........ go-to site: https://www.example-site.com today. Let's add a list 1. item a 2. item b 3. item c. Now let's add he/she/it or a c:\\Users\\john. 2/15/2012 is the date! { HYPERLINK 'http://www.hello.com' }"
598 | ws = WordCountAnalyzer::Counter.new
599 | expect(ws.mword_count(text)).to eq(71)
600 | end
601 | end
602 |
603 | context 'example sentences' do
604 | it 'String with common words (no edge cases) #001' do
605 | ws = WordCountAnalyzer::Counter.new
606 | expect(ws.count('This sentence contains nothing crazy.')).to eq(5)
607 | end
608 |
609 | it 'String with a number #002' do
610 | ws = WordCountAnalyzer::Counter.new
611 | expect(ws.count('This sentence contains 1 number.')).to eq(5)
612 | end
613 |
614 | it 'String with a date #003' do
615 | ws = WordCountAnalyzer::Counter.new
616 | expect(ws.count('Today is Monday, April 4th, 2011.')).to eq(6)
617 | end
618 |
619 | it 'String #004' do
620 | ws = WordCountAnalyzer::Counter.new
621 | expect(ws.count('hello world ...')).to eq(2)
622 | end
623 |
624 | it 'does not split on unicode chars' do
625 | ws = WordCountAnalyzer::Counter.new
626 | expect(ws.count('São Paulo')).to eq(2)
627 | end
628 |
629 | it 'should not count HTML tags' do
630 | ws = WordCountAnalyzer::Counter.new
631 | expect(ws.count("the brown fox jumped over the lazy dog")).to eq(8)
632 | end
633 |
634 | it 'should handle special characters' do
635 | ws = WordCountAnalyzer::Counter.new
636 | expect(ws.count("the \"brown\" fox 'jumped' | over \\ the / lazy dog")).to eq(8)
637 | end
638 | end
639 | end
640 |
--------------------------------------------------------------------------------