├── .gitignore ├── .travis.yml ├── CHANGES.txt ├── Gemfile ├── Gemfile.lock ├── MIT_LICENSE ├── README.rdoc ├── Rakefile ├── VERSION ├── html_to_plain_text.gemspec ├── lib └── html_to_plain_text.rb └── spec ├── html_to_plain_text_spec.rb └── spec_helper.rb /.gitignore: -------------------------------------------------------------------------------- 1 | pkg 2 | tmp 3 | rdoc 4 | *.rbc 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | cache: bundler 3 | sudo: false 4 | rvm: 5 | - 1.9.3 6 | - 2.0.0 7 | - 2.1.7 8 | - 2.2.3 9 | - rbx 10 | - jruby 11 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | 1.0.5 2 | 3 | Only add pipes on tables if border attributes set to non-zero value. 4 | 5 | 1.0.4 6 | 7 | Small tweak to outputing link URLs when they don't make sense. 8 | 9 | 1.0.3 10 | 11 | * improve performance slightly by replacing runtime strings with constants 12 | * testing on modern rubies (grosser) 13 | * using gemspec in Gemfile (grosser) 14 | * not shipping test files for smaller gem / faster installs / smaller cached gems (grosser) 15 | * rake bump:patch -> increment version (grosser) 16 | * rake release -> ship new version (grosser) 17 | 18 | 1.0.2 19 | 20 | * remove trailing whitespace on converted text. 21 | 22 | 1.0.1 23 | 24 | * better handling of non-html or nil text 25 | 26 | 1.0.0 27 | 28 | * initial release 29 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'http://rubygems.org' 2 | 3 | gemspec 4 | 5 | gem 'byebug', :platform => (RUBY_VERSION > "2.0.0" ? 
:mri : :mswin) 6 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | html_to_plain_text (1.0.5) 5 | nokogiri (>= 1.4.0) 6 | 7 | GEM 8 | remote: http://rubygems.org/ 9 | specs: 10 | bump (0.5.2) 11 | byebug (8.2.1) 12 | diff-lcs (1.2.5) 13 | mini_portile (0.6.2) 14 | nokogiri (1.6.6.2) 15 | mini_portile (~> 0.6.0) 16 | rake (0.9.2) 17 | rspec (3.3.0) 18 | rspec-core (~> 3.3.0) 19 | rspec-expectations (~> 3.3.0) 20 | rspec-mocks (~> 3.3.0) 21 | rspec-core (3.3.2) 22 | rspec-support (~> 3.3.0) 23 | rspec-expectations (3.3.1) 24 | diff-lcs (>= 1.2.0, < 2.0) 25 | rspec-support (~> 3.3.0) 26 | rspec-mocks (3.3.2) 27 | diff-lcs (>= 1.2.0, < 2.0) 28 | rspec-support (~> 3.3.0) 29 | rspec-support (3.3.0) 30 | 31 | PLATFORMS 32 | ruby 33 | 34 | DEPENDENCIES 35 | bump 36 | byebug 37 | html_to_plain_text! 38 | rake 39 | rspec (> 2.6.0) 40 | 41 | BUNDLED WITH 42 | 1.10.6 43 | -------------------------------------------------------------------------------- /MIT_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 Brian Durand 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = HTML To Plain Text 2 | 3 | gem install html_to_plain_text 4 | 5 | A simple gem that provide code to convert HTML into a plain text alternative. Line breaks from HTML block level elements will be maintained. Lists and tables will also maintain a little bit of formatting. 6 | 7 | * Line breaks will be approximated using the generally established default margins for HTML tags (i.e.

 8 | <p> tag generates two line breaks, <div> generates one) 9 | * List items will be numbered or bulleted with an asterisk 10 | * <br> tags will add line breaks 11 | * <hr> tags will add a string of hyphens to serve as a horizontal rule 12 | * <table> elements will be enclosed in "|" delimiters 13 | * <a> tags will have the href URL appended to the text in parentheses 14 | * Formatting tags like <strong> or <em> will be stripped 15 | * Formatting inside <pre> or <plaintext> elements will be honored
16 | * Code-like tags like <script> or <style> will be stripped
17 | 
18 | == Usage
19 | 
20 |     require 'html_to_plain_text'
21 |     html = "<h1>Hello</h1><p>world!</p>"
22 |     HtmlToPlainText.plain_text(html)
23 |     => "Hello\n\nworld!"
24 | 
25 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | require 'bundler/setup'
 2 | require 'bundler/gem_tasks'
 3 | require 'rspec/core/rake_task'
 4 | require 'bump/tasks'
 5 | 
 6 | desc 'Default: run unit tests.'
 7 | task :default => :test
 8 | 
 9 | desc 'Run the unit tests'
10 | RSpec::Core::RakeTask.new(:test)
11 | 


--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 1.0.5
2 | 


--------------------------------------------------------------------------------
/html_to_plain_text.gemspec:
--------------------------------------------------------------------------------
 1 | Gem::Specification.new do |s|
 2 |   s.name = 'html_to_plain_text'
 3 |   s.version = File.read(File.expand_path("../VERSION", __FILE__)).strip
 4 |   s.summary = "A simple library for converting HTML into plain text."
 5 |   s.description = "A simple library for converting HTML into an approximation in plain text."
 6 | 
 7 |   s.authors = ['Brian Durand']
 8 |   s.email = ['bdurand@embellishedvisions.com']
 9 |   s.homepage = "https://github.com/bdurand/html_to_plain_text"
10 | 
11 |   s.files = ['README.rdoc', 'VERSION', 'Rakefile', 'MIT_LICENSE'] +  Dir.glob('lib/**/*')
12 | 
13 |   s.has_rdoc = true
14 |   s.rdoc_options = ["--charset=UTF-8", "--main", "README.rdoc"]
15 |   s.extra_rdoc_files = ["README.rdoc"]
16 | 
17 |   s.add_dependency "nokogiri", ">=1.4.0"
18 |   s.add_development_dependency "rspec", ">2.6.0"
19 |   s.add_development_dependency "rake"
20 |   s.add_development_dependency "bump"
21 | end
22 | 


--------------------------------------------------------------------------------
/lib/html_to_plain_text.rb:
--------------------------------------------------------------------------------
  1 | require 'nokogiri'
  2 | 
  3 | # The main method on this module +plain_text+ will convert a string of HTML to a plain text approximation.
  4 | module HtmlToPlainText
  5 |   IGNORE_TAGS = %w(script style object applet iframe).inject({}){|h, t| h[t] = true; h}.freeze
  6 |   PARAGRAPH_TAGS = %w(p h1 h2 h3 h4 h5 h6 table ol ul dl dd blockquote dialog figure aside section).inject({}){|h, t| h[t] = true; h}.freeze
  7 |   BLOCK_TAGS = %w(div address li dt center del article header header footer nav pre legend tr).inject({}){|h, t| h[t] = true; h}.freeze
  8 |   WHITESPACE = [" ", "\n", "\r"].freeze
  9 |   PLAINTEXT = "plaintext".freeze
 10 |   PRE = "pre".freeze
 11 |   BR = "br".freeze
 12 |   HR = "hr".freeze
 13 |   TD = "td".freeze
 14 |   TH = "th".freeze
 15 |   TR = "tr".freeze
 16 |   OL = "ol".freeze
 17 |   UL = "ul".freeze
 18 |   LI = "li".freeze
 19 |   A = "a".freeze
 20 |   TABLE = "table".freeze
 21 |   NUMBERS = ["1", "a"].freeze
 22 |   ABSOLUTE_URL_PATTERN = /^[a-z]+:\/\/[a-z0-9]/i.freeze
 23 |   HTML_PATTERN = /[<&]/.freeze
 24 |   TRAILING_WHITESPACE = /[ \t]+$/.freeze
 25 |   BODY_TAG_XPATH = "/html/body".freeze
 26 |   CARRIDGE_RETURN_PATTERN = /\r(\n?)/.freeze
 27 |   LINE_BREAK_PATTERN = /[\n\r]/.freeze
 28 |   NON_PROTOCOL_PATTERN = /:\/?\/?(.*)/.freeze
 29 |   NOT_WHITESPACE_PATTERN = /\S/.freeze
 30 |   SPACE = " ".freeze
 31 |   EMPTY = "".freeze
 32 |   NEWLINE = "\n".freeze
 33 |   HREF = "href".freeze
 34 |   TABLE_SEPARATOR = " | ".freeze
 35 | 
 36 |   # Helper instance method for converting HTML into plain text. This method simply calls HtmlToPlainText.plain_text.
 37 |   def plain_text(html)
 38 |     HtmlToPlainText.plain_text(html)
 39 |   end
 40 | 
 41 |   class << self
 42 |     # Convert some HTML into a plain text approximation.
 43 | 
 44 |     def plain_text(html)
 45 |       return nil if html.nil?
 46 |       return html.dup unless html =~ HTML_PATTERN
 47 |       body = Nokogiri::HTML::Document.parse(html).xpath(BODY_TAG_XPATH).first
 48 |       return unless body
 49 |       convert_node_to_plain_text(body).strip.gsub(CARRIDGE_RETURN_PATTERN, NEWLINE)
 50 |     end
 51 | 
 52 |     private
 53 | 
 54 |     # Convert an HTML node to plain text. This method is called recursively with the output and
 55 |     # formatting options for special tags.
 56 |     def convert_node_to_plain_text(parent, out = '', options = {})
 57 |       if PARAGRAPH_TAGS.include?(parent.name)
 58 |         append_paragraph_breaks(out)
 59 |       elsif BLOCK_TAGS.include?(parent.name)
 60 |         append_block_breaks(out)
 61 |       end
 62 | 
 63 |       format_list_item(out, options) if parent.name == LI
 64 |       out << "| " if parent.name == TR && data_table?(parent.parent)
 65 | 
 66 |       parent.children.each do |node|
 67 |         if node.text? || node.cdata?
 68 |           text = node.text
 69 |           unless options[:pre]
 70 |             text = node.text.gsub(LINE_BREAK_PATTERN, SPACE).squeeze(SPACE)
 71 |             text.lstrip! if WHITESPACE.include?(out[-1, 1])
 72 |           end
 73 |           out << text
 74 |         elsif node.name == PLAINTEXT
 75 |           out << node.text
 76 |         elsif node.element? && !IGNORE_TAGS.include?(node.name)
 77 |           convert_node_to_plain_text(node, out, child_options(node, options))
 78 | 
 79 |           if node.name == BR
 80 |             out.sub!(TRAILING_WHITESPACE, EMPTY)
 81 |             out << NEWLINE
 82 |           elsif node.name == HR
 83 |             out.sub!(TRAILING_WHITESPACE, EMPTY)
 84 |             out << NEWLINE unless out.end_with?(NEWLINE)
 85 |             out << "-------------------------------\n"
 86 |           elsif node.name == TD || node.name == TH
 87 |             out << (data_table?(parent.parent) ? TABLE_SEPARATOR : SPACE)
 88 |           elsif node.name == A
 89 |             href = node[HREF]
 90 |             if href &&
 91 |                 href =~ ABSOLUTE_URL_PATTERN &&
 92 |                 node.text =~ NOT_WHITESPACE_PATTERN &&
 93 |                 node.text != href &&
 94 |                 node.text != href[NON_PROTOCOL_PATTERN, 1] # use only text for <a href="mailto:a@b.com">a@b.com</a>
 95 |               out << " (#{href}) "
 96 |             end
 97 |           elsif PARAGRAPH_TAGS.include?(node.name)
 98 |             append_paragraph_breaks(out)
 99 |           elsif BLOCK_TAGS.include?(node.name)
100 |             append_block_breaks(out)
101 |           end
102 |         end
103 |       end
104 |       out
105 |     end
106 | 
107 |     # Set formatting options that will be passed to child elements for a tag.
108 |     def child_options(node, options)
109 |       if node.name == UL
110 |         level = options[:ul] || -1
111 |         level += 1
112 |         options.merge(:list => :ul, :ul => level)
113 |       elsif node.name == OL
114 |         level = options[:ol] || -1
115 |         level += 1
116 |         options.merge(:list => :ol, :ol => level, :number => NUMBERS[level % 2])
117 |       elsif node.name == PRE
118 |         options.merge(:pre => true)
119 |       else
120 |         options
121 |       end
122 |     end
123 | 
124 |     # Add double line breaks between paragraph elements. If line breaks already exist,
125 |     # new ones will only be added to get to two.
126 |     def append_paragraph_breaks(out)
127 |       out.sub!(TRAILING_WHITESPACE, EMPTY)
128 |       if out.end_with?(NEWLINE)
129 |         out << NEWLINE unless out.end_with?("\n\n")
130 |       else
131 |         out << "\n\n"
132 |       end
133 |     end
134 | 
135 |     # Add a single line break between block elements. If a line break already exists,
136 |     # none will be added.
137 |     def append_block_breaks(out)
138 |       out.sub!(TRAILING_WHITESPACE, EMPTY)
139 |       out << NEWLINE unless out.end_with?(NEWLINE)
140 |     end
141 | 
142 |     # Add an appropriate bullet or number to a list element.
143 |     def format_list_item(out, options)
144 |       if options[:list] == :ul
145 |         out << "#{'*' * (options[:ul] + 1)} "
146 |       elsif options[:list] == :ol
147 |         number = options[:number]
148 |         options[:number] = number.next
149 |         out << "#{number}. "
150 |       end
151 |     end
152 | 
153 |     def data_table?(table)
154 |       table.attributes['border'].to_s.to_i > 0
155 |     end
156 |   end
157 | end
158 | 


--------------------------------------------------------------------------------
/spec/html_to_plain_text_spec.rb:
--------------------------------------------------------------------------------
  1 | # encoding: UTF-8
  2 | require 'spec_helper'
  3 | 
  4 | RSpec.describe HtmlToPlainText do
  5 |   def text(html)
  6 |     HtmlToPlainText.plain_text(html)
  7 |   end
  8 | 
  9 |   it "formats paragraph tags" do
 10 |     html = "<h1>Test</h1><h2>More Test</h2>\t \t<p>\n\tThis is a test\n</p>"
 11 |     expect(text(html)).to eq "Test\n\nMore Test\n\nThis is a test"
 12 |   end
 13 | 
 14 |   it "formats block tags" do
 15 |     html = "<div>Test</div><div>More Test<div>\t This is a test\t </div></div>"
 16 |     expect(text(html)).to eq "Test\nMore Test\nThis is a test"
 17 |   end
 18 | 
 19 |   it "formats <br> tags" do
 20 |     html = "<div>Test</div><br><div>More Test \t <br />This is a test"
 21 |     expect(text(html)).to eq "Test\n\nMore Test\nThis is a test"
 22 |   end
 23 | 
 24 |   it "formats <hr> tags" do
 25 |     html = "<div>Test</div><hr><div>More Test \t <hr />This is a test"
 26 |     expect(text(html)).to eq "Test\n-------------------------------\nMore Test\n-------------------------------\nThis is a test"
 27 |   end
 28 | 
 29 |   it "keeps text formatting in <pre> tag blocks" do
 30 |     html = "<div>This \n is a \ntest</div><pre>with\n  pre tags</pre>end"
 31 |     expect(text(html)).to eq "This is a test\nwith\n  pre tags\nend"
 32 |   end
 33 | 
 34 |   it "removes inline formatting tags" do
 35 |     html = "This is <strong>so</strong> cool. I<em> mean <em>it."
 36 |     expect(text(html)).to eq "This is so cool. I mean it."
 37 |   end
 38 | 
 39 |   it "removes script, style, object, applet, and iframe tags" do
 40 |     html = "script <script>do_something</script> style <style>css</style> object <object>config</object> applet <applet>config</applet> iframe <iframe>config</iframe>"
 41 |     expect(text(html)).to eq "script style object applet iframe"
 42 |   end
 43 | 
 44 |   it "handles plaintext tags" do
 45 |     html = "<div>my\nhtml</div><plaintext>my\n text"
 46 |     expect(text(html)).to eq "my html\nmy\n text"
 47 |   end
 48 | 
 49 |   it "does not add extraneous spaces or line breaks" do
 50 |     html = "this<p><p>  is   \n    \n pretty bad lo<em>oking htm</em>l!"
 51 |     expect(text(html)).to eq "this\n\nis pretty bad looking html!"
 52 |   end
 53 | 
 54 |   it "formats bullet lists" do
 55 |     html = "List<ul><li>one</li><li>two<ul><li>a</li><li>b</li></ul></li><li>three</li></ul>"
 56 |     expect(text(html)).to eq "List\n\n* one\n* two\n\n** a\n** b\n\n* three"
 57 |   end
 58 | 
 59 |   it "formats numbered lists" do
 60 |     html = "List<ol><li>one</li><li>two<ol><li>a</li><li>b</li></ol></li><li>three</li></ol>"
 61 |     expect(text(html)).to eq "List\n\n1. one\n2. two\n\na. a\nb. b\n\n3. three"
 62 |   end
 63 | 
 64 |   describe "tables" do
 65 |     it "formats a simgple table" do
 66 |       html = "Table<table border='1'><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
 67 |       expect(text(html)).to eq "Table\n\n| Col 1 | Col 2 |\n| 1 | 2 |\n| 3 | 4 |"
 68 |     end
 69 | 
 70 |     it "does not add bars to a layout table" do
 71 |       html = "Table<table border='0'><tr><th>Col 1</th><th>Col 2</th></tr><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>"
 72 |       expect(text(html)).to eq "Table\n\nCol 1 Col 2\n1 2\n3 4"
 73 |     end
 74 |   end
 75 | 
 76 |   it "ignores inline tags without bodies" do
 77 |     html = "This is an <img src=\"/image\"> image"
 78 |     expect(text(html)).to eq "This is an image"
 79 |   end
 80 | 
 81 |   it "ignores comments" do
 82 |     html = "This is <!-- html comment here --> html"
 83 |     expect(text(html)).to eq "This is html"
 84 |   end
 85 | 
 86 |   it "unencodes entities" do
 87 |     html = "High &amp; Low"
 88 |     expect(text(html)).to eq "High & Low"
 89 |   end
 90 | 
 91 |   it "normalizes the line breaks" do
 92 |     html = "<pre>These are\rreturn\r\nlines</pre>"
 93 |     expect(text(html)).to eq "These are\nreturn\nlines"
 94 |   end
 95 | 
 96 |   describe "a" do
 97 |     it "discards missing href" do
 98 |       expect(text("<a name='links'>Links</a>")).to eq "Links"
 99 |     end
100 | 
101 |     it "discards paths" do
102 |       expect(text("<a href='/test'>Links</a>")).to eq "Links"
103 |     end
104 | 
105 |     it "includes absolute link URLs" do
106 |       html = "<a href='http://example.com/test'>full</a>"
107 |       expect(text(html)).to eq "full (http://example.com/test)"
108 |     end
109 | 
110 |     it "only uses the name for exact duplicates" do
111 |       html = "<a href='http://example.com'>http://example.com</a>"
112 |       expect(text(html)).to eq "http://example.com"
113 |     end
114 | 
115 |     it "only uses the name for close duplicates" do
116 |       html = "<a href='http://example.com'>example.com</a>"
117 |       expect(text(html)).to eq "example.com"
118 |     end
119 | 
120 |     it "only uses the name for mailto" do
121 |       html = "<a href='mailto:john@example.com'>john@example.com</a>"
122 |       expect(text(html)).to eq "john@example.com"
123 |     end
124 | 
125 |     it "ignores empty" do
126 |       expect(text("<a href='http://example.com/test2'> <img src='test'> </a>")).to eq ""
127 |     end
128 |   end
129 | 
130 |   it "unescapes entities" do
131 |     html = "This &amp; th&#97;t"
132 |     expect(text(html)).to eq "This & that"
133 |   end
134 | 
135 |   it "handles nil" do
136 |     expect(text(nil)).to eq nil
137 |   end
138 | 
139 |   it "handles empty text" do
140 |     expect(text((""))).to eq ""
141 |   end
142 | 
143 |   it "handles non-html text" do
144 |     expect(text(("test"))).to eq "test"
145 |   end
146 | 
147 |   it "handles UTF-8 characters" do
148 |     html = "<p>ümlaut</p>"
149 |     expect(text(html)).to eq "ümlaut"
150 |   end
151 | end
152 | 


--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | require File.expand_path("../../lib/html_to_plain_text.rb", __FILE__)
2 | 


--------------------------------------------------------------------------------