├── .editorconfig ├── .github └── workflows │ └── build.yml ├── .gitignore ├── .rubocop.yml ├── CHANGELOG.md ├── Gemfile ├── Gemfile.lock ├── LICENSE.md ├── README.md ├── Rakefile ├── html2text.gemspec ├── lib ├── html2text.rb └── html2text │ └── version.rb └── spec ├── examples ├── anchors.html ├── anchors.txt ├── basic.html ├── basic.txt ├── dom-processing.html ├── dom-processing.txt ├── empty.html ├── empty.txt ├── full_email.html ├── full_email.txt ├── huge-msoffice.html ├── huge-msoffice.txt ├── images.html ├── images.txt ├── invalid.html ├── invalid.txt ├── lists.html ├── lists.txt ├── malformed-style.html ├── malformed-style.txt ├── more-anchors.html ├── more-anchors.txt ├── msoffice.html ├── msoffice.txt ├── nbsp.html ├── nbsp.txt ├── nested-divs.html ├── nested-divs.txt ├── newlines.html ├── newlines.txt ├── non-breaking-spaces.html ├── non-breaking-spaces.txt ├── pre.html ├── pre.txt ├── table.html ├── table.txt ├── test3.html ├── test3.txt ├── test4.html ├── test4.txt ├── utf8-example.html ├── utf8-example.txt ├── windows-1252-example.html ├── windows-1252-example.txt ├── zero-width-non-joiners.html └── zero-width-non-joiners.txt ├── examples_spec.rb ├── html2text_spec.rb └── spec_helper.rb /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: http://EditorConfig.org 2 | # top-most EditorConfig file 3 | root = true 4 | 5 | # Unix-style newlines with a newline ending every file 6 | [*] 7 | end_of_line = lf 8 | charset = utf-8 9 | insert_final_newline = true 10 | trim_trailing_whitespace = true 11 | indent_style = space 12 | indent_size = 2 13 | 14 | [spec/examples/*] 15 | indent_style = tabs 16 | trim_trailing_whitespace = false 17 | insert_final_newline = false 18 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses 
actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake 6 | # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby 7 | 8 | name: Build 9 | 10 | on: 11 | push: 12 | branches: [ "master" ] 13 | pull_request: 14 | branches: [ "master" ] 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | test: 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | ruby-version: ['3.0', '3.1', '3.2', '3.3'] 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Set up Ruby 29 | uses: ruby/setup-ruby@v1 30 | with: 31 | ruby-version: ${{ matrix.ruby-version }} 32 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 33 | - name: Run tests 34 | run: bundle exec rake 35 | 36 | lint: 37 | runs-on: ubuntu-latest 38 | strategy: 39 | matrix: 40 | ruby-version: ['3.0', '3.1', '3.2', '3.3'] 41 | 42 | steps: 43 | - uses: actions/checkout@v4 44 | - name: Set up Ruby 45 | uses: ruby/setup-ruby@v1 46 | with: 47 | ruby-version: ${{ matrix.ruby-version }} 48 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 49 | - name: Run Rubocop 50 | run: bundle exec rubocop 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.DS_Store 2 | *.gem 3 | spec/examples/*.output 4 | .byebug_history 5 | .idea 6 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | require: 2 | - rubocop-performance 3 | - rubocop-rake 4 | 5 | AllCops: 6 | NewCops: enable 7 | TargetRubyVersion: 3.0 8 | 9 | 
Metrics/MethodLength: 10 | Max: 30 11 | 12 | Metrics/ClassLength: 13 | Max: 200 14 | 15 | Metrics/ModuleLength: 16 | Max: 200 17 | 18 | Metrics/BlockLength: 19 | Max: 50 20 | 21 | Gemspec/DevelopmentDependencies: 22 | EnforcedStyle: gemspec 23 | 24 | # TODO: Enable these cops after fixing the issues 25 | Metrics/CyclomaticComplexity: 26 | Enabled: false 27 | 28 | Metrics/PerceivedComplexity: 29 | Enabled: false 30 | 31 | Metrics/AbcSize: 32 | Enabled: false 33 | 34 | Style/Documentation: 35 | Enabled: false 36 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [Unreleased](https://github.com/soundasleep/html2text_ruby/compare/v0.4.0...master) 8 | 9 | ## [0.4.0](https://github.com/soundasleep/html2text_ruby/compare/0.3.1...v0.4.0) - 2024-06-08 10 | ### Added 11 | - Switch from Travis to Github Actions for Build and Test 12 | - Add rubocop for linting and cleanup existing violations ([#36](https://github.com/soundasleep/html2text_ruby/pull/36)) 13 | 14 | ### Changed 15 | - Add support for Ruby 3.x, removed support for Ruby < 3.0 since it is EOL 16 | - Allow subclassing of `Html2Text` to override the default behaviour ([#30](https://github.com/soundasleep/html2text_ruby/pull/30)) 17 | 18 | ### Fixed 19 | - Loosen nokogiri dependency to allow for nokogiri < 2.0 ([#17](https://github.com/soundasleep/html2text_ruby/pull/17)) 20 | - Fix `NoMethodError` when parsing nodes with no name ([#15](https://github.com/soundasleep/html2text_ruby/pull/15)) 21 | 22 | ## [0.3.1] - 2019-06-12 23 | ### Security 24 | - Bumped nokogiri requirement to ~> 1.10.3, resolving 
[CVE-2019-11068](https://nvd.nist.gov/vuln/detail/CVE-2019-11068) 25 | ([#8](https://github.com/soundasleep/html2text_ruby/issues/8)) 26 | 27 | ## [0.3.0] - 2019-02-15 28 | ### Added 29 | - Zero-width non-joiners are now stripped ([#5](https://github.com/soundasleep/html2text_ruby/pull/5)) 30 | - Support both UTF-8 and Windows-1252 encoded files 31 | - Support converting `
<pre>` blocks, including whitespace within these blocks
32 | - MS Office (MsoNormal) documents are now rendered closer to actual render output
33 |   - Note this assumes that the input MS Office document has standard `MsoNormal` CSS.
34 |     This component is _not_ designed to try and interpret CSS within an HTML document.
35 | 
36 | ### Changed
37 | - Behaviour with multiple and nested `<p>`, `<div>
` tags has been improved to be more in line with 38 | actual browser render behaviour (see test suite) 39 | 40 | ### Fixed 41 | - Update nokogiri dependency to 1.8.5 42 | 43 | ## [0.2.1] - 2017-09-27 44 | ### Fixed 45 | - Convert non-string input into strings ([#3](https://github.com/soundasleep/html2text_ruby/pull/3)) 46 | 47 | [Unreleased]: https://github.com/soundasleep/html2text_ruby/compare/0.3.1...HEAD 48 | [0.3.1]: https://github.com/soundasleep/html2text_ruby/compare/0.3.0...0.3.1 49 | [0.3.0]: https://github.com/soundasleep/html2text_ruby/compare/0.2.1...0.3.0 50 | [0.2.1]: https://github.com/soundasleep/html2text_ruby/compare/0.2.1...0.2.1 51 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source 'https://rubygems.org' 4 | 5 | # Declare your gem's dependencies in whatever.gemspec. 6 | # Bundler will treat runtime dependencies like base dependencies, and 7 | # development dependencies will be added by default to the :development group. 8 | gemspec 9 | 10 | # Declare any dependencies that are still in development here instead of in 11 | # your gemspec. These might include edge Rails or gems from your path or 12 | # Git. Remember to move these dependencies to your gemspec before releasing 13 | # your gem to rubygems.org. 14 | 15 | # To use a debugger 16 | # gem 'byebug', group: [:development, :test] 17 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 
3 | specs: 4 | html2text (0.4.0) 5 | nokogiri (>= 1.0, < 2.0) 6 | 7 | GEM 8 | remote: https://rubygems.org/ 9 | specs: 10 | ast (2.4.2) 11 | bundler-audit (0.6.1) 12 | bundler (>= 1.2.0, < 3) 13 | thor (~> 0.18) 14 | colorize (0.7.7) 15 | diff-lcs (1.3) 16 | json (2.7.2) 17 | language_server-protocol (3.17.0.3) 18 | mini_portile2 (2.8.7) 19 | nokogiri (1.16.5) 20 | mini_portile2 (~> 2.8.2) 21 | racc (~> 1.4) 22 | parallel (1.24.0) 23 | parser (3.3.2.0) 24 | ast (~> 2.4.1) 25 | racc 26 | racc (1.8.0) 27 | rainbow (3.1.1) 28 | rake (12.3.3) 29 | regexp_parser (2.9.2) 30 | rexml (3.3.9) 31 | rspec (3.8.0) 32 | rspec-core (~> 3.8.0) 33 | rspec-expectations (~> 3.8.0) 34 | rspec-mocks (~> 3.8.0) 35 | rspec-collection_matchers (1.1.2) 36 | rspec-expectations (>= 2.99.0.beta1) 37 | rspec-core (3.8.0) 38 | rspec-support (~> 3.8.0) 39 | rspec-expectations (3.8.2) 40 | diff-lcs (>= 1.2.0, < 2.0) 41 | rspec-support (~> 3.8.0) 42 | rspec-mocks (3.8.0) 43 | diff-lcs (>= 1.2.0, < 2.0) 44 | rspec-support (~> 3.8.0) 45 | rspec-support (3.8.0) 46 | rubocop (1.64.1) 47 | json (~> 2.3) 48 | language_server-protocol (>= 3.17.0) 49 | parallel (~> 1.10) 50 | parser (>= 3.3.0.2) 51 | rainbow (>= 2.2.2, < 4.0) 52 | regexp_parser (>= 1.8, < 3.0) 53 | rexml (>= 3.2.5, < 4.0) 54 | rubocop-ast (>= 1.31.1, < 2.0) 55 | ruby-progressbar (~> 1.7) 56 | unicode-display_width (>= 2.4.0, < 3.0) 57 | rubocop-ast (1.31.3) 58 | parser (>= 3.3.1.0) 59 | rubocop-performance (1.21.0) 60 | rubocop (>= 1.48.1, < 2.0) 61 | rubocop-ast (>= 1.31.1, < 2.0) 62 | rubocop-rake (0.6.0) 63 | rubocop (~> 1.0) 64 | ruby-progressbar (1.13.0) 65 | thor (0.20.3) 66 | unicode-display_width (2.5.0) 67 | 68 | PLATFORMS 69 | ruby 70 | 71 | DEPENDENCIES 72 | bundler-audit 73 | colorize 74 | html2text! 
75 | rake 76 | rspec 77 | rspec-collection_matchers 78 | rubocop 79 | rubocop-performance 80 | rubocop-rake 81 | 82 | BUNDLED WITH 83 | 2.5.11 84 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright 2015 Jevon Wright 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | html2text ![Build](https://github.com/soundasleep/html2text_ruby/actions/workflows/build.yml/badge.svg) [![Gem Version](https://badge.fury.io/rb/html2text.svg)](https://badge.fury.io/rb/html2text) 2 | --- 3 | 4 | `html2text` is a very simple gem that uses DOM methods to convert HTML into a format similar to what would be 5 | rendered by a browser - perfect for places where you need a quick text representation. For example: 6 | 7 | ```html 8 | 9 | Ignored Title 10 | 11 |

Hello, World!

12 | 13 |

This is some e-mail content. 14 | Even though it has whitespace and newlines, the e-mail converter 15 | will handle it correctly. 16 | 17 |

Even mismatched tags.

18 | 19 |
A div
20 |
Another div
21 |
A div
within a div
22 | 23 | A link 24 | 25 | 26 | 27 | ``` 28 | 29 | Will be converted into: 30 | 31 | ```text 32 | Hello, World! 33 | 34 | This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly. 35 | 36 | Even mismatched tags. 37 | 38 | A div 39 | Another div 40 | A div 41 | within a div 42 | 43 | [A link](https://foo.com) 44 | ``` 45 | 46 | See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531). 47 | 48 | ## Installing 49 | 50 | Add [the gem](https://rubygems.org/gems/html2text) into your Gemfile and run `bundle install`: 51 | 52 | ```ruby 53 | gem 'html2text' 54 | ``` 55 | 56 | Then you can: 57 | 58 | ```ruby 59 | require 'html2text' 60 | 61 | text = Html2Text.convert(html) 62 | ``` 63 | 64 | ## Tests 65 | 66 | See all of the test cases defined in [spec/examples/](spec/examples/). These can be run with `bundle exec rake`. 67 | 68 | ## License 69 | 70 | `html2text` is [licensed under MIT](LICENSE.md). 71 | 72 | ## Other versions 73 | 74 | 1. [html2text](https://github.com/soundasleep/html2text), the original PHP implementation. 75 | 2. [actionmailer-html2text](https://github.com/soundasleep/actionmailer-html2text), automatically generate text parts for HTML emails sent with ActionMailer. 
76 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'rubygems' 4 | require 'bundler' 5 | 6 | Bundler.setup(:default, :development) 7 | 8 | require 'rake' 9 | 10 | Bundler::GemHelper.install_tasks 11 | 12 | require 'rspec/core/rake_task' 13 | 14 | RSpec::Core::RakeTask.new(:spec) 15 | 16 | task default: :spec 17 | -------------------------------------------------------------------------------- /html2text.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | $LOAD_PATH.push File.expand_path('lib', __dir__) 4 | 5 | # Maintain your gem's version: 6 | require 'html2text/version' 7 | 8 | # Describe your gem and declare its dependencies: 9 | Gem::Specification.new do |s| 10 | s.name = 'html2text' 11 | s.version = Html2Text::VERSION 12 | s.authors = ['Jevon Wright'] 13 | s.email = ['jevon@jevon.org'] 14 | s.homepage = 'https://github.com/soundasleep/html2text_ruby' 15 | s.summary = 'Convert HTML into plain text.' 16 | s.description = 'A Ruby component to convert HTML into a plain text format.' 
17 | s.license = 'MIT' 18 | s.required_ruby_version = '>= 3.0' 19 | 20 | s.files = Dir['lib/**/*', 'LICENSE.md', 'README.md', 'CHANGELOG.md'] 21 | 22 | s.add_dependency 'nokogiri', ['>= 1.0', '< 2.0'] 23 | 24 | s.add_development_dependency 'bundler-audit' 25 | s.add_development_dependency 'colorize' 26 | s.add_development_dependency 'rake' 27 | s.add_development_dependency 'rspec' 28 | s.add_development_dependency 'rspec-collection_matchers' 29 | s.add_development_dependency 'rubocop' 30 | s.add_development_dependency 'rubocop-performance' 31 | s.add_development_dependency 'rubocop-rake' 32 | 33 | s.metadata['rubygems_mfa_required'] = 'true' 34 | end 35 | -------------------------------------------------------------------------------- /lib/html2text.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'nokogiri' 4 | 5 | class Html2Text 6 | attr_reader :doc 7 | 8 | def initialize(doc) 9 | @doc = doc 10 | end 11 | 12 | def self.convert(html) 13 | html = html.to_s 14 | 15 | if office_document?(html) 16 | # Emulate the CSS rendering of Office documents 17 | html = html.gsub('

', '
') 18 | .gsub(' ', '
') 19 | .gsub('', '') 20 | end 21 | 22 | unless html.include?(' tags 24 | html = "

#{html}
" 25 | end 26 | 27 | html = fix_newlines(replace_entities(html)) 28 | doc = Nokogiri::HTML(html) 29 | 30 | new(doc).convert 31 | end 32 | 33 | def self.fix_newlines(text) 34 | # rubocop:disable Performance/StringReplacement 35 | text.gsub("\r\n", "\n").gsub("\r", "\n") 36 | # rubocop:enable Performance/StringReplacement 37 | end 38 | 39 | def self.replace_entities(text) 40 | # rubocop:disable Performance/StringReplacement 41 | text.gsub(' ', ' ').gsub("\u00a0", ' ').gsub('‌', '') 42 | # rubocop:enable Performance/StringReplacement 43 | end 44 | 45 | def convert 46 | output = iterate_over(doc) 47 | output = remove_leading_and_trailing_whitespace(output) 48 | output = remove_unnecessary_empty_lines(output) 49 | output.strip 50 | end 51 | 52 | DO_NOT_TOUCH_WHITESPACE = '' 53 | 54 | def remove_leading_and_trailing_whitespace(text) 55 | # ignore any
<pre> blocks, which we don't want to interact with
 56 |     pre_blocks = text.split(DO_NOT_TOUCH_WHITESPACE)
 57 | 
 58 |     output = []
 59 |     pre_blocks.each.with_index do |block, index|
 60 |       output << if index.even?
 61 |                   block.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
 62 |                 else
 63 |                   block
 64 |                 end
 65 |     end
 66 | 
 67 |     output.join
 68 |   end
 69 | 
 70 |   private_class_method def self.office_document?(text)
 71 |     text.include?('urn:schemas-microsoft-com:office')
 72 |   end
 73 | 
 74 |   private
 75 | 
 76 |   def remove_unnecessary_empty_lines(text)
 77 |     text.gsub(/\n\n\n*/im, "\n\n")
 78 |   end
 79 | 
 80 |   def trimmed_whitespace(text)
 81 |     # Replace whitespace characters with a space (equivalent to \s)
 82 |     # and force any text encoding into UTF-8
 83 |     if text.valid_encoding?
 84 |       text.gsub(/[\t\n\f\r ]+/im, ' ')
 85 |     else
 86 |       text.force_encoding('WINDOWS-1252')
 87 |       trimmed_whitespace(text.encode('UTF-16be', invalid: :replace, replace: '?').encode('UTF-8'))
 88 |     end
 89 |   end
 90 | 
 91 |   def iterate_over(node)
 92 |     return "\n" if node.name.downcase == 'br' && next_node_is_text?(node)
 93 | 
 94 |     return trimmed_whitespace(node.text) if node.text?
 95 | 
 96 |     return '' if %w[style head title meta script].include?(node.name.downcase)
 97 | 
 98 |     return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}" if node.name.downcase == 'pre'
 99 | 
100 |     output = []
101 | 
102 |     output << prefix_whitespace(node)
103 |     output += node.children.map do |child|
104 |       iterate_over(child) unless child.name.nil?
105 |     end
106 |     output << suffix_whitespace(node)
107 | 
108 |     output = output.compact.join || ''
109 | 
110 |     unless node.name.nil?
111 |       if node.name.downcase == 'a'
112 |         output = wrap_link(node, output)
113 |       elsif node.name.downcase == 'img'
114 |         output = image_text(node)
115 |       end
116 |     end
117 | 
118 |     output
119 |   end
120 | 
121 |   # rubocop:disable Lint/DuplicateBranch
122 |   def prefix_whitespace(node)
123 |     case node.name.downcase
124 |     when 'hr'
125 |       "\n---------------------------------------------------------------\n"
126 | 
127 |     when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul'
128 |       "\n\n"
129 | 
130 |     when 'p'
131 |       "\n\n"
132 | 
133 |     when 'tr'
134 |       "\n"
135 | 
136 |     when 'div'
137 |       if node.parent.name == 'div' && (node.parent.text.strip == node.text.strip)
138 |         ''
139 |       else
140 |         "\n"
141 |       end
142 | 
143 |     when 'td', 'th'
144 |       "\t"
145 | 
146 |     when 'li'
147 |       '- '
148 |     end
149 |   end
150 |   # rubocop:enable Lint/DuplicateBranch
151 | 
152 |   # rubocop:disable Lint/DuplicateBranch
153 |   def suffix_whitespace(node)
154 |     case node.name.downcase
155 |     when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
156 |       # add another line
157 |       "\n\n"
158 | 
159 |     when 'p'
160 |       "\n\n"
161 | 
162 |     when 'br'
163 |       "\n" if next_node_name(node) != 'div' && !next_node_name(node).nil?
164 | 
165 |     when 'li'
166 |       "\n"
167 | 
168 |     when 'div'
169 |       if next_node_is_text?(node)
170 |         "\n"
171 |       elsif next_node_name(node) != 'div' && !next_node_name(node).nil?
172 |         "\n"
173 |       end
174 |     end
175 |   end
176 |   # rubocop:enable Lint/DuplicateBranch
177 | 
178 |   # links are returned in [text](link) format
179 |   def wrap_link(node, output)
180 |     href = node.attribute('href')
181 |     name = node.attribute('name')
182 | 
183 |     output = output.strip
184 | 
185 |     # remove double [[ ]]s from linking images
186 |     if output[0] == '[' && output[-1] == ']'
187 |       output = output[1, output.length - 2]
188 | 
189 |       # for linking images, the title of the  overrides the title of the 
190 |       output = node.attribute('title').to_s if node.attribute('title')
191 |     end
192 | 
193 |     # if there is no link text, but a title attr
194 |     output = node.attribute('title').to_s if output.empty? && node.attribute('title')
195 | 
196 |     if href.nil?
197 |       output = "[#{output}]" unless name.nil?
198 |     else
199 |       href = href.to_s
200 | 
201 |       if href != output && href != "mailto:#{output}" &&
202 |          href != "http://#{output}" && href != "https://#{output}"
203 |         output = if output.empty?
204 |                    href
205 |                  else
206 |                    "[#{output}](#{href})"
207 |                  end
208 |       end
209 |     end
210 | 
211 |     case next_node_name(node)
212 |     when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
213 |       output += "\n"
214 |     end
215 | 
216 |     output
217 |   end
218 | 
219 |   def image_text(node)
220 |     if node.attribute('title')
221 |       "[#{node.attribute('title')}]"
222 |     elsif node.attribute('alt')
223 |       "[#{node.attribute('alt')}]"
224 |     else
225 |       ''
226 |     end
227 |   end
228 | 
229 |   def next_node_name(node)
230 |     next_node = node.next_sibling
231 |     until next_node.nil?
232 |       break if next_node.element?
233 | 
234 |       next_node = next_node.next_sibling
235 |     end
236 | 
237 |     return unless next_node&.element?
238 | 
239 |     next_node.name.downcase
240 |   end
241 | 
242 |   def next_node_is_text?(node)
243 |     !node.next_sibling.nil? && node.next_sibling.text? && !node.next_sibling.text.strip.empty?
244 |   end
245 | 
246 |   def previous_node_name(node)
247 |     previous_node = node.previous_sibling
248 |     until previous_node.nil?
249 |       break if previous_node.element?
250 | 
251 |       previous_node = previous_node.previous_sibling
252 |     end
253 | 
254 |     return unless previous_node&.element?
255 | 
256 |     previous_node.name.downcase
257 |   end
258 | 
259 |   def previous_node_is_text?(node)
260 |     !node.previous_sibling.nil? && node.previous_sibling.text? && !node.previous_sibling.text.strip.empty?
261 |   end
262 | 
263 |   # def previous_node_is_not_text?(node)
264 |   #   return node.previous_sibling.nil? || !node.previous_sibling.text? || node.previous_sibling.text.strip.empty?
265 |   # end
266 | end
267 | 


--------------------------------------------------------------------------------
/lib/html2text/version.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
class Html2Text
  # Gem version string; html2text.gemspec reads it as Html2Text::VERSION.
  VERSION = '0.4.0'
end
6 | 


--------------------------------------------------------------------------------
/spec/examples/anchors.html:
--------------------------------------------------------------------------------
 1 | A document without any HTML open/closing tags.
 2 | 
 3 | 
4 | 5 | We try and use the representation given by common browsers of the 6 | HTML document, so that it looks similar when converted to plain text. 7 | 8 |
visit foo.com - or http://www.foo.com 9 | 10 | link 11 | 12 |

An anchor which will not appear

13 | -------------------------------------------------------------------------------- /spec/examples/anchors.txt: -------------------------------------------------------------------------------- 1 | A document without any HTML open/closing tags. 2 | --------------------------------------------------------------- 3 | We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. [visit foo.com](http://foo.com) - or http://www.foo.com [link](http://foo.com) 4 | 5 | [An anchor which will not appear] -------------------------------------------------------------------------------- /spec/examples/basic.html: -------------------------------------------------------------------------------- 1 | 2 | Ignored Title 3 | 4 |

Hello, World!

5 | 6 |

This is some e-mail content. 7 | Even though it has whitespace and newlines, the e-mail converter 8 | will handle it correctly. 9 | 10 |

Even mismatched tags.

11 | 12 |
A div
13 |
Another div
14 |
A div
within a div
15 | 16 |

Another line
Yet another line

17 | 18 | A link 19 | 20 | 21 | -------------------------------------------------------------------------------- /spec/examples/basic.txt: -------------------------------------------------------------------------------- 1 | Hello, World! 2 | 3 | This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly. 4 | 5 | Even mismatched tags. 6 | 7 | A div 8 | Another div 9 | A div 10 | within a div 11 | 12 | Another line 13 | Yet another line 14 | 15 | [A link](http://foo.com) -------------------------------------------------------------------------------- /spec/examples/dom-processing.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | Hello 7 | 8 | -------------------------------------------------------------------------------- /spec/examples/dom-processing.txt: -------------------------------------------------------------------------------- 1 | Hello -------------------------------------------------------------------------------- /spec/examples/empty.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soundasleep/html2text_ruby/5cc0c315c64972ca4824d8a457d148a009e45226/spec/examples/empty.html -------------------------------------------------------------------------------- /spec/examples/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soundasleep/html2text_ruby/5cc0c315c64972ca4824d8a457d148a009e45226/spec/examples/empty.txt -------------------------------------------------------------------------------- /spec/examples/full_email.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 22 | 23 | 24 | 38 | 39 |
25 | 26 | 27 | 31 | 35 | 36 |
28 |
37 |
40 | 41 | 42 | 43 | 52 | 53 |
44 |

45 | Hi Susan 46 |

47 |

48 | Here is your cat report. 49 |

50 | 51 |
54 | 55 | 56 | 57 | 58 | 59 | 60 | 69 | 70 |
61 | 62 | 63 |
64 |
65 | Find more cats 66 |
67 |
68 |
71 | 72 | 73 | 74 | 122 | 123 |
75 |
76 |

Down the road

77 |

Across the hall

78 | 79 |

Your achievements

80 | 81 | 82 | 83 | 88 | 89 | 90 | 91 | 92 | 93 |
84 |
You're currently finding about
85 |
12 cats
86 |
per day
87 |
 
Number of cats found
94 |
95 | 96 | 97 |
98 |
99 | 100 |

Your last cat was found two days ago.

101 |

One type of cat is a kitten.

102 | 103 | 104 | 105 | 108 | 112 | 115 | 116 | 117 |
106 | 107 | 109 |

Special account A1 110 |

111 |
113 |

12.345

114 |
118 | 119 |
120 | 121 |
124 | 125 | 130 | 131 | 132 | 133 | 188 | 189 |
134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 163 | 168 | 173 | 174 | 175 | 176 | 179 | 182 | 185 | 186 |

How can you find more cats?

Look in trash cans

Start meowing

Eat cat food

Some cats like to hang out in trash cans. Some cats do not.Some cats are attracted to similar tones.So one day your tears may smell like cat food, attracting more cats.
159 | 160 | 161 | 162 | 164 | 165 | 166 | 167 | 169 | 170 | 171 | 172 |
187 |
190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /spec/examples/full_email.txt: -------------------------------------------------------------------------------- 1 | http://localhost/home 16 December 2015 2 | Account 123 3 | 4 | Hi Susan 5 | 6 | Here is your cat report. 7 | 8 | You have found 5 cats less than anyone else 9 | [Find more cats](http://localhost/cats) 10 | 11 | Down the road 12 | 13 | Across the hall 14 | 15 | Your achievements 16 | 17 | You're currently finding about 18 | 12 cats 19 | per day 20 | 21 | [Number of cats found] 22 | 23 | --------------------------------------------------------------- 24 | 25 | Your last cat was found two days ago. 26 | 27 | One type of cat is a kitten. 28 | 29 | Special account A1 30 | 31 | 12.345 32 | 33 | http://localhost/logout 34 | 35 | How can you find more cats? 36 | 37 | Look in trash cans 38 | 39 | Start meowing 40 | 41 | Eat cat food 42 | 43 | Some cats like to hang out in trash cans. Some cats do not. Some cats are attracted to similar tones. So one day your tears may smell like cat food, attracting more cats. 44 | https://localhost/about https://localhost/about https://localhost/about 45 | [Cats are great.](https://github.com/soundasleep/html2text_ruby) [Find more cats.](https://github.com/soundasleep/html2text_ruby) [Do more things.](https://github.com/soundasleep/html2text_ruby) 46 | 47 | [Contact us](http://localhost/contact) 48 | 49 | cats@cats.com 50 | Monday and Friday 51 | 52 | https://github.com/soundasleep/html2text https://github.com/soundasleep/html2text_ruby 53 | 54 | Having trouble seeing this email? [View it online](http://localhost/view_it_online). -------------------------------------------------------------------------------- /spec/examples/images.html: -------------------------------------------------------------------------------- 1 | 2 |

3 | One: 4 |

5 | 6 |

7 | Two: two 8 |

9 | 10 |

11 | Three: 12 |

13 | 14 |

15 | Four: four alt 16 |

17 | 18 |

With links

19 | 20 |

21 | One: 22 |

23 | 24 |

25 | Two: two 26 |

27 | 28 |

29 | Three: 30 |

31 | 32 |

33 | Four: four alt 34 |

35 | 36 |

With links with titles

37 | 38 |

39 | One: 40 |

41 | 42 |

43 | Two: two 44 |

45 | 46 |

47 | Three: 48 |

49 | 50 |

51 | Four: four alt 52 |

53 | 54 | -------------------------------------------------------------------------------- /spec/examples/images.txt: -------------------------------------------------------------------------------- 1 | One: 2 | 3 | Two: [two] 4 | 5 | Three: [three] 6 | 7 | Four: [four] 8 | 9 | With links 10 | 11 | One: http://localhost 12 | 13 | Two: [two](http://localhost) 14 | 15 | Three: [three](http://localhost) 16 | 17 | Four: [four](http://localhost) 18 | 19 | With links with titles 20 | 21 | One: [one link](http://localhost) 22 | 23 | Two: [two link](http://localhost) 24 | 25 | Three: [three link](http://localhost) 26 | 27 | Four: [four link](http://localhost) -------------------------------------------------------------------------------- /spec/examples/invalid.html: -------------------------------------------------------------------------------- 1 | Hello &nbsnbsp; world 2 |
4 | title 5 | 6 | 7 | 8 | 50 |

Some body

51 | 52 | -------------------------------------------------------------------------------- /spec/examples/malformed-style.txt: -------------------------------------------------------------------------------- 1 | Some body -------------------------------------------------------------------------------- /spec/examples/more-anchors.html: -------------------------------------------------------------------------------- 1 |

Anchor tests

2 | 3 |

4 | Visit http://openiaml.org or openiaml.org or http://openiaml.org. 5 |

6 | 7 |

8 | To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org. 9 |

10 | 11 |

12 | To mail, email support@openiaml.org or mailto:support@openiaml.org 13 | or support@openiaml.org or mailto:support@openiaml.org. 14 |

15 | -------------------------------------------------------------------------------- /spec/examples/more-anchors.txt: -------------------------------------------------------------------------------- 1 | Anchor tests 2 | 3 | Visit http://openiaml.org or openiaml.org or http://openiaml.org. 4 | 5 | To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org. 6 | 7 | To mail, email support@openiaml.org or mailto:support@openiaml.org or support@openiaml.org or mailto:support@openiaml.org. -------------------------------------------------------------------------------- /spec/examples/msoffice.html: -------------------------------------------------------------------------------- 1 |

Dear html2text,

 

This is an example email that can be used to test html2text conversion of outlook / exchange emails.

 

The addition of <o:p> tags is very annoying!

This is a single line return

 

This is bold

This is italic

This is underline

 

Andrew

-------------------------------------------------------------------------------- /spec/examples/msoffice.txt: -------------------------------------------------------------------------------- 1 | Dear html2text, 2 | 3 | This is an example email that can be used to test html2text conversion of outlook / exchange emails. 4 | 5 | The addition of tags is very annoying! 6 | This is a single line return 7 | 8 | This is bold 9 | This is italic 10 | This is underline 11 | 12 | Andrew -------------------------------------------------------------------------------- /spec/examples/nbsp.html: -------------------------------------------------------------------------------- 1 | hello   world & people < > &NBSP; -------------------------------------------------------------------------------- /spec/examples/nbsp.txt: -------------------------------------------------------------------------------- 1 | hello world & people < > &NBSP; -------------------------------------------------------------------------------- /spec/examples/nested-divs.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | Just two divs 5 |
6 |
7 | Hanging out 8 |
9 |
Nested divs and line breaks

10 |
Nested divs and line breaks
More text
11 |

12 |
Just text
13 |
Just text
14 |
Just text

15 | This is the end! 16 | 17 | 18 | -------------------------------------------------------------------------------- /spec/examples/nested-divs.txt: -------------------------------------------------------------------------------- 1 | Just two divs 2 | Hanging out 3 | Nested divs and line breaks 4 | 5 | Nested divs and line breaks 6 | More text 7 | 8 | Just text 9 | Just text 10 | Just text 11 | 12 | This is the end! -------------------------------------------------------------------------------- /spec/examples/newlines.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | Hello 5 |
6 |
7 |
8 | How are you? 9 |
10 |
11 | 12 |

13 | How are you? 14 |
15 |

16 | 17 |

18 | How are you? 19 |
20 |

21 | 22 |
23 | Just two divs 24 |
25 |
26 | Hanging out 27 |
28 | 29 | This is not the end! 30 |
31 | How are you again? 32 |
33 |
34 | This is the end! 35 |
36 | Just kidding 37 |

Header 1

38 | Some text 39 |
40 | Some more text 41 |

Paragraph tag!

42 |

Header 2

43 |
44 |

Header 3

45 | Some text 46 |

Header 4

47 |

Paragraph tag!

48 | Final line 49 | 50 | -------------------------------------------------------------------------------- /spec/examples/newlines.txt: -------------------------------------------------------------------------------- 1 | Hello 2 | How are you? 3 | 4 | How are you? 5 | 6 | How are you? 7 | 8 | Just two divs 9 | Hanging out 10 | This is not the end! 11 | How are you again? 12 | This is the end! 13 | Just kidding 14 | 15 | Header 1 16 | 17 | Some text 18 | --------------------------------------------------------------- 19 | Some more text 20 | 21 | Paragraph tag! 22 | 23 | Header 2 24 | 25 | --------------------------------------------------------------- 26 | 27 | Header 3 28 | 29 | Some text 30 | 31 | Header 4 32 | 33 | Paragraph tag! 34 | 35 | Final line -------------------------------------------------------------------------------- /spec/examples/non-breaking-spaces.html: -------------------------------------------------------------------------------- 1 | these spaces are non-breaking -------------------------------------------------------------------------------- /spec/examples/non-breaking-spaces.txt: -------------------------------------------------------------------------------- 1 | these spaces are non-breaking -------------------------------------------------------------------------------- /spec/examples/pre.html: -------------------------------------------------------------------------------- 1 | Here is the code 2 |
 3 | #include <stdlib.h>
 4 | #include <stdio.h>
 5 | 
 6 | int main(){
 7 | 	return 0;
 8 | };
 9 | 
10 | 
-------------------------------------------------------------------------------- /spec/examples/pre.txt: -------------------------------------------------------------------------------- 1 | Here is the code 2 | 3 | #include 4 | #include 5 | 6 | int main(){ 7 | return 0; 8 | }; -------------------------------------------------------------------------------- /spec/examples/table.html: -------------------------------------------------------------------------------- 1 | 2 | Ignored Title 3 | 4 |

Hello, World!

5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 17 | 20 | 21 | 22 | 25 | 28 | 29 | 30 | 33 | 36 | 37 | 38 | 39 | 40 | 43 | 46 | 47 | 48 | 49 | 50 |
Col ACol B
15 | Data A1 16 | 18 | Data B1 19 |
23 | Data A2 24 | 26 | Data B2 27 |
31 | Data A3 32 | 34 | Data B4 35 |
41 | Total A 42 | 44 | Total B 45 |
51 | 52 | 53 | -------------------------------------------------------------------------------- /spec/examples/table.txt: -------------------------------------------------------------------------------- 1 | Hello, World! 2 | 3 | Col A Col B 4 | Data A1 Data B1 5 | Data A2 Data B2 6 | Data A3 Data B4 7 | Total A Total B -------------------------------------------------------------------------------- /spec/examples/test3.html: -------------------------------------------------------------------------------- 1 | test one
test two -------------------------------------------------------------------------------- /spec/examples/test3.txt: -------------------------------------------------------------------------------- 1 | test one 2 | test two -------------------------------------------------------------------------------- /spec/examples/test4.html: -------------------------------------------------------------------------------- 1 | 1
2
3
4
5 < 6 -------------------------------------------------------------------------------- /spec/examples/test4.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 < 6 -------------------------------------------------------------------------------- /spec/examples/utf8-example.html: -------------------------------------------------------------------------------- 1 |
    2 |
  • ÅÄÖ
  • 3 |
  • åäö
  • 4 |
5 | -------------------------------------------------------------------------------- /spec/examples/utf8-example.txt: -------------------------------------------------------------------------------- 1 | - ÅÄÖ 2 | - åäö -------------------------------------------------------------------------------- /spec/examples/windows-1252-example.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soundasleep/html2text_ruby/5cc0c315c64972ca4824d8a457d148a009e45226/spec/examples/windows-1252-example.html -------------------------------------------------------------------------------- /spec/examples/windows-1252-example.txt: -------------------------------------------------------------------------------- 1 | - ÅÄÖ 2 | - åäö -------------------------------------------------------------------------------- /spec/examples/zero-width-non-joiners.html: -------------------------------------------------------------------------------- 1 |

foo‌bar

2 | -------------------------------------------------------------------------------- /spec/examples/zero-width-non-joiners.txt: -------------------------------------------------------------------------------- 1 | foobar -------------------------------------------------------------------------------- /spec/examples_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'spec_helper' 4 | 5 | describe Html2Text do 6 | describe '#convert' do 7 | let(:text) { Html2Text.convert(html) } 8 | 9 | examples = Dir["#{File.dirname(__FILE__)}/examples/*.html"] 10 | 11 | examples.each do |filename| 12 | context filename.to_s do 13 | let(:html) { File.read(filename) } 14 | let(:text_file) { filename.sub('.html', '.txt') } 15 | let(:expected) { Html2Text.fix_newlines(File.read(text_file)) } 16 | 17 | it 'has an expected output' do 18 | expect(File.exist?(text_file)).to eq(true), "'#{text_file}' did not exist" 19 | end 20 | 21 | it 'converts to text' do 22 | # Write the output if it failed, for easier comparison 23 | File.write(filename.sub('.html', '.output'), text) unless text.eql?(expected) 24 | 25 | # Quick check, don't try to generate a 500kb+ diff, 26 | # which can halt the rspec for minutes+ 27 | expect(text.length).to eq expected.length if text.length > 10_000 28 | 29 | # More complete check 30 | expect(text).to eq expected 31 | end 32 | end 33 | end 34 | 35 | it 'has examples to test' do 36 | expect(examples.size).to_not eq(0) 37 | end 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /spec/html2text_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'spec_helper' 4 | 5 | describe Html2Text do 6 | describe '#convert' do 7 | let(:text) { Html2Text.convert(html) } 8 | 9 | context 'an empty line' do 10 | let(:html) { '' } 11 | 12 | it 'is an empty line' 
do 13 | expect(text).to eq('') 14 | end 15 | end 16 | 17 | context 'a simple string' do 18 | let(:html) { 'hello world' } 19 | 20 | it 'is returned unchanged' do 21 | expect(text).to eq('hello world') 22 | end 23 | end 24 | 25 | context 'input value is non-string' do 26 | let(:html) { nil } 27 | it '(nil)' do 28 | expect(text).to eq('') 29 | end 30 | end 31 | 32 | context 'input value is non-string' do 33 | let(:html) { 1234 } 34 | it '(number)' do 35 | expect(text).to eq('1234') 36 | end 37 | end 38 | 39 | context 'input value is non-string' do 40 | let(:html) { 1234.5600 } 41 | it '(float number)' do 42 | expect(text).to eq('1234.56') 43 | end 44 | end 45 | end 46 | 47 | describe '#remove_leading_and_trailing_whitespace' do 48 | let(:subject) { Html2Text.new(nil).remove_leading_and_trailing_whitespace(input) } 49 | 50 | context 'an empty string' do 51 | let(:input) { '' } 52 | it { is_expected.to eq('') } 53 | end 54 | 55 | context 'many new lines' do 56 | let(:input) { "hello\n world \n yes" } 57 | it { is_expected.to eq("hello\nworld\nyes") } 58 | end 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'rspec' 4 | require 'rspec/collection_matchers' 5 | 6 | require File.join(File.dirname(__FILE__), '..', 'lib', 'html2text') 7 | --------------------------------------------------------------------------------