├── spec
├── examples
│ ├── empty.txt
│ ├── empty.html
│ ├── dom-processing.txt
│ ├── malformed-style.txt
│ ├── invalid.txt
│ ├── test3.html
│ ├── test3.txt
│ ├── utf8-example.txt
│ ├── zero-width-non-joiners.txt
│ ├── nbsp.txt
│ ├── test4.txt
│ ├── windows-1252-example.txt
│ ├── test4.html
│ ├── non-breaking-spaces.html
│ ├── non-breaking-spaces.txt
│ ├── zero-width-non-joiners.html
│ ├── nbsp.html
│ ├── utf8-example.html
│ ├── invalid.html
│ ├── dom-processing.html
│ ├── pre.txt
│ ├── table.txt
│ ├── windows-1252-example.html
│ ├── pre.html
│ ├── lists.txt
│ ├── nested-divs.txt
│ ├── msoffice.txt
│ ├── basic.txt
│ ├── more-anchors.txt
│ ├── lists.html
│ ├── anchors.txt
│ ├── nested-divs.html
│ ├── images.txt
│ ├── anchors.html
│ ├── basic.html
│ ├── newlines.txt
│ ├── more-anchors.html
│ ├── newlines.html
│ ├── malformed-style.html
│ ├── table.html
│ ├── images.html
│ ├── full_email.txt
│ ├── msoffice.html
│ └── full_email.html
├── spec_helper.rb
├── examples_spec.rb
└── html2text_spec.rb
├── .gitignore
├── lib
├── html2text
│ └── version.rb
└── html2text.rb
├── Rakefile
├── .editorconfig
├── Gemfile
├── .rubocop.yml
├── LICENSE.md
├── html2text.gemspec
├── .github
└── workflows
│ └── build.yml
├── README.md
├── Gemfile.lock
└── CHANGELOG.md
/spec/examples/empty.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/spec/examples/empty.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/spec/examples/dom-processing.txt:
--------------------------------------------------------------------------------
1 | Hello
--------------------------------------------------------------------------------
/spec/examples/malformed-style.txt:
--------------------------------------------------------------------------------
1 | Some body
--------------------------------------------------------------------------------
/spec/examples/invalid.txt:
--------------------------------------------------------------------------------
1 | Hello &nbsnbsp; world
--------------------------------------------------------------------------------
/spec/examples/test3.html:
--------------------------------------------------------------------------------
1 | test one
test two
--------------------------------------------------------------------------------
/spec/examples/test3.txt:
--------------------------------------------------------------------------------
1 | test one
2 | test two
--------------------------------------------------------------------------------
/spec/examples/utf8-example.txt:
--------------------------------------------------------------------------------
1 | - ÅÄÖ
2 | - åäö
--------------------------------------------------------------------------------
/spec/examples/zero-width-non-joiners.txt:
--------------------------------------------------------------------------------
1 | foobar
--------------------------------------------------------------------------------
/spec/examples/nbsp.txt:
--------------------------------------------------------------------------------
1 | hello world & people < > &NBSP;
--------------------------------------------------------------------------------
/spec/examples/test4.txt:
--------------------------------------------------------------------------------
1 | 1
2 | 2
3 | 3
4 | 4
5 | 5 < 6
--------------------------------------------------------------------------------
/spec/examples/windows-1252-example.txt:
--------------------------------------------------------------------------------
1 | - ÅÄÖ
2 | - åäö
--------------------------------------------------------------------------------
/spec/examples/test4.html:
--------------------------------------------------------------------------------
1 | 1
2
3
4
5 < 6
--------------------------------------------------------------------------------
/spec/examples/non-breaking-spaces.html:
--------------------------------------------------------------------------------
1 | these spaces are non-breaking
--------------------------------------------------------------------------------
/spec/examples/non-breaking-spaces.txt:
--------------------------------------------------------------------------------
1 | these spaces are non-breaking
--------------------------------------------------------------------------------
/spec/examples/zero-width-non-joiners.html:
--------------------------------------------------------------------------------
1 |
visit foo.com - or
http://www.foo.com
9 |
10 |
link
11 |
12 |
13 |
--------------------------------------------------------------------------------
/spec/examples/basic.html:
--------------------------------------------------------------------------------
1 |
2 |
Ignored Title
3 |
4 |
Hello, World!
5 |
6 |
This is some e-mail content.
7 | Even though it has whitespace and newlines, the e-mail converter
8 | will handle it correctly.
9 |
10 |
Even mismatched tags.
11 |
12 |
A div
13 |
Another div
14 |
15 |
16 |
Another line
Yet another line
17 |
18 |
A link
19 |
20 |
21 |
--------------------------------------------------------------------------------
/spec/examples/newlines.txt:
--------------------------------------------------------------------------------
1 | Hello
2 | How are you?
3 |
4 | How are you?
5 |
6 | How are you?
7 |
8 | Just two divs
9 | Hanging out
10 | This is not the end!
11 | How are you again?
12 | This is the end!
13 | Just kidding
14 |
15 | Header 1
16 |
17 | Some text
18 | ---------------------------------------------------------------
19 | Some more text
20 |
21 | Paragraph tag!
22 |
23 | Header 2
24 |
25 | ---------------------------------------------------------------
26 |
27 | Header 3
28 |
29 | Some text
30 |
31 | Header 4
32 |
33 | Paragraph tag!
34 |
35 | Final line
--------------------------------------------------------------------------------
/spec/examples/more-anchors.html:
--------------------------------------------------------------------------------
1 |
Anchor tests
2 |
3 |
4 | Visit http://openiaml.org or openiaml.org or http://openiaml.org.
5 |
6 |
7 |
8 | To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org.
9 |
10 |
11 |
12 | To mail, email support@openiaml.org or mailto:support@openiaml.org
13 | or support@openiaml.org or mailto:support@openiaml.org.
14 |
15 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | source 'https://rubygems.org'
4 |
5 | # Declare your gem's dependencies in html2text.gemspec.
6 | # Bundler will treat runtime dependencies like base dependencies, and
7 | # development dependencies will be added by default to the :development group.
8 | gemspec
9 |
10 | # Declare any dependencies that are still in development here instead of in
11 | # your gemspec. These might include edge Rails or gems from your path or
12 | # Git. Remember to move these dependencies to your gemspec before releasing
13 | # your gem to rubygems.org.
14 |
15 | # To use a debugger, uncomment the line below:
16 | # gem 'byebug', group: [:development, :test]
17 |
--------------------------------------------------------------------------------
/.rubocop.yml:
--------------------------------------------------------------------------------
1 | require: # RuboCop extensions; both are also declared as development dependencies in the gemspec
2 | - rubocop-performance
3 | - rubocop-rake
4 |
5 | AllCops:
6 | NewCops: enable
7 | TargetRubyVersion: 3.0 # matches required_ruby_version ('>= 3.0') in html2text.gemspec
8 |
9 | Metrics/MethodLength:
10 | Max: 30
11 |
12 | Metrics/ClassLength:
13 | Max: 200
14 |
15 | Metrics/ModuleLength:
16 | Max: 200
17 |
18 | Metrics/BlockLength:
19 | Max: 50
20 |
21 | Gemspec/DevelopmentDependencies:
22 | EnforcedStyle: gemspec # development dependencies are declared in the gemspec rather than the Gemfile
23 |
24 | # TODO: Enable these cops after fixing the issues
25 | Metrics/CyclomaticComplexity:
26 | Enabled: false
27 |
28 | Metrics/PerceivedComplexity:
29 | Enabled: false
30 |
31 | Metrics/AbcSize:
32 | Enabled: false
33 |
34 | Style/Documentation:
35 | Enabled: false
36 |
--------------------------------------------------------------------------------
/spec/examples/newlines.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Hello
5 |
6 |
7 |
8 | How are you?
9 |
10 |
11 |
12 |
13 | How are you?
14 |
15 |
16 |
17 |
18 | How are you?
19 |
20 |
21 |
22 |
23 | Just two divs
24 |
25 |
26 | Hanging out
27 |
28 |
29 | This is not the end!
30 |
31 | How are you again?
32 |
33 |
34 | This is the end!
35 |
36 | Just kidding
37 |
Header 1
38 | Some text
39 |
40 | Some more text
41 |
Paragraph tag!
42 |
Header 2
43 |
44 |
Header 3
45 | Some text
46 |
Header 4
47 |
Paragraph tag!
48 | Final line
49 |
50 |
--------------------------------------------------------------------------------
/spec/examples/malformed-style.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
title
5 |
6 |
7 |
8 |
50 |
Some body
51 |
52 |
--------------------------------------------------------------------------------
/spec/examples/table.html:
--------------------------------------------------------------------------------
1 |
2 |
Ignored Title
3 |
4 |
Hello, World!
5 |
6 |
7 |
8 | | Col A |
9 | Col B |
10 |
11 |
12 |
13 |
14 | |
15 | Data A1
16 | |
17 |
18 | Data B1
19 | |
20 |
21 |
22 | |
23 | Data A2
24 | |
25 |
26 | Data B2
27 | |
28 |
29 |
30 | |
31 | Data A3
32 | |
33 |
34 | Data B4
35 | |
36 |
37 |
38 |
39 |
40 | |
41 | Total A
42 | |
43 |
44 | Total B
45 | |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright 2015 Jevon Wright
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining
4 | a copy of this software and associated documentation files (the
5 | "Software"), to deal in the Software without restriction, including
6 | without limitation the rights to use, copy, modify, merge, publish,
7 | distribute, sublicense, and/or sell copies of the Software, and to
8 | permit persons to whom the Software is furnished to do so, subject to
9 | the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/html2text.gemspec:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | $LOAD_PATH.push File.expand_path('lib', __dir__) # make lib/ requirable so the version file below can be loaded
4 |
5 | # Load the version file; Html2Text::VERSION (used below) is presumably defined there:
6 | require 'html2text/version'
7 |
8 | # Gem specification: metadata, packaged files, and runtime/development dependencies.
9 | Gem::Specification.new do |s|
10 | s.name = 'html2text'
11 | s.version = Html2Text::VERSION
12 | s.authors = ['Jevon Wright']
13 | s.email = ['jevon@jevon.org']
14 | s.homepage = 'https://github.com/soundasleep/html2text_ruby'
15 | s.summary = 'Convert HTML into plain text.'
16 | s.description = 'A Ruby component to convert HTML into a plain text format.'
17 | s.license = 'MIT'
18 | s.required_ruby_version = '>= 3.0'
19 |
20 | s.files = Dir['lib/**/*', 'LICENSE.md', 'README.md', 'CHANGELOG.md'] # only lib/ and these top-level docs are packaged; spec/ is not
21 |
22 | s.add_dependency 'nokogiri', ['>= 1.0', '< 2.0'] # the only runtime dependency: any nokogiri 1.x release
23 |
24 | s.add_development_dependency 'bundler-audit' # everything from here down is development-only
25 | s.add_development_dependency 'colorize'
26 | s.add_development_dependency 'rake'
27 | s.add_development_dependency 'rspec'
28 | s.add_development_dependency 'rspec-collection_matchers'
29 | s.add_development_dependency 'rubocop'
30 | s.add_development_dependency 'rubocop-performance'
31 | s.add_development_dependency 'rubocop-rake'
32 |
33 | s.metadata['rubygems_mfa_required'] = 'true' # ask RubyGems to require MFA for privileged operations such as pushing releases
34 | end
35 |
--------------------------------------------------------------------------------
/spec/examples_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'spec_helper'
4 |
5 | describe Html2Text do
6 | describe '#convert' do
7 | let(:text) { Html2Text.convert(html) } # `html` is supplied by each per-fixture context below
8 |
9 | examples = Dir["#{File.dirname(__FILE__)}/examples/*.html"]
10 |
11 | examples.each do |filename| # one context per HTML fixture; expected text comes from the sibling .txt file
12 | context filename.to_s do
13 | let(:html) { File.read(filename) }
14 | let(:text_file) { filename.sub('.html', '.txt') } # NOTE(review): sub replaces the first '.html' anywhere in the path, not only the extension
15 | let(:expected) { Html2Text.fix_newlines(File.read(text_file)) }
16 |
17 | it 'has an expected output' do
18 | expect(File.exist?(text_file)).to eq(true), "'#{text_file}' did not exist"
19 | end
20 |
21 | it 'converts to text' do
22 | # On mismatch, write the actual output next to the fixture as a .output file for easier comparison
23 | File.write(filename.sub('.html', '.output'), text) unless text.eql?(expected)
24 |
25 | # Quick check for outputs over 10,000 characters: compare lengths first so a
26 | # mismatch fails here instead of generating a 500kb+ diff, which can halt rspec for minutes
27 | expect(text.length).to eq expected.length if text.length > 10_000
28 |
29 | # More complete check: the converted text must equal the expected text exactly
30 | expect(text).to eq expected
31 | end
32 | end
33 | end
34 |
35 | it 'has examples to test' do # fails if the fixture glob above matched no files
36 | expect(examples.size).to_not eq(0)
37 | end
38 | end
39 | end
40 |
--------------------------------------------------------------------------------
/spec/examples/images.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | One:
4 |
5 |
6 |
7 | Two:
8 |
9 |
10 |
11 | Three:
12 |
13 |
14 |
15 | Four:
16 |
17 |
18 |
With links
19 |
20 |
21 | One:
22 |
23 |
24 |
25 | Two:
26 |
27 |
28 |
29 | Three:
30 |
31 |
32 |
33 | Four:
34 |
35 |
36 |
With links with titles
37 |
38 |
39 | One:
40 |
41 |
42 |
43 | Two:
44 |
45 |
46 |
47 | Three:
48 |
49 |
50 |
51 | Four:
52 |
53 |
54 |