├── .github
│   └── workflows
│       └── test.yml
├── .gitignore
├── .rspec
├── .ruby-version
├── CHANGELOG.md
├── Gemfile
├── LICENSE.md
├── README.md
├── Rakefile
├── bin
│   ├── ci
│   └── upmark
├── coverage
│   └── .gitkeep
├── lib
│   ├── upmark.rb
│   └── upmark
│       ├── errors.rb
│       ├── parser
│       │   └── xml.rb
│       ├── transform
│       │   ├── ignore.rb
│       │   ├── markdown.rb
│       │   ├── normalise.rb
│       │   └── preprocess.rb
│       └── transform_helpers.rb
├── spec
│   ├── acceptance
│   │   └── upmark_spec.rb
│   ├── errors_spec.rb
│   ├── spec_helper.rb
│   └── unit
│       └── lib
│           └── upmark
│               ├── parser
│               │   └── xml_spec.rb
│               └── transform
│                   └── markdown_spec.rb
└── upmark.gemspec
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Ruby Test
2 | on: [push]
3 | jobs:
4 | build:
5 | runs-on: ubuntu-latest
6 | steps:
7 | - uses: actions/checkout@v2
8 | - name: Use Ruby
9 | uses: ruby/setup-ruby@v1
10 | with:
11 | ruby-version: 3.2
12 | bundler-cache: true
13 | - name: Install dependencies
14 | run: bundle install
15 | - name: Run tests
16 | run: bundle exec rake
17 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | .bundle
3 | coverage
4 | .rvmrc
5 | Gemfile.lock
6 | .examples
7 |
--------------------------------------------------------------------------------
/.rspec:
--------------------------------------------------------------------------------
1 | --colour
2 | --require spec_helper
3 |
--------------------------------------------------------------------------------
/.ruby-version:
--------------------------------------------------------------------------------
1 | 3.2.2
2 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## 1.1.0 / 2024-04-19
4 | * basic handling for nested lists
5 |
6 | ## 1.0.0 / 2018-03-27
7 | * Delegate `ascii_tree` method to cause object
8 | * Travis CI
9 | * Upgrade deps
10 | * Various cleanups
11 |
12 | ## v0.10.0 (17th June 2016)
13 | * better handling of HTML entities
14 | * better handling of hyperlinks with query strings
15 |
16 | ## v0.9.0 (8th November 2015)
17 | * bump parslet dependency to 1.7.x
18 | * converts some new HTML elements to markdown - h4, h5 and h6
19 | * detects more content that looks like an unordered list and converts
20 | it to a markdown list
21 | * strips some additional HTML tags and leaves the content, including
22 | * table
23 | * span
24 | * strips some HTML elements
25 | * img tags with non http[s] src
26 | * improved error when unbalanced HTML tags are detected
27 |
28 | ## v0.2.0 (21st July 2014)
29 | * Upmark.convert() now raises an Upmark::ParseError exception if the supplied
30 | HTML can't be parsed
31 | * Locked the parslet dependency to 1.4.0 for now
32 | * We depend on the Parslet::ParseError exception which was removed in parslet 1.5.0
33 |
34 | ## v0.1.4 (2nd August 2012)
35 | * BUGFIX: handle single quotes in attribute values
36 |
37 | ## v0.1.3 (28th March 2012)
38 | * BUGFIX: handle ampersands in attribute values
39 |
40 | ## v0.1.2 (26th September 2011)
41 | * BUGFIX: handle newlines after a
element
42 |
43 | ## v0.1.1 (26th September 2011)
44 | * lots of refactoring
45 |
46 | ## v0.1.0 (25th September 2011)
47 | * lots of refactoring
48 |
49 | ## v0.0.1 (23rd September 2011)
50 | * initial release
51 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 |
3 | # Specify your gem's dependencies in upmark.gemspec
4 | gemspec
5 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016 The Conversation Media Group
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining
4 | a copy of this software and associated documentation files (the
5 | "Software"), to deal in the Software without restriction, including
6 | without limitation the rights to use, copy, modify, merge, publish,
7 | distribute, sublicense, and/or sell copies of the Software, and to
8 | permit persons to whom the Software is furnished to do so, subject to
9 | the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Upmark
2 |
3 | An HTML to Markdown converter.
4 |
5 | ## Installation
6 |
7 | > gem install upmark
8 |
9 | ## Usage
10 |
11 | In ruby:
12 |
13 | ```ruby
14 | require "upmark"
15 | html = "<p>messenger <strong>bag</strong> skateboard</p>"
16 | markdown = Upmark.convert(html) 17 | puts markdown 18 | ``` 19 | 20 | From the command-line: 21 | 22 | > upmark foo.html 23 | 24 | You can also pipe poorly formatted HTML documents through `tidy` before piping them into `upmark`: 25 | 26 | > cat bar.html | tidy -asxhtml -indent -quiet --show-errors 0 --show-warnings 0 --show-body-only 1 --wrap 0 | upmark 27 | 28 | ## Features 29 | 30 | Upmark will convert the following (arbitrarily nested) HTML elements to Markdown: 31 | 32 | * `strong` 33 | * `em` 34 | * `p` 35 | * `a` 36 | * `h1`, `h2`, `h3`, `h4`, `h5`, `h6` 37 | * `ul` 38 | * `ol` 39 | * `br` 40 | 41 | It will also pass through block and span-level HTML elements (e.g. `table`, `div`, `span`, etc) which aren't used by Markdown. 42 | 43 | ## How it works 44 | 45 | Upmark defines a parsing expression grammar (PEG) using the very awesome [Parslet](https://github.com/kschiess/parslet/) gem. This PEG is then used to convert HTML into Markdown in 4 steps: 46 | 47 | 1. Parse the XHTML into an abstract syntax tree (AST). 48 | 2. Normalize the AST into a nested hash of HTML elements. 49 | 3. Mark the block and span-level subtrees which should be ignored (`table`, `div`, `span`, etc). 50 | 4. Convert the AST leaves into Markdown. 
51 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | namespace :spec do 5 | desc "Run acceptance specs" 6 | RSpec::Core::RakeTask.new(:acceptance) do |t| 7 | t.pattern = "./spec/acceptance/**/*_spec.rb" 8 | end 9 | 10 | desc "Run unit specs" 11 | RSpec::Core::RakeTask.new(:unit) do |t| 12 | t.pattern = "./spec/unit/**/*_spec.rb" 13 | end 14 | 15 | desc "Run unit and acceptance specs" 16 | task all: [:"spec:unit", :"spec:acceptance"] 17 | end 18 | 19 | task default: :"spec:all" 20 | -------------------------------------------------------------------------------- /bin/ci: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # This script is the standard way to run a CI build for all TC projects. 4 | # It's primarily used by buildbox 5 | 6 | # Exits the shell immediately if any command fails 7 | set -e 8 | 9 | # Will output commands as they run 10 | set -x 11 | 12 | # prepare the repository state, load dependencies, etc 13 | ruby -v 14 | hostname 15 | bundle check || bundle --local --path=vendor/bundle || bundle --path=vendor/bundle 16 | 17 | # .. 
and now start the build 18 | bundle exec rake 19 | -------------------------------------------------------------------------------- /bin/upmark: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | $:.push File.expand_path("../../lib", __FILE__) 4 | 5 | require "upmark" 6 | 7 | if ARGV.length > 0 8 | ARGV.each do |arg| 9 | puts Upmark.convert(File.read(arg)) 10 | end 11 | else 12 | puts Upmark.convert(ARGF.read) 13 | end 14 | 15 | -------------------------------------------------------------------------------- /coverage/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/conversation/upmark/2b805a308d493c33529f19bbb2d1cd183df52a4c/coverage/.gitkeep -------------------------------------------------------------------------------- /lib/upmark.rb: -------------------------------------------------------------------------------- 1 | require "parslet" 2 | 3 | require 'upmark/errors' 4 | require "upmark/parser/xml" 5 | require 'upmark/transform_helpers' 6 | require "upmark/transform/markdown" 7 | require "upmark/transform/normalise" 8 | require "upmark/transform/preprocess" 9 | 10 | module Upmark 11 | def self.convert(html) 12 | xml = Parser::XML.new 13 | normalise = Transform::Normalise.new 14 | preprocess = Transform::Preprocess.new 15 | markdown = Transform::Markdown.new 16 | 17 | ast = xml.parse(html.strip) 18 | ast = normalise.apply(ast) 19 | ast = preprocess.apply(ast) 20 | ast = markdown.apply(ast) 21 | 22 | # The result is either a String or an Array. 23 | ast = ast.join if ast.is_a?(Array) 24 | 25 | # Remove trailing whitespace 26 | ast.gsub!(/ +$/,'') 27 | 28 | # Compress bullet point lists 29 | ast.gsub!(/^•\s*([^•\n]*)\n+(?=•)/,"* #{'\1'}\n") 30 | 31 | # Any more than two consecutive newline characters is superfluous. 
32 | ast.gsub!(/\n(\s*\n)+/, "\n\n") 33 | 34 | # Remove other bullet points 35 | ast.gsub!(/^•\s*/,"* ") 36 | 37 | ast.strip 38 | rescue Parslet::ParseFailed => e 39 | raise Upmark::ParseFailed.new('Parse failed', e) 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /lib/upmark/errors.rb: -------------------------------------------------------------------------------- 1 | module Upmark 2 | class ParseFailed < StandardError 3 | attr_reader :cause 4 | 5 | def initialize(message, cause) 6 | @cause = cause 7 | super(message) 8 | end 9 | 10 | def ascii_tree 11 | @cause && @cause.ascii_tree 12 | end 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /lib/upmark/parser/xml.rb: -------------------------------------------------------------------------------- 1 | module Upmark 2 | module Parser 3 | # The XML parser class. 4 | # 5 | # Parses a XML document into an abstract syntax tree (AST). 6 | # 7 | # It's worth referring to the XML spec: 8 | # http://www.w3.org/TR/2000/REC-xml-20001006 9 | # 10 | class XML < Parslet::Parser 11 | root(:node) 12 | 13 | rule(:node) do 14 | ( 15 | empty_element.as(:empty) | 16 | element.as(:element) | 17 | text.as(:text) 18 | ).repeat(0) 19 | end 20 | 21 | rule(:empty_element) do 22 | start_tag.as(:start_tag) >> 23 | match(/\s+/) >> 24 | end_tag.as(:end_tag) 25 | end 26 | 27 | rule(:element) do 28 | empty_br.as(:empty_tag) | 29 | ( 30 | start_tag.as(:start_tag) >> 31 | node.as(:children) >> 32 | end_tag.as(:end_tag) 33 | ) | 34 | empty_tag.as(:empty_tag) 35 | end 36 | 37 | rule(:text) do 38 | match(/\A[\s\n\t ]+\Z/m).absent? >> # ignore entirely empty strings 39 | match(/[^<>]/).repeat(1) 40 | end 41 | 42 | rule(:start_tag) do 43 | str('<') >> 44 | name.as(:name) >> 45 | (space >> attribute).repeat.as(:attributes) >> 46 | space? >> 47 | str('>') 48 | end 49 | 50 | rule(:end_tag) do 51 | str('') >> 52 | name.as(:name) >> 53 | space? 
>> 54 | str('>') 55 | end 56 | 57 | rule(:empty_br) do 58 | str('<') >> space? >> str('br').as(:name) >> space? >> str('>') 59 | end 60 | 61 | rule(:empty_tag) do 62 | str('<') >> 63 | name.as(:name) >> 64 | (space >> attribute).repeat.as(:attributes) >> 65 | space? >> 66 | str('/>') 67 | end 68 | 69 | rule(:name) do 70 | match(/[a-zA-Z_:]/) >> match(/[\w:\.-]/).repeat 71 | end 72 | 73 | rule(:attribute) do 74 | name.as(:name) >> 75 | str('=') >> ( 76 | (str('"') >> double_quoted_attribute_value.as(:value) >> str('"')) | # double quotes 77 | (str("'") >> single_quoted_attribute_value.as(:value) >> str("'")) # single quotes 78 | ) 79 | end 80 | 81 | rule(:double_quoted_attribute_value) do 82 | (str('"').absent? >> (match(/[^<]/) | string_entity | numeric_entity)).repeat 83 | end 84 | 85 | rule(:single_quoted_attribute_value) do 86 | (str("'").absent? >> (match(/[^<]/) | string_entity | numeric_entity)).repeat 87 | end 88 | 89 | rule(:string_entity) { match("&") >> name >> match(";") } 90 | rule(:numeric_entity) { match(/\d+;/) } 91 | 92 | rule(:space) { match(/\s/).repeat(1) } 93 | rule(:space?) { space.maybe } 94 | end 95 | end 96 | end 97 | -------------------------------------------------------------------------------- /lib/upmark/transform/ignore.rb: -------------------------------------------------------------------------------- 1 | module Upmark 2 | module Transform 3 | # A transform class which marks all elements in a subtree as ignored. 
4 | class Ignore < Parslet::Transform 5 | include TransformHelpers 6 | 7 | element(:*) do |element| 8 | { 9 | element: { 10 | name: element[:name], 11 | attributes: element[:attributes], 12 | children: element[:children], 13 | ignore: true 14 | } 15 | } 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/upmark/transform/markdown.rb: -------------------------------------------------------------------------------- 1 | module Upmark 2 | module Transform 3 | # A transform class which converts an abstract syntax tree (AST) into 4 | # a Markdown document. 5 | class Markdown < Parslet::Transform 6 | include TransformHelpers 7 | 8 | rule(text: simple(:value)) { value.to_s } 9 | 10 | # Pass all unmatched elements through. 11 | rule( 12 | element: { 13 | name: simple(:name), 14 | attributes: subtree(:attributes), 15 | children: sequence(:children), 16 | ignore: simple(:ignore) 17 | } 18 | ) do |element| 19 | attributes = map_attributes_subtree(element[:attributes]) 20 | children = element[:children].join 21 | name = element[:name] 22 | 23 | attributes_list = 24 | if attributes.any? 25 | " " + attributes.map {|name, value| %Q{#{name}="#{value}"} }.join(" ") 26 | else 27 | "" 28 | end 29 | 30 | if children.empty? 
31 | %Q{<#{name}#{attributes_list} />} 32 | else 33 | %Q{<#{name}#{attributes_list}>#{children}#{name}>} 34 | end 35 | end 36 | 37 | def self.text(element) 38 | element[:children].join.gsub(/(\n)[\n ]+/, '\1') 39 | end 40 | 41 | element(:p) {|element| "#{text(element)}\n\n" } 42 | element(:h1) {|element| "# #{text(element)}" } 43 | element(:h2) {|element| "## #{text(element)}" } 44 | element(:h3) {|element| "### #{text(element)}" } 45 | element(:h4) {|element| "#### #{text(element)}" } 46 | element(:h5) {|element| "##### #{text(element)}" } 47 | element(:h6) {|element| "###### #{text(element)}" } 48 | element(:li) {|element| "#{text(element)}" } 49 | 50 | element(:ul) do |element| 51 | children = element[:children].flatten.map {|value| value.strip != "" ? value : nil }.compact 52 | children.map {|value| "* #{value.gsub(/^\s*•\s*/,'')}\n" } 53 | end 54 | 55 | element(:ol) do |element| 56 | children = element[:children].flatten.map {|value| value.strip != "" ? value : nil }.compact 57 | children.map.with_index {|value, i| "#{i + 1}. 
#{value}\n" } 58 | end 59 | 60 | element(:a) do |element| 61 | attributes = map_attributes_subtree(element[:attributes]) 62 | href = attributes[:href] 63 | title = attributes[:title] 64 | 65 | if /^(?:http|mailto)/ =~ href 66 | %Q{[#{text(element)}](#{href} "#{title}")} 67 | else 68 | text(element) 69 | end 70 | end 71 | 72 | element(:img) do |element| 73 | attributes = map_attributes_subtree(element[:attributes]) 74 | href = attributes[:src] 75 | title = attributes[:title] 76 | alt_text = attributes[:alt] 77 | 78 | if /^http/ =~ href 79 | %Q{} 80 | else 81 | "#{alt_text || title}" 82 | end 83 | end 84 | 85 | element(:b, :strong) {|element| "**#{text(element)}**" } 86 | element(:i, :em) {|element| "*#{text(element)}*" } 87 | 88 | element(:br) { "\n" } 89 | rule(element: { name: "br"}) { "\n" } 90 | 91 | end 92 | end 93 | end 94 | -------------------------------------------------------------------------------- /lib/upmark/transform/normalise.rb: -------------------------------------------------------------------------------- 1 | module Upmark 2 | module Transform 3 | # A transform class which normalises start/end/empty tags into the 4 | # same structure. 
5 | class Normalise < Parslet::Transform 6 | 7 | rule(element: subtree(:invalid)) do 8 | raise Upmark::ParseFailed.new('Invalid parse result', nil) 9 | end 10 | 11 | # Strip empty tags 12 | rule(empty: subtree(:invalid)) do 13 | ' ' 14 | end 15 | 16 | rule( 17 | element: { 18 | start_tag: {name: simple(:name), attributes: subtree(:attributes)}, 19 | end_tag: {name: simple(:end_tag_name)}, 20 | children: subtree(:children) 21 | } 22 | ) do 23 | unless name == end_tag_name 24 | raise Upmark::ParseFailed.new('Mismatched tags', nil) 25 | end 26 | { 27 | element: { 28 | name: name, 29 | attributes: attributes, 30 | children: children, 31 | ignore: false 32 | } 33 | } 34 | end 35 | 36 | rule( 37 | element: { 38 | empty_tag: { name: simple(:name) } 39 | } 40 | ) do 41 | { 42 | element: { 43 | name: name, 44 | attributes: [], 45 | children: [], 46 | ignore: false 47 | } 48 | } 49 | end 50 | 51 | rule( 52 | element: { 53 | empty_tag: {name: simple(:name), attributes: subtree(:attributes)} 54 | } 55 | ) do 56 | { 57 | element: { 58 | name: name, 59 | attributes: attributes, 60 | children: [], 61 | ignore: false 62 | } 63 | } 64 | end 65 | 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /lib/upmark/transform/preprocess.rb: -------------------------------------------------------------------------------- 1 | require "upmark/transform/ignore" 2 | 3 | module Upmark 4 | module Transform 5 | # A transform class which marks block-level elements as ignored. 6 | # i.e. These elements should not be converted to Markdown. 
7 | class Preprocess < Parslet::Transform 8 | include TransformHelpers 9 | 10 | element(:div, :pre) do |element| 11 | { 12 | element: { 13 | name: element[:name], 14 | attributes: element[:attributes], 15 | children: Ignore.new.apply(element[:children]), 16 | ignore: true 17 | } 18 | } 19 | end 20 | 21 | element(:span) do |element| 22 | element[:children] 23 | end 24 | 25 | # table content elements are stripped ignoring their spacing 26 | element(:table, :thead, :tbody, :tfoot) do |element| 27 | element[:children].reject! do |c| 28 | Hash === c && c[:text].to_s =~ /\A[\n ]*\Z/m 29 | end 30 | element[:children] 31 | end 32 | 33 | # table content elements are stripped 34 | element(:td, :th) do |element| 35 | element[:children] 36 | end 37 | 38 | # table rows are treated as 'paragraph' blocks 39 | element(:tr) do |element| 40 | element[:children] 41 | .select { |c| Array === c } 42 | .map do |children| 43 | children.map do |child| 44 | if child.is_a?(Hash) # if the td doesn't contain nested elements 45 | if child[:text] 46 | child[:text].to_s.gsub!(/^\n */,'') 47 | end 48 | end 49 | child 50 | end + ["\n"] 51 | end + ["\n"] 52 | end 53 | end 54 | end 55 | end 56 | -------------------------------------------------------------------------------- /lib/upmark/transform_helpers.rb: -------------------------------------------------------------------------------- 1 | module Upmark 2 | module TransformHelpers 3 | def self.included(base) 4 | base.send :extend, ClassMethods 5 | end 6 | 7 | module ClassMethods 8 | def element(*names, &block) 9 | names.each do |name| 10 | name = name.to_s.downcase 11 | rule( 12 | { 13 | element: { 14 | name: (name != "*" ? 
name : simple(:name)), 15 | attributes: subtree(:attributes), 16 | children: subtree(:children), 17 | ignore: false 18 | } 19 | } 20 | ) do |element| 21 | element[:name] ||= name 22 | block.call(element) 23 | end 24 | end 25 | end 26 | 27 | def map_attributes_subtree(ast) 28 | ast.inject({}) do |hash, attribute| 29 | hash[attribute[:name].to_sym] = attribute[:value] 30 | hash 31 | end 32 | end 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /spec/acceptance/upmark_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Upmark, ".convert" do 2 | RSpec::Matchers.define :convert_to do |expected| 3 | match do 4 | actual == expected 5 | end 6 | 7 | def actual 8 | @converted_actual ||= Upmark.convert(@actual) 9 | end 10 | 11 | diffable 12 | end 13 | 14 | context "" do 15 | specify 'converts to []()' do 16 | expect(<<-HTML.strip 17 | 18 | HTML 19 | ).to convert_to <<-MD.strip 20 | [messenger **bag** skateboard](http://helvetica.com/ "art party organic") 21 | MD 22 | end 23 | end 24 | 25 | context " hard" do 26 | specify 'converts as []()' do 27 | expect(<<-HTML.strip 28 | 29 | HTML 30 | ).to convert_to <<-MD.strip 31 | [Manager, Business Solutions](http://jobs.latrobe.edu.au/jobDetails.asp?sJobIDs=545808&sKeywords=business "") 32 | MD 33 | end 34 | end 35 | 36 | context " with numeric entity" do 37 | specify 'converts as []()' do 38 | expect(<<-HTML.strip 39 | 40 | HTML 41 | ).to convert_to <<-MD.strip 42 | [blah](http://www.abc.net.au/news/2016-02-18/haylen-we-need-a-drug-summit-because-we're-losing-the-war/7177152 "") 43 | MD 44 | end 45 | end 46 | 47 | context " with query string" do 48 | specify 'converts as []()' do 49 | expect(<<-HTML.strip 50 | 51 | HTML 52 | ).to convert_to <<-MD.strip 53 | [blah](http://www.abc.net.au/news/2016-02-18/blah?blah=lol&lol=rofl "") 54 | MD 55 | end 56 | end 57 | 58 | context " with inline elements, no href" do 59 | 
specify 'converts as plain text' do 60 | expect(<<-HTML.strip 61 | How Australia can respond to the security challenges posed by climate change in the Asian Century 62 | HTML 63 | ).to convert_to <<-MD.strip 64 | How Australia can respond to the security challenges posed by climate change in the Asian Century 65 | MD 66 | end 67 | end 68 | 69 | context " with id href" do 70 | specify 'converts as plain text' do 71 | expect(<<-HTML.strip 72 | Labor MP calls to end dogs 73 | HTML 74 | ).to convert_to <<-MD.strip 75 | Labor MP calls to end dogs 76 | MD 77 | end 78 | end 79 | 80 | context "" do 92 | specify 'converts as plaintext' do 93 | expect(<<-HTML.strip 94 |
• Bullet 1
95 |• Bullet 2
96 |messenger bag skateboard
97 | 98 |art party
99 | organic
art party
102 | organic
105 |
106 | 107 |
• Bullet 3
108 |• Bullet 4
109 |• Bullet 5
110 |• Bullet 6
111 |• Bullet 7
112 |Something else
113 | HTML 114 | ).to convert_to <<-MD.strip 115 | * Bullet 1 116 | * Bullet 2 117 | 118 | messenger **bag** skateboard 119 | 120 | art party 121 | organic 122 | 123 | art party 124 | organic 125 | 126 | * Bullet 3 127 | * Bullet 4 128 | * Bullet 5 129 | * Bullet 6 130 | * Bullet 7 131 | 132 | Something else 133 | MD 134 | end 135 | 136 | it 'converts paragraph utf-8 bullet points to a markdown list' do 137 | expect("• Bullet 1
• Bullet 2
").to convert_to "* Bullet 1\n* Bullet 2" 138 | end 139 | end 140 | 141 | context "messenger
bag
skateboard
messenger
bag
skateboard
messenger |
245 | bag |
246 |
messenger |
249 | bag |
250 |
skateboarding | 253 |is cool with all the kids |
255 |
Messenger bags | 258 |are in with the hipsters though. | 259 |
" do 284 | let(:html) { <<-HTML.strip } 285 |286 |290 | HTML 291 | 292 | specify 'are left alone' do 293 | expect(html).to convert_to html 294 | end 295 | end 296 | end 297 | 298 | context " elements" do 299 | specify 'are stripped' do 300 | expect(<<-HTML.strip 301 | messenger bag skateboard 302 | HTML 303 | ).to convert_to <<-MD.strip 304 | messenger **bag** skateboard 305 | MD 306 | end 307 | end 308 | 309 | context "plain text" do 310 | it 'containing plain bullet points converts to markdown' do 311 | expect( 312 | "• Bullet 1\n• Bullet 2\n" 313 | ).to convert_to "* Bullet 1\n* Bullet 2" 314 | end 315 | end 316 | 317 | context "unbalanced elements" do 318 | let(:html) { "foo" } 319 | 320 | it "should raise an exception" do 321 | expect { 322 | Upmark.convert(html) 323 | }.to raise_error(Upmark::ParseFailed) 324 | end 325 | end 326 | 327 | context "unbalanced elements" do 328 | let(:html) { "287 | messenger bag skateboard 288 |
289 |foo" } 329 | 330 | it "should raise an exception" do 331 | expect { 332 | Upmark.convert(html) 333 | }.to raise_error(Upmark::ParseFailed) 334 | end 335 | end 336 | 337 | context "nested table" do 338 | let(:html) { "
"} 339 | 340 | it "should strip both tables" do 341 | expect(html).to convert_to("Hi\nthere") 342 | end 343 | end 344 | 345 | context "nested unordered lists" do 346 | let(:html) do 347 | <<-HTML 348 |
Hi
there349 |
353 | HTML 354 | end 355 | 356 | it "generates readable output" do 357 | expect(html).to convert_to("* * List item") 358 | end 359 | end 360 | 361 | context "nested ordered lists" do 362 | let(:html) do 363 | <<-HTML 364 |350 |
352 |- List item
351 |365 |
369 | HTML 370 | end 371 | 372 | it "generates readable output" do 373 | expect(html).to convert_to("1. 1. List item") 374 | end 375 | end 376 | end 377 | -------------------------------------------------------------------------------- /spec/errors_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Upmark::ParseFailed, ".ascii_tree" do 2 | it "delegates to a cause object" do 3 | cause = double(ascii_tree: double) 4 | error = Upmark::ParseFailed.new("oh noes", cause) 5 | expect(error.ascii_tree).to be(cause.ascii_tree) 6 | end 7 | 8 | it "returns nil when there is no cause" do 9 | error = Upmark::ParseFailed.new("oh noes", nil) 10 | expect(error.ascii_tree).to be_nil 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require "parslet/rig/rspec" 2 | require "rspec" 3 | require "simplecov" 4 | 5 | if ENV["BUILDBOX"] 6 | SimpleCov.start do 7 | add_filter "spec/" 8 | add_filter "vendor/bundle/" 9 | end 10 | end 11 | 12 | require "upmark" 13 | 14 | RSpec.configure do |config| 15 | config.disable_monkey_patching! 16 | 17 | config.order = :random 18 | 19 | config.example_status_persistence_file_path = '.examples' 20 | end 21 | -------------------------------------------------------------------------------- /spec/unit/lib/upmark/parser/xml_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Upmark::Parser::XML do 2 | let(:parser) { Upmark::Parser::XML.new } 3 | 4 | context "#node" do 5 | it 'will parse ""' do 6 | expect(parser.node).to parse "" 7 | end 8 | it 'will parse "messenger bag skateboard"' do 9 | expect(parser.node).to parse "messenger bag skateboard" 10 | end 11 | it 'will parse html br tags' do 12 | expect(parser.node).to parse '366 |
368 |- List item
367 |One
' 13 | end 14 | it 'will parse "
Twomessenger bag skateboard
"' do 15 | expect( 16 | parser.node 17 | ).to parse "messenger bag skateboard
" 18 | end 19 | it 'will parse "messengerbag
skateboard"' do 20 | expect( 21 | parser.node 22 | ).to parse "messengerbag
skateboard" 23 | end 24 | it 'will parse "messenger
bag
skateboard
"' do 25 | expect( 26 | parser.node 27 | ).to parse "messenger
bag
skateboard
" 28 | end 29 | it 'will parse "messenger
\nbag
\nskateboard
"' do 30 | expect( 31 | parser.node 32 | ).to parse "messenger
\nbag
\nskateboard
" 33 | end 34 | it 'will parse "messenger bag skateboard
"' do 35 | expect( 36 | parser.node 37 | ).to parse "messenger bag skateboard
" 38 | end 39 | end 40 | 41 | context "#empty_element" do 42 | it 'will parse' do 43 | expect(parser.empty_element).to parse '
' 44 | end 45 | end 46 | 47 | context "#element" do 48 | it 'will parse ""' do 49 | expect(parser.element).to parse "" 50 | end 51 | it 'will parse "
messenger bag skateboard
"' do 52 | expect(parser.element).to parse "messenger bag skateboard
" 53 | end 54 | it 'will parse "Some
"' do 55 | expect(parser.element).to parse "
TextSome
" 56 | end 57 | it 'will parse %q{
Text}' do 58 | expect(parser.element).to parse %q{ } 59 | end 60 | it 'will not parse " "' do 61 | expect(parser.element).to_not parse "
" 62 | end 63 | it 'will not parse "
messenger bag skateboard"' do 64 | expect(parser.element).to_not parse "
messenger bag skateboard" 65 | end 66 | it 'will not parse "messenger bag skateboard
"' do 67 | expect(parser.element).to_not parse "messenger bag skateboard" 68 | end 69 | it 'will not parse "messenger bag skateboard
"' do 70 | expect(parser.element).to_not parse "
messenger bag skateboard
" 71 | end 72 | end 73 | 74 | context "#text" do 75 | it 'will parse "messenger bag skateboard"' do 76 | expect(parser.text).to parse "messenger bag skateboard" 77 | end 78 | it 'will not parse "
messenger bag skateboard
"' do 79 | expect(parser.text).to_not parse "messenger bag skateboard
" 80 | end 81 | it 'will not parse " "' do 82 | expect(parser.text).to_not parse " " 83 | end 84 | it 'will not parse ""' do 85 | expect(parser.text).to_not parse "" 86 | end 87 | end 88 | 89 | context "#start_tag" do 90 | it 'will parse %q{}' do 91 | expect(parser.start_tag).to parse %q{ } 92 | end 93 | it 'will parse %q{ }' do 94 | expect(parser.start_tag).to parse %q{ } 95 | end 96 | it 'will parse " " 113 | end 114 | it 'will not parse ""' do 97 | expect(parser.start_tag).to parse " " 101 | end 102 | it 'will not parse "" 98 | end 99 | it 'will not parse " "' do 100 | expect(parser.start_tag).to_not parse ""' do 106 | expect(parser.start_tag).to_not parse "tofu>" 107 | end 108 | end 109 | 110 | context "#end_tag" do 111 | it 'will parse " "' do 112 | expect(parser.end_tag).to parse ""' do 115 | expect(parser.end_tag).to_not parse " " 116 | end 117 | it 'will not parse " "' do 121 | expect(parser.end_tag).to_not parse "/tofu>" 122 | end 123 | end 124 | 125 | context "#empty_br" do 126 | it 'will parse html br tags' do 127 | expect(parser.empty_br).to parse '
' 128 | end 129 | end 130 | 131 | context "#empty_tag" do 132 | it 'will parse %q{}' do 133 | expect(parser.empty_tag).to parse %q{ } 134 | end 135 | it 'will parse %q{ }' do 136 | expect(parser.empty_tag).to parse %q{ } 137 | end 138 | it 'will parse %q{ }' do 139 | expect(parser.empty_tag).to parse %q{ } 140 | end 141 | it 'will not parse " "' do 142 | expect(parser.empty_tag).to_not parse " " 146 | end 147 | it 'will not parse "" 143 | end 144 | it 'will not parse " "' do 145 | expect(parser.empty_tag).to_not parse ""' do 151 | expect(parser.empty_tag).to_not parse "/tofu>" 152 | end 153 | end 154 | 155 | context "#name" do 156 | it 'will parse "p"' do 157 | expect(parser.name).to parse "p" 158 | end 159 | it 'will parse "h1"' do 160 | expect(parser.name).to parse "h1" 161 | end 162 | it 'will not parse "1h"' do 163 | expect(parser.name).to_not parse "1h" 164 | end 165 | it 'will not parse "h 1"' do 166 | expect(parser.name).to_not parse "h 1" 167 | end 168 | end 169 | 170 | context "#attribute" do 171 | it 'will parse %q{art="party organic"}' do 172 | expect(parser.attribute).to parse %q{art="party organic"} 173 | end 174 | it 'will parse %q{art=\'party organic\'}' do 175 | expect(parser.attribute).to parse %q{art='party organic'} 176 | end 177 | it 'will parse %q{art="party\'organic"}' do 178 | expect(parser.attribute).to parse %q{art="party'organic"} 179 | end 180 | it 'will parse %q{art=\'party"organic\'}' do 181 | expect(parser.attribute).to parse %q{art='party"organic'} 182 | end 183 | it 'will not parse "art"' do 184 | expect(parser.attribute).to_not parse "art" 185 | end 186 | it 'will not parse "art="' do 187 | expect(parser.attribute).to_not parse "art=" 188 | end 189 | it 'will not parse "art=party"' do 190 | expect(parser.attribute).to_not parse "art=party" 191 | end 192 | it 'will not parse %q{="party organic"}' do 193 | expect(parser.attribute).to_not parse %q{="party organic"} 194 | end 195 | it 'will not parse %q{art="party organic\'}' do 196 | 
expect(parser.attribute).to_not parse %q{art="party organic'} 197 | end 198 | it 'will not parse %q{art=\'party organic"}' do 199 | expect(parser.attribute).to_not parse %q{art='party organic"} 200 | end 201 | end 202 | 203 | context "#parse" do 204 | RSpec::Matchers.define :convert do |html| 205 | match do |parser| 206 | @actual = parser.parse(html) 207 | @actual == @expected 208 | end 209 | 210 | chain :to do |ast| 211 | @expected = ast 212 | end 213 | attr_reader :expected 214 | 215 | failure_message do 216 | %Q{expected "#{html}" to parse to "#{@expected.inspect}" but was #{@result.inspect}} 217 | end 218 | 219 | diffable 220 | end 221 | 222 | context "single tag" do 223 | it 'is parsed as a single element' do 224 | expect(parser).to convert(" messenger
").to([ 225 | { 226 | element: { 227 | start_tag: {name: "p", attributes: []}, 228 | end_tag: {name: "p"}, 229 | children: [{text: "messenger"}] 230 | } 231 | } 232 | ]) 233 | end 234 | 235 | it 'will ignore empty text tags' do 236 | expect(parser).to convert('').to( 237 | [ 238 | { 239 | empty: 240 | { 241 | start_tag: { name: "p", attributes: [] }, 242 | end_tag: { name: "p" }, 243 | } 244 | } 245 | ] 246 | ) 247 | end 248 | end 249 | 250 | context "empty tag" do 251 | it 'is parsed an empty_tag element' do 252 | expect(parser).to convert("
").to([ 253 | { 254 | element: { 255 | empty_tag: {name: "br", attributes: []} 256 | } 257 | } 258 | ]) 259 | end 260 | end 261 | 262 | context "single tag with attributes" do 263 | let(:html) { %q{messenger bag skateboard} } 264 | 265 | it 'is parsed an element with an attribute subtree' do 266 | expect(parser).to convert(html).to([ 267 | { 268 | element: { 269 | start_tag: { 270 | name: "a", 271 | attributes: [ 272 | {name: "href", value: "http://helvetica.com/"}, 273 | {name: "title", value: "art party organic"} 274 | ] 275 | }, 276 | end_tag: {name: "a"}, 277 | children: [{text: "messenger bag skateboard"}] 278 | } 279 | } 280 | ]) 281 | end 282 | end 283 | 284 | context "multiple inline tags" do 285 | let(:html) { "messenger
bag
skateboard
" } 286 | 287 | it 'converts to multiple elements' do 288 | expect(parser).to convert(html).to([ 289 | { 290 | element: { 291 | start_tag: {name: "p", attributes: []}, 292 | end_tag: {name: "p"}, 293 | children: [{text: "messenger"}] 294 | } 295 | }, { 296 | element: { 297 | start_tag: {name: "p", attributes: []}, 298 | end_tag: {name: "p"}, 299 | children: [{text: "bag"}] 300 | } 301 | }, { 302 | element: { 303 | start_tag: {name: "p", attributes: []}, 304 | end_tag: {name: "p"}, 305 | children: [{text: "skateboard"}] 306 | } 307 | } 308 | ]) 309 | end 310 | end 311 | 312 | context "multiple tags" do 313 | let(:html) { "messenger
\nbag
\nskateboard
" } 314 | 315 | it 'converts to multiple elements' do 316 | expect(parser).to convert(html).to([ 317 | { 318 | element: { 319 | start_tag: {name: "p", attributes: []}, 320 | end_tag: {name: "p"}, 321 | children: [{text: "messenger"}] 322 | } 323 | }, { 324 | text: "\n" 325 | }, { 326 | element: { 327 | start_tag: {name: "p", attributes: []}, 328 | end_tag: {name: "p"}, 329 | children: [{text: "bag"}] 330 | } 331 | }, { 332 | text: "\n" 333 | }, { 334 | element: { 335 | start_tag: {name: "p", attributes: []}, 336 | end_tag: {name: "p"}, 337 | children: [{text: "skateboard"}] 338 | } 339 | } 340 | ]) 341 | end 342 | end 343 | 344 | context "nested tags" do 345 | let(:html) { "messenger bag skateboard
" } 346 | 347 | it 'converts to multiple nested elements' do 348 | expect(parser).to convert(html).to([ 349 | { 350 | element: { 351 | start_tag: {name: "p", attributes: []}, 352 | end_tag: {name: "p"}, 353 | children: [ 354 | { 355 | text: "messenger " 356 | }, { 357 | element: { 358 | start_tag: {name: "strong", attributes: []}, 359 | children: [{text: "bag"}], 360 | end_tag: {name: "strong"} 361 | } 362 | }, { 363 | text: " skateboard" 364 | } 365 | ] 366 | } 367 | } 368 | ]) 369 | end 370 | end 371 | end 372 | end 373 | -------------------------------------------------------------------------------- /spec/unit/lib/upmark/transform/markdown_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Upmark::Transform::Markdown do 2 | def transform(ast) 3 | Upmark::Transform::Markdown.new.apply(ast) 4 | end 5 | 6 | let(:transformed_ast) { transform(ast) } 7 | 8 | context "#apply" do 9 | context '
' do 10 | let(:ast) { [{ element: { name: 'br' }}] } 11 | 12 | it 'will transform to markdown' do 13 | expect(transformed_ast).to eq ["\n"] 14 | end 15 | end 16 | 17 | context "" do 18 | context "single tag" do 19 | let(:ast) do 20 | [ 21 | { 22 | element: { 23 | name: "p", 24 | attributes: [], 25 | children: [{text: "messenger bag skateboard"}], 26 | ignore: false 27 | } 28 | } 29 | ] 30 | end 31 | 32 | it 'transforms to markdown' do 33 | expect( 34 | transformed_ast 35 | ).to eq(["messenger bag skateboard\n\n"]) 36 | end 37 | end 38 | 39 | context "multiple tags" do 40 | let(:ast) do 41 | [ 42 | { 43 | element: { 44 | name: "p", 45 | attributes: [], 46 | children: [{text: "messenger"}], 47 | ignore: false 48 | } 49 | }, { 50 | element: { 51 | name: "p", 52 | attributes: [], 53 | children: [{text: "bag"}], 54 | ignore: false 55 | } 56 | }, { 57 | element: { 58 | name: "p", 59 | attributes: [], 60 | children: [{text: "skateboard"}], 61 | ignore: false 62 | } 63 | } 64 | ] 65 | end 66 | 67 | it 'transforms to markdown' do 68 | expect( 69 | transformed_ast 70 | ).to eq(["messenger\n\n", "bag\n\n", "skateboard\n\n"]) 71 | end 72 | end 73 | end 74 | 75 | context "" do 76 | context "single tag" do 77 | let(:ast) do 78 | a_tag( 79 | href: "http://helvetica.com/", 80 | title: "art party organic", 81 | ) 82 | end 83 | 84 | def a_tag(attributes) 85 | [ 86 | { 87 | element: { 88 | name: "a", 89 | attributes: attributes.map do |key, value| 90 | { name: key.to_s, value: value } 91 | end, 92 | children: [{text: "messenger bag skateboard"}], 93 | ignore: false 94 | } 95 | } 96 | ] 97 | end 98 | 99 | it 'transforms to markdown' do 100 | expect( 101 | transformed_ast 102 | ).to eq([%q{[messenger bag skateboard](http://helvetica.com/ "art party organic")}]) 103 | end 104 | 105 | it 'transforms mailto to markdown' do 106 | expect( 107 | transform a_tag(href: 'mailto:a@example.com', title: 'Some Path') 108 | ).to eq([%q{[messenger bag skateboard](mailto:a@example.com "Some Path")}]) 
109 | end 110 | 111 | it 'strips local urls to their text' do 112 | expect( 113 | transform a_tag(href: 'file://some/path', title: 'Some Path') 114 | ).to eq ['messenger bag skateboard'] 115 | end 116 | 117 | it 'strips relative urls to their alt text' do 118 | expect( 119 | transform a_tag(src: 'some/path', title: 'Some Path') 120 | ).to eq ['messenger bag skateboard'] 121 | end 122 | end 123 | end 124 | 125 | context "
" do 126 | context "empty tag" do 127 | let(:ast) do 128 | img( 129 | src: "http://helvetica.com/image.gif", 130 | title: "art party organic", 131 | alt: "messenger bag skateboard", 132 | ) 133 | end 134 | 135 | def img(attributes) 136 | [ 137 | { 138 | element: { 139 | name: "img", 140 | attributes: attributes.map do |key, value| 141 | { name: key.to_s, value: value } 142 | end, 143 | children: [], 144 | ignore: false 145 | } 146 | } 147 | ] 148 | end 149 | 150 | it 'transforms to markdown' do 151 | expect( 152 | transformed_ast 153 | ).to eq([%q{}]) 154 | end 155 | 156 | it 'strips file urls to their alt text or title' do 157 | expect( 158 | transform img(src: 'file://some/path', alt: 'Some', title: 'Path') 159 | ).to eq ['Some'] 160 | expect( 161 | transform img(src: 'file://some/path', title: 'Some Path') 162 | ).to eq ['Some Path'] 163 | end 164 | 165 | it 'strips relative urls to their alt text' do 166 | expect( 167 | transform img(src: 'some/path', alt: 'Some', title: 'Path') 168 | ).to eq ['Some'] 169 | expect( 170 | transform img(src: 'some/path', title: 'Some Path') 171 | ).to eq ['Some Path'] 172 | end 173 | end 174 | end 175 | end 176 | end 177 | -------------------------------------------------------------------------------- /upmark.gemspec: -------------------------------------------------------------------------------- 1 | Gem::Specification.new do |s| 2 | s.name = "upmark" 3 | s.version = "1.1.0" 4 | s.authors = ["Josh Bassett", "Gus Gollings", "James Healy"] 5 | s.email = "dev@theconversation.edu.au" 6 | s.homepage = "http://github.com/conversation/upmark" 7 | s.summary = "A HTML to Markdown converter." 8 | s.description = "Upmark has the skills to convert your HTML to Markdown." 
9 | 10 | s.required_ruby_version = ">= 1.9.3" 11 | s.rubyforge_project = "upmark" 12 | 13 | s.files = Dir.glob("{lib,spec}/**/*") + ["Rakefile", "LICENSE.md", "README.md"] 14 | s.test_files = Dir.glob("{spec}/**/*") 15 | s.executables = ["upmark"] 16 | 17 | s.add_development_dependency "rspec", "~> 3.7" 18 | s.add_development_dependency "rake" 19 | s.add_development_dependency "simplecov" 20 | 21 | s.add_runtime_dependency "parslet", "~> 1.8.2" 22 | end 23 | --------------------------------------------------------------------------------