├── .rspec ├── README.mdown ├── convert.rb ├── example.rb ├── html2confluence.gemspec ├── lib └── html2confluence.rb └── spec ├── checkbox_examples_spec.rb ├── combination_examples_spec.rb ├── complex_tables_spec.rb ├── html2confluence_spec.rb └── jira_examples_spec.rb /.rspec: -------------------------------------------------------------------------------- 1 | --color -------------------------------------------------------------------------------- /README.mdown: -------------------------------------------------------------------------------- 1 | # HTML2Confluence # 2 | 3 | A quick and simple way to convert HTML to Confluence Wiki Markup, based on 4 | html2textile. 5 | 6 | parser = HTMLToConfluenceParser.new 7 | parser.feed(your_html) 8 | puts parser.to_wiki_markup 9 | 10 | There is some JIRA/Confluence markup that we do not support: 11 | 12 | * The [mailto:] tag. 13 | * The {anchor:} tag. 14 | 15 | 16 | ## Installation ## 17 | 18 | $ gem build html2confluence.gemspec 19 | $ gem install html2confluence-1.3.23.gem 20 | 21 | ## Command line usage ## 22 | 23 | $ ruby convert.rb /path/to/file.html 24 | -------------------------------------------------------------------------------- /convert.rb: -------------------------------------------------------------------------------- 1 | require 'html2confluence' 2 | 3 | if ARGV.empty? 4 | puts "Error: Pass HTML file as first argument" 5 | exit 1 6 | end 7 | 8 | file = File.open(ARGV[0]) 9 | parser = HTMLToConfluenceParser.new 10 | parser.feed(file.read) 11 | puts parser.to_wiki_markup 12 | -------------------------------------------------------------------------------- /example.rb: -------------------------------------------------------------------------------- 1 | require 'html2confluence' 2 | 3 | first_block = < 5 |

6 | Converting HTML to Textile with Ruby 7 |

8 | 9 |

10 | 23 November 2007 11 | (7:51 pm) 12 |

13 | 14 |

15 | 16 | 17 |

18 | 19 |

20 | By James Stewart 21 |
filed under: 22 | Snippets 23 |
tagged: , 24 | , 25 | , 26 | , 27 | , 28 | 29 |

30 | 31 | 32 |
33 | 34 |
35 | 36 | END 37 | 38 | parser = HTMLToConfluenceParser.new 39 | parser.feed(first_block) 40 | puts parser.to_wiki_markup 41 | -------------------------------------------------------------------------------- /html2confluence.gemspec: -------------------------------------------------------------------------------- 1 | Gem::Specification.new do |s| 2 | s.platform = Gem::Platform::RUBY 3 | s.name = 'html2confluence' 4 | s.version = "1.3.23" 5 | s.summary = 'Converter from HTML to Confluence Wiki Markup' 6 | s.description = 'Provides an SGML parser to convert HTML into the Wiki Markup format' 7 | 8 | s.required_ruby_version = '>= 1.8.6' 9 | s.required_rubygems_version = ">= 1.3.6" 10 | 11 | s.authors = ['k1w1', 'James Stewart', 'Mark Woods'] 12 | s.homepage = 'http://github.com/k1w1/html2confluence' 13 | 14 | s.require_path = 'lib' 15 | s.files = Dir.glob("{lib,spec}/**/*") + %w(example.rb README.mdown) 16 | 17 | s.add_dependency "nokogiri" 18 | end 19 | -------------------------------------------------------------------------------- /lib/html2confluence.rb: -------------------------------------------------------------------------------- 1 | require 'rexml/document' 2 | 3 | require 'nokogiri' # For validating html from our editor 4 | 5 | # A class to convert HTML to confluence markup. Based on the python parser 6 | # found at http://aftnn.org/content/code/html2textile/ 7 | # 8 | # Author:: James Stewart (mailto:james@jystewart.net) 9 | # Copyright:: Copyright (c) 2007 James Stewart 10 | # License:: Distributes under the same terms as Ruby 11 | 12 | # This class is an implementation of an SGMLParser designed to convert 13 | # HTML to atlassian confluence wiki markup. 
14 | # 15 | # Example usage: 16 | # parser = HTMLToConfluenceParser.new 17 | # parser.feed(input_html) 18 | # puts parser.to_wiki_markup 19 | class HTMLToConfluenceParser 20 | 21 | attr_accessor :result 22 | attr_accessor :data_stack 23 | attr_accessor :a_href 24 | attr_accessor :a_title 25 | attr_accessor :list_stack 26 | 27 | def initialize(verbose=nil) 28 | @output = String.new 29 | @stack = [] 30 | @preserveWhitespace = false 31 | @last_write = "" 32 | @tableHeaderRow = false 33 | self.result = [] 34 | self.data_stack = [] 35 | self.list_stack = [] 36 | end 37 | 38 | # Normalise space in the same manner as HTML. Any substring of multiple 39 | # whitespace characters will be replaced with a single space char. 40 | def normalise_space(s) 41 | return s if @preserveWhitespace 42 | s.to_s.gsub(/\s+/x, ' ') 43 | end 44 | 45 | # Escape any special characters. 46 | def escape_special_characters(s) 47 | return s 48 | # Escaping is disabled now since it caused more problems that not having 49 | # it. The insertion of unecessary escaping was annoying for JIRA users. 50 | s.to_s.gsub(/[*#+\-_{}|]/) do |s| 51 | "\\#{s}" 52 | end 53 | end 54 | 55 | def make_block_start_pair(tag, attributes) 56 | if tag == 'p' 57 | # don't markup paragraphs explicitly unless necessary (i.e. there are id or class attributes) 58 | write("\n\n") 59 | else 60 | write("\n\n#{tag}. ") 61 | end 62 | start_capture(tag) 63 | end 64 | 65 | def make_block_end_pair 66 | stop_capture_and_write 67 | write("\n\n") 68 | end 69 | 70 | def make_quicktag_start_pair(tag, wrapchar, attributes) 71 | @skip_quicktag = ( tag == 'span') 72 | start_capture(tag) 73 | end 74 | 75 | def make_quicktag_end_pair(wrapchar) 76 | content = stop_capture 77 | 78 | # Don't make quicktags with empty content. 79 | if content.join("").strip.empty? 80 | write(content) 81 | return 82 | end 83 | 84 | unless @skip_quicktag 85 | unless in_nested_quicktag? 
86 | #write([" "]) 87 | end 88 | write(["#{wrapchar}"]) 89 | end 90 | write(content.collect(&:strip)) 91 | write([wrapchar]) unless @skip_quicktag 92 | unless in_nested_quicktag? 93 | #write([" "]) 94 | end 95 | end 96 | 97 | def in_nested_quicktag? 98 | @quicktags ||= QUICKTAGS.keys 99 | @stack.size > 1 && @quicktags.include?(@stack[@stack.size-1]) && @quicktags.include?(@stack[@stack.size-2]) 100 | end 101 | 102 | def write(d) 103 | @last_write = Array(d).join("") 104 | if self.data_stack.size < 2 105 | self.result += Array(d) 106 | else 107 | self.data_stack[-1] += Array(d) 108 | end 109 | end 110 | 111 | def start_capture(tag) 112 | self.data_stack.push([]) 113 | end 114 | 115 | def stop_capture 116 | self.data_stack.pop 117 | end 118 | 119 | def stop_capture_and_write 120 | self.write(self.data_stack.pop) 121 | end 122 | 123 | def handle_data(data) 124 | if @preserveWhitespace 125 | write(data) 126 | else 127 | data ||= "" 128 | data = normalise_space(escape_special_characters(data)) 129 | if @last_write[-1] =~ /\s/ 130 | data = data.lstrip # Collapse whitespace if the previous character was whitespace. 
131 | end 132 | 133 | write(data) 134 | end 135 | end 136 | 137 | %w[1 2 3 4 5 6].each do |num| 138 | define_method "start_h#{num}" do |attributes| 139 | make_block_start_pair("h#{num}", attributes) 140 | end 141 | 142 | define_method "end_h#{num}" do 143 | make_block_end_pair 144 | end 145 | end 146 | 147 | PAIRS = { 'bq' => 'bq', 'p' => 'p' } 148 | QUICKTAGS = { 'b' => '*', 'strong' => '*', 'del' => '-', 'strike' => '-', 149 | 'i' => '_', 'ins' => '+', 'u' => '+', 'em' => '_', 'cite' => '??', 150 | 'sup' => '^', 'sub' => '~'} 151 | 152 | PAIRS.each do |key, value| 153 | define_method "start_#{key}" do |attributes| 154 | make_block_start_pair(value, attributes) 155 | end 156 | 157 | define_method "end_#{key}" do 158 | make_block_end_pair 159 | end 160 | end 161 | 162 | QUICKTAGS.each do |key, value| 163 | define_method "start_#{key}" do |attributes| 164 | make_quicktag_start_pair(key, value, attributes) 165 | end 166 | 167 | define_method "end_#{key}" do 168 | make_quicktag_end_pair(value) 169 | end 170 | end 171 | 172 | def start_div(attrs) 173 | write("\n\n") 174 | start_capture("div") 175 | end 176 | 177 | def end_div 178 | stop_capture_and_write 179 | write("\n\n") 180 | end 181 | 182 | def start_tt(attrs) 183 | write("{{") 184 | end 185 | 186 | def end_tt 187 | write("}}") 188 | end 189 | 190 | def start_ol(attrs) 191 | self.list_stack.push :ol 192 | end 193 | 194 | def end_ol 195 | self.list_stack.pop 196 | if self.list_stack.empty? 197 | write("\n") 198 | end 199 | end 200 | 201 | def start_ul(attrs) 202 | if attrs['type'] == "square" 203 | self.list_stack.push :ul_square 204 | else 205 | self.list_stack.push :ul 206 | end 207 | end 208 | 209 | def end_ul 210 | self.list_stack.pop 211 | if self.list_stack.empty? 
212 | write("\n") 213 | end 214 | end 215 | 216 | def start_li(attrs) 217 | write("\n") 218 | write(self.list_stack.collect {|s| 219 | case s 220 | when :ol then "#" 221 | when :ul then "*" 222 | when :ul_square then "-" 223 | end 224 | }.join("")) 225 | write(" ") 226 | start_capture("li") 227 | end 228 | 229 | def end_li 230 | stop_capture_and_write 231 | end 232 | 233 | def start_a(attrs) 234 | self.a_href = attrs['href'] 235 | self.a_title = attrs['title'] 236 | if self.a_href 237 | write("[") 238 | start_capture("a") 239 | end 240 | end 241 | 242 | def start_input(attrs) 243 | if attrs['type'] == "checkbox" 244 | if attrs['checked'] 245 | write("(/) ") 246 | else 247 | write("(x) ") 248 | end 249 | end 250 | end 251 | 252 | # Jira doesn't like it when there's space padding around images in links, like thumbnails. 253 | # By checking to see if the link content is a single image, we can strip those out and preserve the link function 254 | def link_content_filtering_out_images(content) 255 | if content.join("").match(/\A\s*!(.+)!\s*\Z/) 256 | ["!" + $1 + "!"] 257 | else 258 | content 259 | end 260 | end 261 | 262 | def end_a 263 | if self.a_href 264 | content = stop_capture 265 | if self.a_href.gsub(/^#/, "") == content.join("") 266 | write([self.a_href, "] "]) 267 | else 268 | write(link_content_filtering_out_images(content)) 269 | write(["|", self.a_href, "] "]) 270 | end 271 | 272 | self.a_href = self.a_title = false 273 | end 274 | end 275 | 276 | def start_font(attrs) 277 | color = attrs['color'] 278 | write("{color:#{color}}") 279 | end 280 | 281 | def end_font 282 | write("{color}") 283 | end 284 | 285 | def start_img(attrs) 286 | write([" !", attrs["src"], "! 
"]) 287 | end 288 | 289 | def end_img 290 | end 291 | 292 | def start_table(attrs) 293 | write("\n\n") 294 | end 295 | 296 | def end_table 297 | write("\n\n") 298 | end 299 | 300 | def start_caption(attrs) 301 | write("\n") 302 | end 303 | 304 | def end_caption 305 | write("\n") 306 | end 307 | 308 | def start_tr(attrs) 309 | write("\n") 310 | end 311 | 312 | def end_tr 313 | if @tableHeaderRow 314 | write("||") 315 | else 316 | write("|") 317 | end 318 | end 319 | 320 | def start_th(attrs) 321 | write("||") 322 | start_capture("th") 323 | @tableHeaderRow = true 324 | end 325 | 326 | def end_th 327 | s = stop_capture 328 | write(cleanup_table_cell(s)) 329 | end 330 | 331 | def start_td(attrs) 332 | write("|") 333 | start_capture("td") 334 | @tableHeaderRow = false 335 | end 336 | 337 | def end_td 338 | s = stop_capture 339 | write(cleanup_table_cell(s)) 340 | end 341 | 342 | def cleanup_table_cell(s) 343 | clean_content = (s || []).join("").strip.gsub(/\n{2,}/, "\n" + '\\\\\\' + "\n") 344 | # Don't allow a completely empty cell because that will look like a header. 345 | clean_content = " " if clean_content.empty? 
346 | [clean_content] 347 | end 348 | 349 | def start_br(attrs) 350 | write("\n") 351 | end 352 | 353 | def start_hr(attrs) 354 | write("----") 355 | end 356 | 357 | def start_blockquote(attrs) 358 | write("\n{quote}\n") 359 | start_capture("blockquote") 360 | end 361 | 362 | def end_blockquote 363 | stop_capture_and_write 364 | write("\n{quote}\n") 365 | end 366 | 367 | def start_code(attrs) 368 | @preserveWhitespace = true 369 | write("{code}") 370 | end 371 | 372 | def end_code 373 | stop_capture_and_write 374 | write("{code}") 375 | @preserveWhitespace = false 376 | end 377 | 378 | def start_pre(attrs) 379 | @preserveWhitespace = true 380 | write("{noformat}\n") 381 | end 382 | 383 | def end_pre 384 | stop_capture_and_write 385 | write("{noformat}") 386 | @preserveWhitespace = false 387 | end 388 | 389 | def preprocess(data) 390 | # clean up leading and trailing spaces within phrase modifier tags 391 | quicktags_for_re = QUICKTAGS.keys.uniq.join('|') 392 | leading_spaces_re = /(<(?:#{quicktags_for_re})(?:\s+[^>]*)?>)( +|)/ 393 | tailing_spaces_re = /( +|)(<\/(?:#{quicktags_for_re})(?:\s+[^>]*)?>)/ 394 | while data =~ leading_spaces_re 395 | data.gsub!(leading_spaces_re,'\2\1') 396 | end 397 | while data =~ tailing_spaces_re 398 | data.gsub!(tailing_spaces_re,'\2\1') 399 | end 400 | # replace non-breaking spaces 401 | data.gsub!(/&(nbsp|#160);/,' ') 402 | # replace special entities. 403 | data.gsub!(/&(mdash|#8212);/,'---') 404 | data.gsub!(/&(ndash|#8211);/,'--') 405 | 406 | # remove empty blockquotes and list items (other empty elements are easy enough to deal with) 407 | data.gsub!(/
\s*(]*>)?\s*<\/blockquote>/x,' ') 408 | 409 | # Fix unclosed
410 | data.gsub!(/]*>/, "
") 411 | 412 | # Remove 413 | data.gsub!(/]*>/, "") 414 | 415 | # Fix unclosed
416 | data.gsub!(/]*>/, "
") 417 | 418 | # Fix unclosed 419 | data.gsub!(/(]+)(?/, '\1 />') 420 | 421 | # Convert jira emoji 422 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/smile\.(gif|png)"[^<>]*>/, ":)") 423 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/sad\.(gif|png)"[^<>]*>/, ":(") 424 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/tongue\.(gif|png)"[^<>]*>/, ":P") 425 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/biggrin\.(gif|png)"[^<>]*>/, ":D") 426 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/wink\.(gif|png)"[^<>]*>/, ";)") 427 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/thumbs_up\.(gif|png)"[^<>]*>/, "(y)") 428 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/thumbs_down\.(gif|png)"[^<>]*>/, "(n)") 429 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/information\.(gif|png)"[^<>]*>/, "(i)") 430 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/check\.(gif|png)"[^<>]*>/, "(/)") 431 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/error\.(gif|png)"[^<>]*>/, "(x)") 432 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/warning\.(gif|png)"[^<>]*>/, "(!)") 433 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/add\.(gif|png)"[^<>]*>/, "(+)") 434 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/forbidden\.(gif|png)"[^<>]*>/, "(-)") 435 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/help_16\.(gif|png)"[^<>]*>/, "(?)") 436 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/lightbulb_on\.(gif|png)"[^<>]*>/, "(on)") 437 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/lightbulb\.(gif|png)"[^<>]*>/, "(off)") 438 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/star_yellow\.(gif|png)"[^<>]*>/, "(*)") 439 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/star_red\.(gif|png)"[^<>]*>/, "(*r)") 440 | 
data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/star_green\.(gif|png)"[^<>]*>/, "(*g)") 441 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/star_blue\.(gif|png)"[^<>]*>/, "(*b)") 442 | data.gsub!(/]*src="([\w.-_:\/]+|\/)images\/icons\/emoticons\/star_yellow\.(gif|png)"[^<>]*>/, "(*y)") 443 | 444 | # Parse with nokogiri to ensure not tags are left unclosed 445 | # Ensure a parsing error from Nokogiri can't stop processing to get better error from REXML 446 | begin 447 | validated_data = Nokogiri::HTML::fragment(data.gsub('&', '&_')).to_xml 448 | data = validated_data.gsub('&_', '&') 449 | rescue Nokogiri::XML::SyntaxError => e 450 | end 451 | 452 | data 453 | end 454 | 455 | # Return the textile after processing 456 | def to_wiki_markup 457 | fix_textile_whitespace!(result.join).gsub(/\n(\*|#)+\s*\n(\*|#)+/) do |match| 458 | "\n#{match.split("\n").last.squeeze(' ')}" 459 | end 460 | end 461 | 462 | def fix_textile_whitespace!(output) 463 | # fixes multiple blank lines, blockquote indicator followed by blank lines, and trailing whitespace after quicktags 464 | # modifies input string and also returns it 465 | output.gsub!(/(\n\s*){2,}/,"\n\n") 466 | output.gsub!(/bq. \n+(\w)/,'bq. \1') 467 | QUICKTAGS.values.uniq.each do |t| 468 | output.gsub!(/ #{Regexp.escape(t)}[ \t]+#{Regexp.escape(t)} /,' ') # removes empty quicktags 469 | #output.gsub!(/(\[?#{Regexp.escape(t)})(\w+)([^#{Regexp.escape(t)}]+)(\s+)(#{Regexp.escape(t)}\]?)/,'\1\2\3\5\4') # fixes trailing whitespace before closing quicktags 470 | end 471 | #output.squeeze!(' ') 472 | #output.gsub!(/^[ \t]/,'') # leading whitespace 473 | #output.gsub!(/[ \t]$/,'') # trailing whitespace 474 | output.strip! 475 | return output 476 | end 477 | 478 | def feed(data) 479 | stream = StringIO.new(preprocess("
#{data}
")) 480 | 481 | REXML::Document.parse_stream(stream, self) 482 | end 483 | 484 | def tag_start(name, attributes = {}) 485 | #puts "

Start #{name}

" 486 | @stack.push(name) 487 | if self.respond_to?("start_#{name}") 488 | self.send("start_#{name}", attributes) 489 | end 490 | end 491 | 492 | def tag_end(name) 493 | #puts "

End #{name}

" 494 | if self.respond_to?("end_#{name}") 495 | self.send("end_#{name}") 496 | end 497 | @stack.pop 498 | end 499 | 500 | def text(string) 501 | handle_data(string) 502 | end 503 | 504 | def comment(comment) 505 | # Comments are ignored. 506 | end 507 | 508 | def cdata(data) 509 | # CDATA is ignored. 510 | end 511 | end 512 | -------------------------------------------------------------------------------- /spec/checkbox_examples_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | $LOAD_PATH << File.join(File.dirname(__FILE__), '..', 'lib') 3 | require 'html2confluence' 4 | 5 | describe HTMLToConfluenceParser, "when running checkbox examples" do 6 | 7 | it "should match checkboxes" do 8 | html = <<-END 9 |
    10 |
  • 11 |
  • 12 |
  • 13 |
  • 14 |
  • 15 |
16 |
17 | END 18 | 19 | markup = <<-END 20 | * (/) Example 1 21 | * (/) Example 2 22 | * (x) Example 3 23 | * (/) Example 4 24 | * (x) Example 5 25 | END 26 | 27 | parser = HTMLToConfluenceParser.new 28 | parser.feed(html) 29 | expect(parser.to_wiki_markup.strip).to include(markup.strip) 30 | end 31 | 32 | end 33 | -------------------------------------------------------------------------------- /spec/combination_examples_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | $LOAD_PATH << File.join(File.dirname(__FILE__), '..', 'lib') 3 | require 'html2confluence' 4 | 5 | describe HTMLToConfluenceParser, "when running combination examples" do 6 | 7 | it "should match complex examples" do 8 | html = <<-END 9 |
    10 |
  1. a
  2. 11 |
  3. numbered item that is underlined.
  4. 12 |
  5. list
  6. 13 |
14 | END 15 | 16 | markup = <<-END 17 | # a 18 | # numbered *item* that is +underlined+. 19 | # list 20 | END 21 | 22 | 23 | parser = HTMLToConfluenceParser.new 24 | parser.feed(html) 25 | expect(parser.to_wiki_markup.strip).to include(markup.strip) 26 | end 27 | 28 | it "should match nested lists" do 29 | html = <<-END 30 |

One line

31 |
    32 |
  • Nested
  • 33 |
  • 34 |
      35 |
    1. bullets
    2. 36 |
    3. go
    4. 37 |
    5. here
    6. 38 |
    7. 39 |
        40 |
      1. dfsdf
      2. 41 |
      3. dsfs
      4. 42 |
      43 |
    8. 44 |
    45 |
  • 46 |
  • Final bullet
  • 47 |
48 | 49 |

More stuff too

50 | 51 |
    52 |
  • In
  • 53 |
  • 54 |
      55 |
    • and
    • 56 |
    57 |
  • 58 |
  • out
  • 59 |
  • 60 |
      61 |
    1. with numbers
    2. 62 |
    3. 63 |
        64 |
      • and sub-bullets
      • 65 |
      66 |
    4. 67 |
    68 |
  • 69 |
  • and back out
  • 70 |
71 | 72 |

With nice formatting.

73 | END 74 | 75 | markup = <<-END 76 | One line 77 | 78 | * Nested 79 | *# bullets 80 | *# go 81 | *# here 82 | *## dfsdf 83 | *## dsfs 84 | * Final bullet 85 | 86 | More stuff too 87 | 88 | * In 89 | ** and 90 | * out 91 | *# with numbers 92 | *#* and sub-bullets 93 | * and back out 94 | 95 | h1. With +nice+ formatting. 96 | END 97 | 98 | parser = HTMLToConfluenceParser.new 99 | parser.feed(html) 100 | expect(parser.to_wiki_markup.strip).to include(markup.strip) 101 | end 102 | 103 | it "should match nested blockquotes" do 104 | html = <<-END 105 |
content here
106 | END 107 | 108 | markup = <<-END 109 | {quote}\nbq. content here\n{quote} 110 | END 111 | 112 | parser = HTMLToConfluenceParser.new 113 | parser.feed(html) 114 | expect(parser.to_wiki_markup.strip).to include(markup.strip) 115 | end 116 | 117 | it "should handle empty paragraphs" do 118 | html = <<-END 119 |

Previous


Scenario 4a: Existing deletes their ID
120 | Given I am an existing user

121 | END 122 | 123 | markup = "Previous\n\n*Scenario 4a: Existing deletes their ID*\n*Given* I am an existing user" 124 | 125 | parser = HTMLToConfluenceParser.new 126 | parser.feed(html) 127 | expect(parser.to_wiki_markup.strip).to eq(markup) 128 | end 129 | 130 | it "should handle empty bold sections" do 131 | html = <<-END 132 |

Previous line

133 |


Scenario 4a: Existing deletes their ID
134 | Given I am an existing user

135 | END 136 | 137 | markup = "Previous line\n\n*Scenario 4a: Existing deletes their ID*\n*Given* I am an existing user" 138 | 139 | parser = HTMLToConfluenceParser.new 140 | parser.feed(html) 141 | expect(parser.to_wiki_markup.strip).to eq(markup) 142 | end 143 | 144 | it "doesn't remove extra newlines" do 145 | html = "

And first line

\n\n


second line

\n\n" 146 | 147 | markup = "*And* first line\n\n*second line*" 148 | 149 | parser = HTMLToConfluenceParser.new 150 | parser.feed(html) 151 | expect(parser.to_wiki_markup.strip).to eq(markup) 152 | end 153 | 154 | it "handles unclosed img tags" do 155 | html = "
\n\n" 156 | 157 | markup = "!a source!" 158 | 159 | parser = HTMLToConfluenceParser.new 160 | parser.feed(html) 161 | expect(parser.to_wiki_markup.strip).to eq(markup) 162 | end 163 | 164 | it "handles wbr tags" do 165 | html = "
familiar with the XMLHttpRequest Object
\n\n" 166 | 167 | markup = "familiar with the XMLHttpRequest Object" 168 | parser = HTMLToConfluenceParser.new 169 | parser.feed(html) 170 | expect(parser.to_wiki_markup.strip).to eq(markup) 171 | 172 | end 173 | 174 | it "should handle unclosed tags" do 175 | html = <<-END 176 |

Previous line

177 |
178 | END 179 | 180 | markup = "Previous line\n\n----" 181 | 182 | parser = HTMLToConfluenceParser.new 183 | parser.feed(html) 184 | expect(parser.to_wiki_markup.strip).to eq(markup) 185 | end 186 | 187 | it "should handle HTML comments" do 188 | html = <<-END 189 |

A

190 | END 191 | 192 | markup = "A" 193 | 194 | parser = HTMLToConfluenceParser.new 195 | parser.feed(html) 196 | expect(parser.to_wiki_markup.strip).to eq(markup) 197 | end 198 | 199 | it "should handle CDATA elements" do 200 | html = <<-END 201 |

A

202 | 203 | END 204 | 205 | markup = "A" 206 | 207 | parser = HTMLToConfluenceParser.new 208 | parser.feed(html) 209 | expect(parser.to_wiki_markup.strip).to eq(markup) 210 | end 211 | 212 | end 213 | 214 | 215 | 216 | -------------------------------------------------------------------------------- /spec/complex_tables_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | $LOAD_PATH << File.join(File.dirname(__FILE__), '..', 'lib') 3 | require 'html2confluence' 4 | 5 | describe HTMLToConfluenceParser, "when running complex tables examples" do 6 | 7 | it "should handle table with newlines" do 8 | html = <<-END 9 |
As a...I would like...Because...

Student
or

Teacher

There to be more candy

Candy is:

  • Delicious
  • Shiny
  • Good for my teeth
10 | END 11 | 12 | markup = <<-END 13 | |As a...|I would like...|Because...| 14 | |Student 15 | or 16 | \\\\ 17 | Teacher|There to be more candy|Candy is: 18 | \\\\ 19 | * Delicious 20 | * Shiny 21 | * Good for my teeth| 22 | END 23 | 24 | parser = HTMLToConfluenceParser.new 25 | parser.feed(html) 26 | expect(parser.to_wiki_markup.strip).to include(markup.strip) 27 | end 28 | 29 | it "should handle table empty cells" do 30 | html = <<-END 31 |


Empty


32 | END 33 | 34 | markup = <<-END 35 | | |Empty| | 36 | END 37 | 38 | parser = HTMLToConfluenceParser.new 39 | parser.feed(html) 40 | expect(parser.to_wiki_markup.strip).to include(markup.strip) 41 | end 42 | 43 | it "should handle pre in table empty cells" do 44 | html = <<-END 45 |
a
d
b
c
46 | END 47 | 48 | markup = <<-END 49 | |{noformat} 50 | a{noformat} |d | 51 | |{noformat} 52 | b{noformat} |c | 53 | END 54 | 55 | parser = HTMLToConfluenceParser.new 56 | parser.feed(html) 57 | expect(parser.to_wiki_markup.strip).to include(markup.strip) 58 | end 59 | 60 | it "should handle pre in table" do 61 | html = <<-END 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 |
A B C
1
2
3
74 | END 75 | 76 | markup = <<-END 77 | |A |{{B}} |C | 78 | |1 |{noformat} 79 | 2{noformat} |3 | 80 | END 81 | 82 | parser = HTMLToConfluenceParser.new 83 | parser.feed(html) 84 | expect(parser.to_wiki_markup.strip).to include(markup.strip) 85 | end 86 | 87 | end 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /spec/html2confluence_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | $LOAD_PATH << File.join(File.dirname(__FILE__), '..', 'lib') 3 | require 'html2confluence' 4 | 5 | describe HTMLToConfluenceParser, "when converting html to textile" do 6 | context "in a large html document" do 7 | before :all do 8 | html = <<-END 9 |
10 | 11 | Some text inside a div
with two lines 12 |

13 | Converting HTML to Textile with Ruby 14 | Converting HTML to Textile with Ruby 15 |

16 | 17 |
A note
Followed by another note
18 | 19 |

20 | 23 November 2007 21 | (7:51 pm) 22 |

23 | 24 |

25 | By James Stewart
filed under: 26 | Snippets 27 |
tagged: , 28 | , 29 | , 30 | , 31 | , 32 | 33 |

34 | 35 |

test paragraph without id or class attributes

36 | 37 |

test paragraph without closing tag

38 | 39 |

Break not closed
at all

40 | 41 |
  • test invalid list item 1
  • 42 |
  • test invalid list item 2
  • 43 | 44 |
      45 |
    1. test 1
    2. 46 |
    3. test 2
      with a line break in the middle
    4. 47 |
    5. test 3
    6. 48 |

    7. 49 |
    50 | 51 | x> y 52 | 53 |
    54 |

    paragraph inside a blockquote

    55 |

    another paragraph inside a blockquote

    56 |
    57 | 58 |

    59 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 60 | Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure 61 | dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 62 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 63 |

    64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 |
    table caption
    heading 1heading 2
    value 1value 2
    75 | 76 | Hughes & Hughes 77 | 78 | Something & something else and a useless span 79 | 80 | Some text before a table 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 |
    heading 1heading 2
    value 1value 2
    90 | 91 |

    92 | Please apply online at:
    www.something.co.uk/careers

    93 | 94 |

    test test emphasised bold text test 95 | An ordinal number - 1st 96 |

    97 | 98 | 102 | 103 |

     

    104 |
     
    105 | more bold text
    106 | 107 |

    Some text with underlining is here.

    108 | 109 |

    Æïœü

    110 | 111 | some_good_code 112 | 113 | © Copyright statement, let's see what happens to this… € 100 114 | 115 | An unknown named entity reference - &unknownref; 116 | 117 | strike 1 118 | strike 2 119 | 120 | # Not a list 121 | * Not a list 122 | - Not a list 123 | *Not bold* 124 | _Not a emph_ 125 | {Not curly} 126 | |Not table 127 |
    128 | END 129 | parser = HTMLToConfluenceParser.new 130 | parser.feed(html) 131 | @textile = parser.to_wiki_markup 132 | end 133 | 134 | it "should convert heading tags" do 135 | expect(@textile).to match(/^h1(\([^\)]+\))?\./) 136 | end 137 | 138 | it "should convert underline tags" do 139 | expect(@textile).to include("text with +underlining+ is here") 140 | end 141 | 142 | it "should not explicitly markup paragraphs unnecessarily" do 143 | expect(@textile).to_not include("p. test paragraph without id or class attributes") 144 | end 145 | 146 | it "should treat divs as block level elements, but ignore any attributes (effectively converting them to paragraphs)" do 147 | expect(@textile).to include("\n\nA note\n\nFollowed by another note\n\n") 148 | end 149 | 150 | it "should not convert pointless spans to textile (i.e. without supported attributes)" do 151 | expect(@textile).to_not include("%a useless span%") 152 | end 153 | 154 | it "should convert class and id attributes" do 155 | # We don't convert classes. 
expect(@textile).to include("h1(story.title entry-title#post-312).") 156 | end 157 | 158 | it "should convert tables" do 159 | expect(@textile).to include("\n\n||heading 1 ||heading 2 || \n|value 1 |value 2 | \n") 160 | end 161 | 162 | it "should convert tables with text immediately preceding the opening table tag" do 163 | expect(@textile).to include("Some text before a table\n\n||heading 1 ||heading 2 || \n|value 1 |value 2 | \n") 164 | end 165 | 166 | it "should respect line breaks within block level elements" do 167 | expect(@textile).to include("\n# test 1 \n# test 2\nwith a line break in the middle") 168 | end 169 | 170 | it "should handle paragraphs nested within blockquote" do 171 | expect(@textile).to include("{quote}\n\nparagraph inside a blockquote\n\nanother paragraph inside a blockquote\n\n{quote}") 172 | end 173 | 174 | it "should retain leading and trailing whitespace within inline elements" do 175 | expect(@textile).to include("test *invalid* list item 1") 176 | end 177 | 178 | it "should respect trailing line break tags within other elements" do 179 | expect(@textile).to include("*Please apply online at:*\n[www.something.co.uk/careers|http://www.something.co.uk/careers]") 180 | end 181 | 182 | it "should handle nested inline elements" do 183 | expect(@textile).to include(" *_test emphasised bold text_* test") 184 | end 185 | 186 | it "should remove empty quicktags before returning" do 187 | expect(@textile).to_not include("*more bold text* *\n*") 188 | end 189 | 190 | it "should remove unsupported elements (e.g. script)" do 191 | expect(@textile).to_not include('script') 192 | end 193 | 194 | it "should remove unsupported attributes (i.e. 
everything but class and id)" do 195 | expect(@textile).to_not include('summary') 196 | expect(@textile).to_not include('a table with a caption') 197 | expect(@textile).to_not include('style') 198 | expect(@textile).to_not include('color:red;') 199 | end 200 | 201 | it "should clean up multiple blank lines created by tolerant parsing before returning" do 202 | expect(@textile).to_not match(/(\n\n\s*){2,}/) 203 | end 204 | 205 | it "should keep entity references" do 206 | expect(@textile).to include("©") 207 | end 208 | 209 | it "should output unknown named entity references" do 210 | expect(@textile).to include("&unknownref;") 211 | end 212 | 213 | it "should convert numerical entity references to a utf-8 character" do 214 | expect(@textile).to include("…") 215 | end 216 | 217 | it "should ignore entities that are already converted" do 218 | expect(@textile).to include("Æïœü") 219 | end 220 | 221 | it "should ignore ampersands that are not part of an entity reference" do 222 | expect(@textile).to include("Hughes & Hughes") 223 | end 224 | 225 | it "should retain whitespace surrounding entity references" do 226 | expect(@textile).to include("… € 100") 227 | expect(@textile).to include("Something & something") 228 | end 229 | 230 | it "should escape special characters" do 231 | # This test currently fails. We would like it to pass, but only by escaping 232 | # characters that would otherwise be mistaken for markup. It should not 233 | # escape every instance of these characters. 
234 | pending 'only escape correct characters' 235 | expect(@textile).to include("\\# Not a list") 236 | expect(@textile).to include("\\* Not a list") 237 | expect(@textile).to include("\\- Not a list") 238 | expect(@textile).to include("\\*Not bold\\*") 239 | expect(@textile).to include("\\_Not a emph\\_") 240 | expect(@textile).to include("\\{Not curly\\}") 241 | expect(@textile).to include("\\|Not table") 242 | end 243 | 244 | it "should support strikethrough" do 245 | expect(@textile).to include("-strike 1-") 246 | expect(@textile).to include("-strike 2-") 247 | end 248 | 249 | it "should transform code" do 250 | expect(@textile).to include("{code}some_good_code{code}") 251 | end 252 | end 253 | 254 | it "should convert ending blockquotes" do 255 | html = <<-END 256 |
    257 |
    258 |       
    259 |
    260 |
    261 |
    262 |           
    263 |
    264 |
    265 |
    266 | END 267 | 268 | parser = HTMLToConfluenceParser.new 269 | parser.feed(html) 270 | @textile = parser.to_wiki_markup 271 | 272 | expect(@textile).to eq("{quote}\n{noformat}\n\n{noformat} \n\n{noformat}\n\n{noformat} \n\n{quote}") 273 | end 274 | 275 | it "should convert ending blockquotes without a leading pre" do 276 | html = <<-END 277 |
    278 |
    279 |
    280 |
    281 |           
    282 |
    283 |
    284 |
    285 | END 286 | 287 | parser = HTMLToConfluenceParser.new 288 | parser.feed(html) 289 | @textile = parser.to_wiki_markup 290 | 291 | expect(@textile).to eq("{quote}\n\n{noformat}\n\n{noformat} \n\n{quote}") 292 | end 293 | 294 | it "should convert ending blockquotes without a nested pre" do 295 | html = <<-END 296 |
    297 |
    298 |       
    299 |
    300 |
    301 |
    302 |
    303 |
    304 | END 305 | 306 | parser = HTMLToConfluenceParser.new 307 | parser.feed(html) 308 | @textile = parser.to_wiki_markup 309 | 310 | expect(@textile).to eq("{quote}\n{noformat}\n\n{noformat} \n\n{quote}") 311 | end 312 | end 313 | -------------------------------------------------------------------------------- /spec/jira_examples_spec.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | $LOAD_PATH << File.join(File.dirname(__FILE__), '..', 'lib') 3 | require 'html2confluence' 4 | 5 | describe HTMLToConfluenceParser, "when running JIRA examples" do 6 | 7 | before :all do 8 | html = <<-END 9 |

    Biggest heading

    10 |

    Bigger heading

    11 |

    Big heading

    12 |

    Normal heading

    13 |
    Small heading
    14 |
    Smallest heading
    15 | 16 |

    strong
    17 | emphasis
    18 | citation
    19 | deleted
    20 | inserted
    21 | superscript
    22 | subscript
    23 | monospaced

    24 |
    Some block quoted text
    25 | 26 |
    27 |

    here is quotable
    28 | content to be quoted

    29 | 30 |


    31 | look ma, red text!

    32 | 33 |

    a
    b

    34 | 35 |

    a
    36 | b

    37 | 38 |
    39 | 40 |

    a – b
    41 | a — b

    42 | 43 |

    anchor

    44 | 45 |

    http://jira.atlassian.com
    46 | Atlassian

    47 | 48 |

    file:///c:/temp/foo.txt

    49 | 50 |

    51 | 52 |
      53 |
    • some
    • 54 |
    • bullet 55 |
        56 |
      • indented
      • 57 |
      • bullets
      • 58 |
      59 |
    • 60 |
    • points
    • 61 |
    62 | 63 | 64 |
      65 |
    • different
    • 66 |
    • bullet
    • 67 |
    • types
    • 68 |
    69 | 70 | 71 |
      72 |
    1. a
    2. 73 |
    3. numbered
    4. 74 |
    5. list
    6. 75 |
    76 | 77 | 78 |
      79 |
    1. a
    2. 80 |
    3. numbered 81 |
        82 |
      • with
      • 83 |
      • nested
      • 84 |
      • bullet
      • 85 |
      86 |
    4. 87 |
    5. list
    6. 88 |
    89 | 90 | 91 |
      92 |
    • a
    • 93 |
    • bulleted 94 |
        95 |
      1. with
      2. 96 |
      3. nested
      4. 97 |
      5. numbered
      6. 98 |
      99 |
    • 100 |
    • list
    • 101 |
    102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 |
    heading 1heading 2heading 3
    col A1col A2col A3
    col B1col B2col B3
    121 | 122 | 123 | 124 | 125 | 126 | 127 |
    128 |
    preformatted piece of text
    129 |  so *no* further _formatting_ is done here
    130 | 
    131 |
    132 | END 133 | 134 | markup = <<-END 135 | h1. Biggest heading 136 | h2. Bigger heading 137 | h3. Big heading 138 | h4. Normal heading 139 | h5. Small heading 140 | h6. Smallest heading 141 | 142 | *strong* 143 | _emphasis_ 144 | ??citation?? 145 | -deleted- 146 | +inserted+ 147 | ^superscript^ 148 | ~subscript~ 149 | {{monospaced}} 150 | bq. Some block quoted text 151 | 152 | {quote} 153 | here is quotable 154 | content to be quoted 155 | {quote} 156 | 157 | {color:red} 158 | look ma, red text! 159 | {color} 160 | 161 | a\\b 162 | 163 | a 164 | b 165 | 166 | ---- 167 | 168 | a -- b 169 | a --- b 170 | 171 | [#anchor] 172 | 173 | [http://jira.atlassian.com] 174 | [Atlassian|http://atlassian.com] 175 | 176 | [file:///c:/temp/foo.txt] 177 | 178 | {anchor:anchorname} 179 | 180 | * some 181 | * bullet 182 | ** indented 183 | ** bullets 184 | * points 185 | 186 | - different 187 | - bullet 188 | - types 189 | 190 | # a 191 | # numbered 192 | # list 193 | 194 | # a 195 | # numbered 196 | #* with 197 | #* nested 198 | #* bullet 199 | # list 200 | 201 | * a 202 | * bulleted 203 | *# with 204 | *# nested 205 | *# numbered 206 | * list 207 | 208 | ||heading 1||heading 2||heading 3|| 209 | |col A1|col A2|col A3| 210 | |col B1|col B2|col B3| 211 | 212 | {noformat} 213 | preformatted piece of text 214 | so *no* further _formatting_ is done here 215 | {noformat} 216 | END 217 | 218 | 219 | parser = HTMLToConfluenceParser.new 220 | parser.feed(html) 221 | @textile = parser.to_wiki_markup 222 | #puts @textile 223 | #puts RedCloth.new(@textile).to_html 224 | end 225 | 226 | it "should convert images within a link" do 227 | imagetarget = "https://example.com/image.jpg" 228 | link = "https://example.com/index.html" 229 | test_html = %{ 230 |

    231 | 232 | 235 | 236 |

    237 | } 238 | 239 | parser = HTMLToConfluenceParser.new 240 | parser.feed test_html 241 | @textile = parser.to_wiki_markup 242 | 243 | expect(@textile).to eq("[!#{imagetarget}!|#{link}]") 244 | end 245 | 246 | it "should convert heading tags" do 247 | expect(@textile).to match(/^h1. Biggest heading/) 248 | expect(@textile).to match(/^h2. Bigger heading/) 249 | expect(@textile).to match(/^h3. Big heading/) 250 | expect(@textile).to match(/^h4. Normal heading/) 251 | expect(@textile).to match(/^h5. Small heading/) 252 | expect(@textile).to match(/^h6. Smallest heading/) 253 | end 254 | 255 | it "should convert inline formatting" do 256 | expect(@textile).to match(/^\*strong\*/) 257 | expect(@textile).to match(/^_emphasis_/) 258 | expect(@textile).to match(/^\?\?citation\?\?/) 259 | expect(@textile).to match(/^-deleted-/) 260 | expect(@textile).to match(/^\+inserted\+/) 261 | expect(@textile).to match(/^\^superscript\^/) 262 | expect(@textile).to match(/^\~subscript\~/) 263 | expect(@textile).to match(/^\{\{monospaced\}\}/) 264 | end 265 | 266 | it "should convert block quotes" do 267 | expect(@textile).to match(/^bq. 
Some block quoted text/) 268 | expect(@textile).to match(/^\{quote\}\s*here is quotable\s*content to be quoted\s*{quote}/) 269 | end 270 | 271 | it "should handle text color" do 272 | expect(@textile).to match(/^\{color\:red\}\s*look ma, red text!\s*\{color\}/) 273 | end 274 | 275 | it "should convert horizontal rules" do 276 | expect(@textile).to match(/^----/) 277 | end 278 | 279 | it "should convert dashes" do 280 | expect(@textile).to match(/^a -- b/) 281 | expect(@textile).to match(/^a --- b/) 282 | end 283 | 284 | it "should convert links" do 285 | expect(@textile).to match(/^\[\#anchor\]/) 286 | expect(@textile).to match(/^\[http\:\/\/jira.atlassian.com\]/) 287 | expect(@textile).to match(/^\[Atlassian\|http\:\/\/atlassian.com\]/) 288 | expect(@textile).to match(/^\[file\:\/\/\/c\:\/temp\/foo.txt\]/) 289 | end 290 | 291 | it "should convert bullets" do 292 | expect(@textile).to match(/\* some\s*\* bullet\s*\*\* indented\s*\*\* bullets\s*\* points/) 293 | expect(@textile).to match(/- different\s*- bullet\s*- types/) 294 | expect(@textile).to match(/# a\s*# numbered\s*# list/) 295 | expect(@textile).to match(/# a\s*# numbered\s*#\* with\s*#\* nested\s*#\* bullet\s*# list/) 296 | expect(@textile).to match(/\* a\s*\* bulleted\s*\*# with\s*\*# nested\s*\*# numbered\s*\* list/) 297 | end 298 | 299 | it "should convert pre blocks" do 300 | expect(@textile).to match(/^\{noformat\}\s*preformatted piece of text\s*so \*no\* further _formatting_ is done here\s*\{noformat\}/) 301 | end 302 | 303 | it "should convert tables" do 304 | expect(@textile).to include("||heading 1 ||heading 2 ||heading 3 ||") 305 | expect(@textile).to include("|col A1 |col A2 |col A3 |") 306 | expect(@textile).to include("|col B1 |col B2 |col B3 |") 307 | end 308 | 309 | it "should convert emoji from jira" do 310 | expect(@textile).to include(":)") 311 | expect(@textile).to include("(!)") 312 | expect(@textile).to include("(off)") 313 | expect(@textile).to include("(/)") 314 | end 315 | 316 | 
end 317 | --------------------------------------------------------------------------------