├── lib
└── html
│ ├── pipeline
│ ├── version.rb
│ ├── plain_text_input_filter.rb
│ ├── https_filter.rb
│ ├── text_filter.rb
│ ├── autolink_filter.rb
│ ├── syntax_highlight_filter.rb
│ ├── textile_filter.rb
│ ├── markdown_filter.rb
│ ├── toc_filter.rb
│ ├── image_max_width_filter.rb
│ ├── body_content.rb
│ ├── emoji_filter.rb
│ ├── camo_filter.rb
│ ├── email_reply_filter.rb
│ ├── @mention_filter.rb
│ ├── sanitization_filter.rb
│ └── filter.rb
│ └── pipeline.rb
├── Gemfile
├── .travis.yml
├── .gitignore
├── Rakefile
├── CHANGELOG.md
├── test
├── html
│ └── pipeline
│ │ ├── emoji_filter_test.rb
│ │ ├── plain_text_input_filter_test.rb
│ │ ├── autolink_filter_test.rb
│ │ ├── camo_filter_test.rb
│ │ ├── toc_filter_test.rb
│ │ ├── sanitization_filter_test.rb
│ │ ├── image_max_width_filter_test.rb
│ │ ├── markdown_filter_test.rb
│ │ └── mention_filter_test.rb
└── test_helper.rb
├── LICENSE
├── html-pipeline.gemspec
└── README.md
/lib/html/pipeline/version.rb:
--------------------------------------------------------------------------------
module HTML
  class Pipeline
    # Release number of the gem; html-pipeline.gemspec reads this constant.
    VERSION = '0.0.6'
  end
end
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
source 'https://rubygems.org'

# Runtime dependencies are declared in html-pipeline.gemspec.
gemspec

# Tooling that is only needed while working on the gem itself.
gem 'bundler', :group => :development
gem 'rake',    :group => :development
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 |
3 | before_install:
4 | - sudo apt-get update -qq
5 | - sudo apt-get install -qq libicu-dev
6 |
7 | script: "bundle exec rake"
8 |
9 | rvm:
10 | - 1.8.7
11 | - 1.9.2
12 | - 1.9.3
13 | - ree
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | *.rbc
3 | .bundle
4 | .config
5 | .yardoc
6 | Gemfile.lock
7 | InstalledFiles
8 | _yardoc
9 | coverage
10 | doc/
11 | lib/bundler/man
12 | pkg
13 | rdoc
14 | spec/reports
15 | test/tmp
16 | test/version_tmp
17 | tmp
18 | bin/*
19 | vendor/gems
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env rake
2 | require "bundler/gem_tasks"
3 | require 'rake/testtask'
4 |
5 | Rake::TestTask.new do |t|
6 | t.libs << "test"
7 | t.test_files = FileList['test/**/*_test.rb']
8 | t.verbose = true
9 | end
10 |
11 | task :default => :test
--------------------------------------------------------------------------------
/lib/html/pipeline/plain_text_input_filter.rb:
--------------------------------------------------------------------------------
module HTML
  class Pipeline
    # Simple filter for plain text input. HTML escapes the text input and wraps it
    # in a div.
    class PlainTextInputFilter < TextFilter
      # Escapes @text with EscapeUtils.escape_html and wraps the result in a
      # <div> element.
      #
      # Returns the resulting HTML as a String.
      def call
        "<div>#{EscapeUtils.escape_html(@text, false)}</div>"
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/html/pipeline/https_filter.rb:
--------------------------------------------------------------------------------
module HTML
  class Pipeline
    # HTML Filter that switches http links pointing at github.com over to https.
    class HttpsFilter < Filter
      # Rewrites the href of every anchor whose href starts with
      # "http://github.com" so that it uses the https scheme.
      #
      # Returns the document.
      def call
        insecure_links = doc.css('a[href^="http://github.com"]')
        insecure_links.each do |anchor|
          anchor['href'] = anchor['href'].sub(/^http:/,'https:')
        end
        doc
      end
    end
  end
end
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # CHANGELOG
2 |
3 | ## 0.0.6
4 |
5 | * don't mutate markdown strings: jakedouglas #32
6 |
7 | ## 0.0.5
8 |
9 | * fix li xss vulnerability in sanitization filter: vmg #31
10 | * gemspec cleanup: nbibler #23, jbarnette #24
11 | * doc updates: jch #16, pborreli #17, wickedshimmy #18, benubois #19, blackerby #21
12 | * loosen gemoji dependency: josh #15
13 |
14 | ## 0.0.4
15 |
16 | * initial public release
--------------------------------------------------------------------------------
/lib/html/pipeline/text_filter.rb:
--------------------------------------------------------------------------------
module HTML
  class Pipeline
    # Base class for filters whose input is text rather than parsed HTML.
    class TextFilter < Filter
      attr_reader :text

      # text    - the input; anything other than a DocumentFragment is accepted
      #           and coerced to a String.
      # context - passed through to Filter#initialize.
      # result  - passed through to Filter#initialize.
      #
      # Raises TypeError when text is a DocumentFragment.
      def initialize(text, context = nil, result = nil)
        if text.is_a?(DocumentFragment)
          raise TypeError, "text cannot be HTML"
        end

        # Prefer the implicit conversion (to_str) and fall back to to_s so
        # that @text is always a String.
        @text =
          if text.respond_to?(:to_str)
            text.to_str
          else
            text.to_s
          end

        # The parent class gets nil in place of the input.
        super(nil, context, result)
      end
    end
  end
end
--------------------------------------------------------------------------------
/test/html/pipeline/emoji_filter_test.rb:
--------------------------------------------------------------------------------
1 | require 'test_helper'
2 |
3 | class HTML::Pipeline::EmojiFilterTest < Test::Unit::TestCase
4 | EmojiFilter = HTML::Pipeline::EmojiFilter
5 |
6 | def test_emojify
7 | filter = EmojiFilter.new(":shipit:
", {:asset_root => 'https://foo.com'})
8 | doc = filter.call
9 | assert_match "https://foo.com/emoji/shipit.png", doc.search('img').attr('src').value
10 | end
11 |
12 | def test_required_context_validation
13 | exception = assert_raise(ArgumentError) {
14 | EmojiFilter.call("", {})
15 | }
16 | assert_match /:asset_root/, exception.message
17 | end
18 | end
--------------------------------------------------------------------------------
/lib/html/pipeline/autolink_filter.rb:
--------------------------------------------------------------------------------
require 'rinku'

module HTML
  class Pipeline
    # HTML Filter that turns bare URLs in the markup into links using Rinku.
    #
    # Context options:
    #   :autolink - set to false to return the input unchanged
    #   :flags    - extra Rinku flags, OR-ed into the flags passed to Rinku.
    #               See https://github.com/vmg/rinku
    #
    # This filter does not write additional information to the context.
    class AutolinkFilter < Filter
      # Returns the input html untouched when autolinking is disabled,
      # otherwise whatever Rinku.auto_link returns for it.
      def call
        return html if context[:autolink] == false

        extra_flags = context[:flags]
        flags = extra_flags ? (0 | extra_flags) : 0

        # Element names handed to Rinku as the skip list.
        skipped_tags = %w[a script kbd pre code]
        Rinku.auto_link(html, :urls, nil, skipped_tags, flags)
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/html/pipeline/syntax_highlight_filter.rb:
--------------------------------------------------------------------------------
require 'linguist'
require 'timeout'

module HTML
  class Pipeline
    # HTML Filter that syntax highlights code blocks wrapped
    # in <pre lang="...">.
    class SyntaxHighlightFilter < Filter
      # Replaces every <pre> node whose `lang` attribute names a known
      # Pygments lexer with that lexer's highlighted output. Nodes without a
      # `lang` attribute, without a matching lexer, or whose highlighting
      # returned nil are left as they were.
      #
      # Returns the document.
      def call
        doc.search('pre').each do |node|
          next unless lang = node['lang']
          next unless lexer = Pygments::Lexer[lang]
          text = node.inner_text

          html = highlight_with_timeout_handling(lexer, text)
          next if html.nil?

          node.replace(html)
        end
        doc
      end

      # Runs the lexer over the given text.
      #
      # lexer - the lexer object looked up in #call.
      # text  - the String to highlight.
      #
      # Returns the lexer's output, or nil when Timeout::Error is raised.
      def highlight_with_timeout_handling(lexer, text)
        lexer.highlight(text)
      rescue Timeout::Error
        nil
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/html/pipeline/textile_filter.rb:
--------------------------------------------------------------------------------
module HTML
  class Pipeline
    # Filter that renders Textile text to HTML with RedCloth. Unlike most
    # filters it takes non-HTML input, so it must be the first filter in a
    # pipeline.
    #
    # Context options:
    #   :autolink => false    Disable autolinking URLs
    #
    # This filter does not write any additional information to the context hash.
    #
    # NOTE This filter is provided for really old comments only. It probably
    # shouldn't be used for anything new.
    class TextileFilter < TextFilter
      # Hands @text to RedCloth and returns the result of its to_html.
      def call
        textile_source = RedCloth.new(@text)
        textile_source.to_html
      end
    end
  end
end
--------------------------------------------------------------------------------
/test/html/pipeline/plain_text_input_filter_test.rb:
--------------------------------------------------------------------------------
1 | require "test_helper"
2 |
3 | class HTML::Pipeline::PlainTextInputFilterTest < Test::Unit::TestCase
4 | PlainTextInputFilter = HTML::Pipeline::PlainTextInputFilter
5 |
6 | def test_fails_when_given_a_documentfragment
7 | body = "heyo
"
8 | doc = Nokogiri::HTML::DocumentFragment.parse(body)
9 | assert_raise(TypeError) { PlainTextInputFilter.call(doc, {}) }
10 | end
11 |
12 | def test_wraps_input_in_a_div_element
13 | doc = PlainTextInputFilter.call("howdy pahtner", {})
14 | assert_equal "howdy pahtner
", doc.to_s
15 | end
16 |
17 | def test_html_escapes_plain_text_input
18 | doc = PlainTextInputFilter.call("See: ", {})
19 | assert_equal "See: <http://example.org>
",
20 | doc.to_s
21 | end
22 | end
23 |
--------------------------------------------------------------------------------
/test/html/pipeline/autolink_filter_test.rb:
--------------------------------------------------------------------------------
1 | require "test_helper"
2 |
3 | AutolinkFilter = HTML::Pipeline::AutolinkFilter
4 |
5 | class HTML::Pipeline::AutolinkFilterTest < Test::Unit::TestCase
6 | def test_uses_rinku_for_autolinking
7 | # just try to parse a complicated piece of HTML
8 | # that Rails auto_link cannot handle
9 | assert_equal '"http://www.github.com"
',
10 | AutolinkFilter.to_html('"http://www.github.com"
')
11 | end
12 |
13 | def test_autolink_option
14 | assert_equal '"http://www.github.com"
',
15 | AutolinkFilter.to_html('"http://www.github.com"
', :autolink => false)
16 | end
17 |
18 | def test_autolink_flags
19 | assert_equal '"http://github"
',
20 | AutolinkFilter.to_html('"http://github"
', :flags => Rinku::AUTOLINK_SHORT_DOMAINS)
21 | end
22 | end
23 |
--------------------------------------------------------------------------------
/lib/html/pipeline/markdown_filter.rb:
--------------------------------------------------------------------------------
require 'github/markdown'

module HTML
  class Pipeline
    # Filter that renders Markdown text to HTML with GitHub::Markdown. Unlike
    # most filters it takes non-HTML input, so it must be the first filter in
    # a pipeline.
    #
    # Context options:
    #   :gfm => false    Disable GFM line-end processing
    #
    # This filter does not write any additional information to the context hash.
    class MarkdownFilter < TextFilter
      # Same arguments as TextFilter#initialize. Carriage returns are dropped
      # from a copy of the text, so the caller's string is not modified.
      def initialize(text, context = nil, result = nil)
        super(text, context, result)
        @text = @text.delete("\r")
      end

      # Renders @text in :gfm mode unless the :gfm context option is exactly
      # false, in which case :markdown mode is used.
      #
      # Returns the rendered HTML with trailing whitespace removed.
      def call
        mode = context[:gfm] == false ? :markdown : :gfm
        GitHub::Markdown.to_html(@text, mode).tap do |rendered|
          rendered.rstrip!
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012 GitHub Inc. and Jerry Cheung
2 |
3 | MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/html-pipeline.gemspec:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
require File.expand_path("../lib/html/pipeline/version", __FILE__)

Gem::Specification.new do |spec|
  # Identity and metadata.
  spec.name        = "html-pipeline"
  spec.version     = HTML::Pipeline::VERSION
  spec.license     = "MIT"
  spec.authors     = ["Ryan Tomayko", "Jerry Cheung"]
  spec.email       = ["ryan@github.com", "jerry@github.com"]
  spec.description = %q{GitHub HTML processing filters and utilities}
  spec.summary     = %q{Helpers for processing content through a chain of filters}
  spec.homepage    = "https://github.com/jch/html-pipeline"

  # Packaged files: everything tracked by git; tests are the paths under test/.
  spec.files         = `git ls-files`.split $/
  spec.test_files    = spec.files.grep(%r{^test})
  spec.require_paths = ["lib"]

  # Runtime dependencies.
  spec.add_dependency "gemoji",          "~> 1.0"
  spec.add_dependency "nokogiri",        "~> 1.4"
  spec.add_dependency "github-markdown", "~> 0.5"
  spec.add_dependency "sanitize",        "~> 2.0"
  spec.add_dependency "github-linguist", "~> 2.1"
  spec.add_dependency "rinku",           "~> 1.7"
  spec.add_dependency "escape_utils",    "~> 0.2"
  spec.add_dependency "activesupport",   ">= 2"
end
--------------------------------------------------------------------------------
/lib/html/pipeline/toc_filter.rb:
--------------------------------------------------------------------------------
module HTML
  class Pipeline
    # HTML filter that adds a 'name' attribute to all headers
    # in a document, so they can be accessed from a table of contents
    #
    # TODO: besides adding the name attribute, we should get around to
    # eventually generating the Table of Contents itself, with links
    # to each header
    class TableOfContentsFilter < Filter
      # Inserts a named anchor at the start of every h1-h6 element that has
      # at least one child node. The anchor name is derived from the header
      # text; a repeated name gets a "-N" suffix so anchors stay unique
      # within the document.
      #
      # Returns the document.
      def call
        # Number of times each generated name has been seen so far.
        headers = Hash.new(0)
        doc.css('h1, h2, h3, h4, h5, h6').each do |node|
          name = node.text.downcase
          name.gsub!(/[^\w\- ]/, '') # remove punctuation
          name.gsub!(' ', '-') # replace spaces with dash
          name = EscapeUtils.escape_uri(name) # escape extended UTF-8 chars

          # First occurrence keeps the bare name; later ones get -1, -2, ...
          uniq = (headers[name] > 0) ? "-#{headers[name]}" : ''
          headers[name] += 1
          if header_content = node.children.first
            # NOTE(review): anchor markup reconstructed (the listing carried an
            # empty string here); confirm attributes against upstream.
            header_content.add_previous_sibling(%Q{<a name="#{name}#{uniq}" class="anchor" href="##{name}#{uniq}"></a>})
          end
        end
        doc
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/html/pipeline/image_max_width_filter.rb:
--------------------------------------------------------------------------------
module HTML
  class Pipeline
    # This filter gives image tags a max-width inline style and wraps each
    # image in an <a> tag that opens the full size image in a new tab.
    #
    # The max-width inline styles are especially useful in HTML email, which
    # doesn't use global stylesheets.
    class ImageMaxWidthFilter < Filter
      # Processes every <img> in the document and returns the document.
      def call
        doc.search('img').each do |image|
          # Images that already carry a style attribute are left alone.
          next if image['style']

          # Images whose src starts with "javascript" are left alone too, to
          # avoid js injection via javascript: urls.
          next if image['src'].to_s.strip =~ /\Ajavascript/i

          image['style'] = "max-width:100%;"

          # Only add a link when the image is not already inside one.
          link_image(image) unless has_ancestor?(image, %w(a))
        end

        doc
      end

      # Replaces the element with an <a target="_blank"> that points at the
      # element's src and contains a copy of the element.
      def link_image(element)
        anchor = doc.document.create_element('a', :href => element['src'], :target => '_blank')
        anchor.add_child(element.dup)
        element.replace(anchor)
      end
    end
  end
end
--------------------------------------------------------------------------------
/test/test_helper.rb:
--------------------------------------------------------------------------------
require 'bundler/setup'
require 'html/pipeline'
require 'test/unit'

require 'active_support/core_ext/object/try'

# Extra assertions mixed into every Test::Unit::TestCase.
module TestHelpers
  # Asserts that `needle` is not a member of `haystack`, where
  # `haystack` is any object that responds to `include?`.
  #
  # needle   - the object that must be absent.
  # haystack - the collection (or String) to search.
  # message  - optional text passed to build_message as the message head.
  def assert_doesnt_include(needle, haystack, message = nil)
    # Each `?` in the template is filled in by build_message, in order.
    error = '<?> included in <?>'
    message = build_message(message, error, needle.to_s, Array(haystack).map(&:to_s))

    assert_block message do
      !haystack.include?(needle)
    end
  end

  # Asserts that `needle` is a member of `haystack`, where
  # `haystack` is any object that responds to `include?`.
  #
  # needle   - the object that must be present.
  # haystack - the collection (or String) to search.
  # message  - optional text passed to build_message as the message head.
  def assert_includes(needle, haystack, message = nil)
    # Each `?` in the template is filled in by build_message, in order.
    error = '<?> not included in <?>'
    message = build_message(message, error, needle.to_s, Array(haystack).map(&:to_s))

    assert_block message do
      haystack.include?(needle)
    end
  end

  # Asserts that two html fragments are equivalent. Attribute order
  # will be ignored.
  #
  # expected - String of expected markup.
  # actual   - String of markup produced by the code under test.
  def assert_equal_html(expected, actual)
    assert_equal Nokogiri::HTML::DocumentFragment.parse(expected).to_hash,
                 Nokogiri::HTML::DocumentFragment.parse(actual).to_hash
  end
end

Test::Unit::TestCase.send(:include, TestHelpers)
--------------------------------------------------------------------------------
/lib/html/pipeline/body_content.rb:
--------------------------------------------------------------------------------
module HTML
  class Pipeline
    # Public: Runs a String of content through an HTML processing pipeline,
    # providing easy access to a generated DocumentFragment.
    #
    # Nothing is run at construction time; the pipeline is called the first
    # time #result (or #output / #document, which depend on it) is read.
    class BodyContent
      # Public: Initialize a BodyContent.
      #
      # body     - A String body.
      # context  - A Hash of context options for the filters.
      # pipeline - A HTML::Pipeline object with one or more Filters.
      def initialize(body, context, pipeline)
        @body = body
        @context = context
        @pipeline = pipeline
      end

      # Public: Gets the memoized result of the body content as it passed through
      # the Pipeline.
      #
      # Returns a Hash, or something similar as defined by @pipeline.result_class.
      def result
        @result ||= @pipeline.call @body, @context
      end

      # Public: Gets the updated body from the Pipeline result.
      #
      # Returns a String or DocumentFragment.
      def output
        @output ||= result[:output]
      end

      # Public: Parses the output into a DocumentFragment.
      #
      # Returns a DocumentFragment.
      def document
        @document ||= HTML::Pipeline.parse output
      end
    end
  end
end
--------------------------------------------------------------------------------
/test/html/pipeline/camo_filter_test.rb:
--------------------------------------------------------------------------------
1 | require "test_helper"
2 |
3 | class HTML::Pipeline::CamoFilterTest < Test::Unit::TestCase
4 | CamoFilter = HTML::Pipeline::CamoFilter
5 |
6 | def setup
7 | @asset_proxy_url = 'https//assets.example.org'
8 | @asset_proxy_secret_key = 'ssssh-secret'
9 | @options = {
10 | :asset_proxy => @asset_proxy_url,
11 | :asset_proxy_secret_key => @asset_proxy_secret_key
12 | }
13 | end
14 |
15 | def test_camouflaging_http_image_urls
16 | orig = %(
)
17 | assert_includes 'img src="' + @asset_proxy_url,
18 | CamoFilter.call(orig, @options).to_s
19 | end
20 |
21 | def test_rewrites_dotcom_image_urls
22 | orig = %(
)
23 | assert_equal "
",
24 | CamoFilter.call(orig, @options).to_s
25 | end
26 |
27 | def test_not_camouflaging_https_image_urls
28 | orig = %(
)
29 | assert_doesnt_include 'img src="' + @asset_proxy_url,
30 | CamoFilter.call(orig, @options).to_s
31 | end
32 |
33 | def test_handling_images_with_no_src_attribute
34 | orig = %(![]()
)
35 | assert_nothing_raised do
36 | CamoFilter.call(orig, @options).to_s
37 | end
38 | end
39 |
40 | def test_required_context_validation
41 | exception = assert_raise(ArgumentError) {
42 | CamoFilter.call("", {})
43 | }
44 | assert_match /:asset_proxy[^_]/, exception.message
45 | assert_match /:asset_proxy_secret_key/, exception.message
46 | end
47 | end
48 |
--------------------------------------------------------------------------------
/test/html/pipeline/toc_filter_test.rb:
--------------------------------------------------------------------------------
1 | require "test_helper"
2 |
3 | class HTML::Pipeline::TableOfContentsFilterTest < Test::Unit::TestCase
4 | TocFilter = HTML::Pipeline::TableOfContentsFilter
5 |
6 | def test_anchors_are_added_properly
7 | orig = %(Ice cube
Will swarm on any motherfucker in a blue uniform
)
8 | assert_includes 'Straight Outta Compton
23 | Dopeman
24 | Express Yourself
25 | Dopeman
)
26 |
27 | result = TocFilter.call(orig).to_s
28 |
29 | assert_includes '"dopeman"', result
30 | assert_includes '"dopeman-1"', result
31 | end
32 |
33 | def test_all_header_tags_are_found_when_adding_anchors
34 | orig = %("Funky President" by James Brown
35 | "It's My Thing" by Marva Whitney
36 | "Boogie Back" by Roy Ayers
37 | "Feel Good" by Fancy
38 | "Funky Drummer" by James Brown
39 | "Ruthless Villain" by Eazy-E
40 | "Be Thankful for What You Got" by William DeVaughn)
41 |
42 | doc = TocFilter.call(orig)
43 | assert_equal 6, doc.search('a').size
44 | end
45 | end
46 |
47 |
48 |
--------------------------------------------------------------------------------
/test/html/pipeline/sanitization_filter_test.rb:
--------------------------------------------------------------------------------
1 | require "test_helper"
2 |
3 | class HTML::Pipeline::SanitizationFilterTest < Test::Unit::TestCase
4 | SanitizationFilter = HTML::Pipeline::SanitizationFilter
5 |
6 | def test_removing_script_tags
7 | orig = %(
)
8 | html = SanitizationFilter.call(orig).to_s
9 | assert_no_match /script/, html
10 | end
11 |
12 | def test_removing_style_tags
13 | orig = %()
14 | html = SanitizationFilter.call(orig).to_s
15 | assert_no_match /style/, html
16 | end
17 |
18 | def test_removing_style_attributes
19 | orig = %(YO DAWG
)
20 | html = SanitizationFilter.call(orig).to_s
21 | assert_no_match /font-size/, html
22 | assert_no_match /style/, html
23 | end
24 |
25 | def test_removing_script_event_handler_attributes
26 | orig = %(YO DAWG)
27 | html = SanitizationFilter.call(orig).to_s
28 | assert_no_match /javscript/, html
29 | assert_no_match /onclick/, html
30 | end
31 |
32 | def test_sanitizes_li_elements_not_contained_in_ul_or_ol
33 | stuff = "a\nb\nc"
34 | html = SanitizationFilter.call(stuff).to_s
35 | assert_equal "a\nb\nc", html
36 | end
37 |
38 | def test_does_not_sanitize_li_elements_contained_in_ul_or_ol
39 | stuff = "a\n\nc"
40 | assert_equal stuff, SanitizationFilter.call(stuff).to_s
41 | end
42 |
43 | def test_github_specific_protocols_are_not_removed
44 | stuff = 'Spill this yo and so on'
45 | assert_equal stuff, SanitizationFilter.call(stuff).to_s
46 | end
47 | end
48 |
--------------------------------------------------------------------------------
/lib/html/pipeline/emoji_filter.rb:
--------------------------------------------------------------------------------
require 'emoji'

module HTML
  class Pipeline
    # HTML filter that replaces :emoji: with images.
    #
    # Context:
    #   :asset_root (required) - base url to link to emoji sprite
    class EmojiFilter < Filter
      # Build a regexp that matches all valid :emoji: names.
      EmojiPattern = /:(#{Emoji.names.map { |name| Regexp.escape(name) }.join('|')}):/

      # Replaces :emoji: codes in every text node that contains a colon and
      # is not inside a <pre> or <code> element.
      #
      # Returns the document.
      def call
        doc.search('text()').each do |node|
          content = node.to_html
          next if !content.include?(':')
          next if has_ancestor?(node, %w(pre code))
          html = emoji_image_filter(content)
          # Nothing matched, so leave the node untouched.
          next if html == content
          node.replace(html)
        end
        doc
      end

      # Implementation of validate hook.
      # Errors should raise exceptions or use an existing validator.
      def validate
        needs :asset_root
      end

      # Replace :emoji: with corresponding images.
      #
      # text - String text to replace :emoji: in.
      #
      # Returns a String with :emoji: replaced with images.
      def emoji_image_filter(text)
        return text unless text.include?(':')

        text.gsub EmojiPattern do
          name = $1
          # NOTE(review): img markup reconstructed (the listing carried only a
          # newline here); the src layout follows the filter's test, which
          # expects "<asset_root>/emoji/<name>.png". Confirm the remaining
          # attributes against upstream.
          "<img class='emoji' title=':#{name}:' alt=':#{name}:' src='#{File.join(asset_root, "emoji", "#{name}.png")}' height='20' width='20' align='absmiddle' />"
        end
      end

      # The base url to link emoji sprites
      #
      # Returns the context's :asset_root. This method does not raise itself;
      # #validate is where :asset_root is declared as required.
      def asset_root
        context[:asset_root]
      end
    end
  end
end
--------------------------------------------------------------------------------
/test/html/pipeline/image_max_width_filter_test.rb:
--------------------------------------------------------------------------------
1 | require "test_helper"
2 |
3 | class HTML::Pipeline::ImageMaxWidthFilterTest < Test::Unit::TestCase
4 | def filter(html)
5 | HTML::Pipeline::ImageMaxWidthFilter.call(html)
6 | end
7 |
8 | def test_rewrites_image_style_tags
9 | body = "Screenshot: 
"
10 | doc = Nokogiri::HTML::DocumentFragment.parse(body)
11 |
12 | res = filter(doc)
13 | assert_equal_html %q(Screenshot: 
),
14 | res.to_html
15 | end
16 |
17 | def test_leaves_existing_image_style_tags_alone
18 | body = "
"
19 | doc = Nokogiri::HTML::DocumentFragment.parse(body)
20 |
21 | res = filter(doc)
22 | assert_equal_html '
',
23 | res.to_html
24 | end
25 |
26 | def test_links_to_image
27 | body = "Screenshot: 
"
28 | doc = Nokogiri::HTML::DocumentFragment.parse(body)
29 |
30 | res = filter(doc)
31 | assert_equal_html 'Screenshot: 
',
32 | res.to_html
33 | end
34 |
35 | def test_doesnt_link_to_image_when_already_linked
36 | body = "Screenshot: 
"
37 | doc = Nokogiri::HTML::DocumentFragment.parse(body)
38 |
39 | res = filter(doc)
40 | assert_equal_html %q(Screenshot: 
),
41 | res.to_html
42 | end
43 |
44 | def test_doesnt_screw_up_inlined_images
45 | body = "Screenshot
, yes, this is a screenshot indeed.
"
46 | doc = Nokogiri::HTML::DocumentFragment.parse(body)
47 |
48 | assert_equal_html %q(Screenshot
, yes, this is a screenshot indeed.
), filter(doc).to_html
49 | end
50 | end
51 |
--------------------------------------------------------------------------------
/lib/html/pipeline/camo_filter.rb:
--------------------------------------------------------------------------------
require 'openssl'

module HTML
  class Pipeline
    # HTML Filter for replacing http image URLs with camo versions. See:
    #
    # https://github.com/atmos/camo
    #
    # All images provided in user content should be run through this
    # filter so that http image sources do not cause mixed-content warnings
    # in browser clients.
    #
    # Context options:
    #   :asset_proxy (required) - Base URL for constructed asset proxy URLs.
    #   :asset_proxy_secret_key (required) - The shared secret used to encode URLs.
    #   :disable_asset_proxy - when truthy, image sources are left untouched.
    #
    # This filter does not write additional information to the context.
    class CamoFilter < Filter
      # Hijacks images in the markup provided, replacing them with URLs that
      # go through the github asset proxy.
      #
      # Images without a src attribute are skipped. http://github.com sources
      # are switched to https; any remaining http source, and any
      # https://img.skitch.com/ source, is routed through the proxy.
      #
      # Returns the document.
      def call
        doc.search("img").each do |element|
          next if element['src'].nil?
          src = element['src'].strip
          src = src.sub(%r!^http://github.com!, 'https://github.com')
          # Checked after the rewrite above, so a disabled proxy leaves the
          # element's src exactly as it was.
          next if context[:disable_asset_proxy]

          if src =~ /^http:/ || src =~ /^https:\/\/img.skitch.com\//
            element['src'] = asset_proxy_url(src)
          else
            element['src'] = src
          end
        end
        doc
      end

      # Implementation of validate hook.
      # Errors should raise exceptions or use an existing validator.
      def validate
        needs :asset_proxy, :asset_proxy_secret_key
      end

      # The camouflaged URL for a given image URL, of the form
      # "<asset_proxy>/<hmac of url>/<hex-encoded url>".
      def asset_proxy_url(url)
        "#{asset_proxy_host}/#{asset_url_hash(url)}/#{hexencode(url)}"
      end

      # Private: calculate the HMAC digest for a image source URL.
      #
      # Returns the hex-encoded HMAC-SHA1 of url, keyed with the shared secret.
      def asset_url_hash(url)
        # OpenSSL::Digest::SHA1 is used rather than the deprecated
        # OpenSSL::Digest::Digest wrapper class.
        digest = OpenSSL::Digest::SHA1.new
        OpenSSL::HMAC.hexdigest(digest, asset_proxy_secret_key, url)
      end

      # Private: the hostname to use for generated asset proxied URLs.
      def asset_proxy_host
        context[:asset_proxy]
      end

      # Private: the shared secret read from the context.
      def asset_proxy_secret_key
        context[:asset_proxy_secret_key]
      end

      # Private: helper to hexencode a string. Each byte ends up encoded into
      # two characters, zero padded value in the range [0-9a-f].
      def hexencode(str)
        str.to_enum(:each_byte).map { |byte| "%02x" % byte }.join
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/html/pipeline/email_reply_filter.rb:
--------------------------------------------------------------------------------
module HTML
  class Pipeline
    # HTML Filter that converts email reply text into an HTML DocumentFragment.
    # It must be used as the first filter in a pipeline.
    #
    # Context options:
    #   None
    #
    # This filter does not write any additional information to the context hash.
    class EmailReplyFilter < TextFilter
      include EscapeUtils

      # NOTE(review): the markup in these constants was reconstructed (the
      # listing had lost it). The "email-hidden-reply" and "email-quoted-reply"
      # class names come from the comment on #call; confirm the others
      # against upstream.
      EMAIL_HIDDEN_HEADER    = %(<span class="email-hidden-toggle"><a href="#">&hellip;</a></span><div class="email-hidden-reply" style="display:none">).freeze
      EMAIL_QUOTED_HEADER    = %(<div class="email-quoted-reply">).freeze
      EMAIL_SIGNATURE_HEADER = %(<div class="email-signature-reply">).freeze
      EMAIL_FRAGMENT_HEADER  = %(<div class="email-fragment">).freeze
      EMAIL_HEADER_END       = "</div>".freeze

      # Scans an email body to determine which bits are quoted and which should
      # be hidden. EmailReplyParser is used to split the comment into an Array
      # of quoted or unquoted Blocks. Now, we loop through them and attempt to
      # add <div> tags around them so we can hide the hidden blocks, and style
      # the quoted blocks differently. Since multiple blocks may be hidden, be
      # sure to keep the "email-hidden-reply" <div>s around "email-quoted-reply"
      # <div> tags. Call this on each comment of a visible thread in the order
      # that they are displayed. Note: all comments are processed so we can
      # maintain a Set of SHAs of paragraphs. Only plaintext comments skip the
      # markdown step.
      #
      # Returns the email comment HTML as a String
      def call
        found_hidden = nil
        paragraphs = EmailReplyParser.read(text.dup).fragments.map do |fragment|
          # Strip the leading quote marker from each line. The text has been
          # HTML-escaped by this point, so the marker normally appears as
          # "&gt;"; a literal ">" is accepted as well.
          pieces = [escape_html(fragment.to_s.strip).gsub(/^\s*(>|&gt;)/, '')]
          if fragment.quoted?
            pieces.unshift EMAIL_QUOTED_HEADER
            pieces << EMAIL_HEADER_END
          elsif fragment.signature?
            pieces.unshift EMAIL_SIGNATURE_HEADER
            pieces << EMAIL_HEADER_END
          else
            pieces.unshift EMAIL_FRAGMENT_HEADER
            pieces << EMAIL_HEADER_END
          end
          # The hidden wrapper is opened once, before the first hidden
          # fragment, and closed after the last paragraph below.
          if fragment.hidden? && !found_hidden
            found_hidden = true
            pieces.unshift EMAIL_HIDDEN_HEADER
          end
          pieces.join
        end
        paragraphs << EMAIL_HEADER_END if found_hidden
        paragraphs.join("\n")
      end
    end
  end
end
--------------------------------------------------------------------------------
/test/html/pipeline/markdown_filter_test.rb:
--------------------------------------------------------------------------------
1 | require "test_helper"
2 |
3 | MarkdownFilter = HTML::Pipeline::MarkdownFilter
4 |
5 | class HTML::Pipeline::MarkdownFilterTest < Test::Unit::TestCase # Exercises MarkdownFilter: input type check, the :gfm option and fenced code blocks
6 | def setup # builds the @haiku, @links and @code markdown fixtures
7 | @haiku =
8 | "Pointing at the moon\n" +
9 | "Reminded of simple things\n" +
10 | "Moments matter most"
11 | @links =
12 | "See http://example.org/ for more info"
13 | @code =
14 | "```\n" +
15 | "def hello()" +
16 | " 'world'" +
17 | "end" +
18 | "```" # NOTE(review): pieces 15-17 carry no "\n", so the closing fence is not on its own line - confirm this is intended
19 | end
20 |
21 | def test_fails_when_given_a_documentfragment # MarkdownFilter is expected to raise TypeError for an already-parsed document
22 | body = "
heyo
"
23 | doc = HTML::Pipeline.parse(body)
24 | assert_raise(TypeError) { MarkdownFilter.call(doc, {}) }
25 | end
26 |
27 | def test_gfm_enabled_by_default # three haiku lines -> two <br> elements when no :gfm option is given
28 | doc = MarkdownFilter.to_document(@haiku, {})
29 | assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
30 | assert_equal 2, doc.search('br').size
31 | end
32 |
33 | def test_disabling_gfm # :gfm => false must not turn newlines into <br>
34 | doc = MarkdownFilter.to_document(@haiku, :gfm => false)
35 | assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
36 | assert_equal 0, doc.search('br').size
37 | end
38 |
39 | def test_fenced_code_blocks # a fenced block yields exactly one <pre>
40 | doc = MarkdownFilter.to_document(@code)
41 | assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
42 | assert_equal 1, doc.search('pre').size
43 | end
44 |
45 | def test_fenced_code_blocks_with_language # the language after the opening fence ends up in the <pre> lang attribute
46 | doc = MarkdownFilter.to_document(@code.sub("```", "``` ruby"))
47 | assert doc.kind_of?(HTML::Pipeline::DocumentFragment)
48 | assert_equal 1, doc.search('pre').size
49 | assert_equal 'ruby', doc.search('pre').first['lang']
50 | end
51 | end
52 |
53 | class GFMTest < Test::Unit::TestCase # String-level checks of MarkdownFilter output with :gfm => true. NOTE(review): the expected HTML literals below appear to have lost their tags in this listing - confirm against upstream
54 | def gfm(text) # helper: run MarkdownFilter with GitHub-flavored markdown enabled
55 | MarkdownFilter.call(text, :gfm => true)
56 | end
57 |
58 | def test_not_touch_single_underscores_inside_words # "foo_bar" must not become emphasis
59 | assert_equal "
foo_bar
",
60 | gfm("foo_bar")
61 | end
62 |
63 | def test_not_touch_underscores_in_code_blocks # underscores inside an indented code block stay literal
64 | assert_equal "
foo_bar_baz\n
",
65 | gfm(" foo_bar_baz")
66 | end
67 |
68 | def test_not_touch_underscores_in_pre_blocks # input and expected output are the same literal
69 | assert_equal "
\nfoo_bar_baz\n
",
70 | gfm("
\nfoo_bar_baz\n
")
71 | end
72 |
73 | def test_not_touch_two_or_more_underscores_inside_words # "foo_bar_baz" must not become emphasis
74 | assert_equal "
foo_bar_baz
",
75 | gfm("foo_bar_baz")
76 | end
77 |
78 | def test_turn_newlines_into_br_tags_in_simple_cases # a single newline inside a paragraph
79 | assert_equal "
foo
\nbar
",
80 | gfm("foo\nbar")
81 | end
82 |
83 | def test_convert_newlines_in_all_groups # two paragraphs separated by a blank line
84 | assert_equal "
apple
\npear
\norange
\n\n" +
85 | "
ruby
\npython
\nerlang
",
86 | gfm("apple\npear\norange\n\nruby\npython\nerlang")
87 | end
88 |
89 | def test_convert_newlines_in_even_long_groups # same as above with a four-line first paragraph
90 | assert_equal "
apple
\npear
\norange
\nbanana
\n\n" +
91 | "
ruby
\npython
\nerlang
",
92 | gfm("apple\npear\norange\nbanana\n\nruby\npython\nerlang")
93 | end
94 |
95 | def test_not_convert_newlines_in_lists # covers both "# " heading lines and "* " list items
96 | assert_equal "
foo
\n\n
bar
",
97 | gfm("# foo\n# bar")
98 | assert_equal "
",
99 | gfm("* foo\n* bar")
100 | end
101 | end
102 |
--------------------------------------------------------------------------------
/lib/html/pipeline/@mention_filter.rb:
--------------------------------------------------------------------------------
1 | require 'set'
2 |
3 | module HTML
4 | class Pipeline
5 | # HTML filter that replaces @user mentions with links. Mentions within <pre>,
6 | # <code>, and <a> elements are ignored. Mentions that reference users that do
7 | # not exist are ignored.
8 | #
9 | # Context options:
10 | # :base_url - Used to construct links to user profile pages for each
11 | # mention.
12 | # :info_url - Used to link to "more info" when someone mentions @mention
13 | # or @mentioned.
14 | #
15 | class MentionFilter < Filter
16 | # Public: Find user @mentions in text. See
17 | # MentionFilter#mention_link_filter.
18 | #
19 | # MentionFilter.mentioned_logins_in(text) do |match, login, is_mentioned|
20 | # "#{login}"
21 | # end
22 | #
23 | # text - String text to search.
24 | #
25 | # Yields the String match, the String login name, and a Boolean determining
26 | # if the match = "@mention[ed]". The yield's return replaces the match in
27 | # the original text.
28 | #
29 | # Returns a String replaced with the return of the block.
30 | def self.mentioned_logins_in(text)
31 | text.gsub MentionPattern do |match|
32 | login = $1
33 | yield match, login, MentionLogins.include?(login.downcase)
34 | end
35 | end
36 |
37 | # Pattern used to extract @mentions from text.
38 | MentionPattern = /
39 | (?:^|\W) # beginning of string or non-word char
40 | @((?>[a-z0-9][a-z0-9-]*)) # @username
41 | (?!\/) # without a trailing slash
42 | (?=
43 | \.+[ \t\W]| # dots followed by space or non-word character
44 | \.+$| # dots at end of line
45 | [^0-9a-zA-Z_.]| # non-word character except dot
46 | $ # end of line
47 | )
48 | /ix
49 |
50 | # List of username logins that, when mentioned, link to the blog post
51 | # about @mentions instead of triggering a real mention.
52 | MentionLogins = %w(
53 | mention
54 | mentions
55 | mentioned
56 | mentioning
57 | )
58 |
59 | # Don't look for mentions in text nodes that are children of these elements
60 | IGNORE_PARENTS = %w(pre code a).to_set
61 |
62 | def call
63 | doc.search('text()').each do |node|
64 | content = node.to_html
65 | next if !content.include?('@')
66 | next if has_ancestor?(node, IGNORE_PARENTS)
67 | html = mention_link_filter(content, base_url, info_url)
68 | next if html == content
69 | node.replace(html)
70 | end
71 | doc
72 | end
73 |
74 | # The URL to provide when someone @mentions a "mention" name, such as
75 | # @mention or @mentioned, that will give them more info on mentions.
76 | def info_url
77 | context[:info_url] || nil
78 | end
79 |
80 | # Replace user @mentions in text with links to the mentioned user's
81 | # profile page.
82 | #
83 | # text - String text to replace @mention usernames in.
84 | # base_url - The base URL used to construct user profile URLs.
85 | # info_url - The "more info" URL used to link to more info on @mentions.
86 | # If nil we don't link @mention or @mentioned.
87 | #
88 | # Returns a string with @mentions replaced with links. All links have a
89 | # 'user-mention' class name attached for styling.
90 | def mention_link_filter(text, base_url='/', info_url=nil)
91 | self.class.mentioned_logins_in(text) do |match, login, is_mentioned|
92 | link =
93 | if is_mentioned
94 | link_to_mention_info(login, info_url)
95 | else
96 | link_to_mentioned_user(login)
97 | end
98 |
99 | link ? match.sub("@#{login}", link) : match
100 | end
101 | end
102 |
103 | def link_to_mention_info(text, info_url=nil)
104 | return "@#{text}" if info_url.nil?
105 | "" +
106 | "@#{text}" +
107 | ""
108 | end
109 |
110 | def link_to_mentioned_user(login)
111 | url = File.join(base_url, login)
112 | "" +
113 | "@#{login}" +
114 | ""
115 | end
116 | end
117 | end
118 | end
--------------------------------------------------------------------------------
/lib/html/pipeline/sanitization_filter.rb:
--------------------------------------------------------------------------------
1 | require 'sanitize'
2 |
3 | module HTML
4 | class Pipeline
5 | # HTML filter with sanitization routines and whitelists. This module defines
6 | # what HTML is allowed in user provided content and fixes up issues with
7 | # unbalanced tags and whatnot.
8 | #
9 | # See the Sanitize docs for more information on the underlying library:
10 | #
11 | # https://github.com/rgrove/sanitize/#readme
12 | #
13 | # Context options:
14 | # :whitelist - The sanitizer whitelist configuration to use. This can be one
15 | # of the options constants defined in this class or a custom
16 | # sanitize options hash.
17 | #
18 | # This filter does not write additional information to the context.
19 | class SanitizationFilter < Filter
20 | LISTS = Set.new(%w(ul ol).freeze)
21 | LIST_ITEM = 'li'.freeze
22 |
23 | # List of table child elements. These must be contained by a element
24 | # or they are not allowed through. Otherwise they can be used to break out
25 | # of places we're using tables to contain formatted user content (like pull
26 | # request review comments).
27 | TABLE_ITEMS = Set.new(%w(tr td th).freeze)
28 | TABLE = 'table'.freeze
29 |
30 | # The main sanitization whitelist. Only these elements and attributes are
31 | # allowed through by default.
32 | WHITELIST = {
33 | :elements => %w(
34 | h1 h2 h3 h4 h5 h6 h7 h8 br b i strong em a pre code img tt
35 | div ins del sup sub p ol ul table blockquote dl dt dd
36 | kbd q samp var hr ruby rt rp li tr td th
37 | ),
38 | :attributes => {
39 | 'a' => ['href'],
40 | 'img' => ['src'],
41 | 'div' => ['itemscope', 'itemtype'],
42 | :all => ['abbr', 'accept', 'accept-charset',
43 | 'accesskey', 'action', 'align', 'alt', 'axis',
44 | 'border', 'cellpadding', 'cellspacing', 'char',
45 | 'charoff', 'charset', 'checked', 'cite',
46 | 'clear', 'cols', 'colspan', 'color',
47 | 'compact', 'coords', 'datetime', 'dir',
48 | 'disabled', 'enctype', 'for', 'frame',
49 | 'headers', 'height', 'hreflang',
50 | 'hspace', 'ismap', 'label', 'lang',
51 | 'longdesc', 'maxlength', 'media', 'method',
52 | 'multiple', 'name', 'nohref', 'noshade',
53 | 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
54 | 'rows', 'rowspan', 'rules', 'scope',
55 | 'selected', 'shape', 'size', 'span',
56 | 'start', 'summary', 'tabindex', 'target',
57 | 'title', 'type', 'usemap', 'valign', 'value',
58 | 'vspace', 'width', 'itemprop']
59 | },
60 | :protocols => {
61 | 'a' => {'href' => ['http', 'https', 'mailto', :relative, 'github-windows', 'github-mac']},
62 | 'img' => {'src' => ['http', 'https', :relative]}
63 | },
64 | :transformers => [
65 | # Top-level elements are removed because they can break out of
66 | # containing markup.
67 | lambda { |env|
68 | name, node = env[:node_name], env[:node]
69 | if name == LIST_ITEM && !node.ancestors.any?{ |n| LISTS.include?(n.name) }
70 | node.replace(node.children)
71 | end
72 | },
73 |
74 | # Table child elements that are not contained by a are removed.
75 | lambda { |env|
76 | name, node = env[:node_name], env[:node]
77 | if TABLE_ITEMS.include?(name) && !node.ancestors.any? { |n| n.name == TABLE }
78 | node.replace(node.children)
79 | end
80 | }
81 | ]
82 | }
83 |
84 | # A more limited sanitization whitelist. This includes all attributes,
85 | # protocols, and transformers from WHITELIST but with a more locked down
86 | # set of allowed elements.
87 | LIMITED = WHITELIST.merge(
88 | :elements => %w(b i strong em a pre code img ins del sup sub p ol ul li))
89 |
90 | # Strip all HTML tags from the document.
91 | FULL = { :elements => [] }
92 |
93 | # Sanitize markup using the Sanitize library.
94 | def call
95 | Sanitize.clean_node!(doc, whitelist)
96 | end
97 |
98 | # The whitelist to use when sanitizing. This can be passed in the context
99 | # hash to the filter but defaults to WHITELIST constant value above.
100 | def whitelist
101 | context[:whitelist] || WHITELIST
102 | end
103 | end
104 | end
105 | end
--------------------------------------------------------------------------------
/test/html/pipeline/mention_filter_test.rb:
--------------------------------------------------------------------------------
1 | require "test_helper"
2 |
3 | class HTML::Pipeline::MentionFilterTest < Test::Unit::TestCase
4 | def filter(html, base_url='/', info_url=nil)
5 | HTML::Pipeline::MentionFilter.call(html, :base_url => base_url, :info_url => info_url)
6 | end
7 |
8 | def test_filtering_a_documentfragment
9 | body = "@kneath: check it out.
"
10 | doc = Nokogiri::HTML::DocumentFragment.parse(body)
11 |
12 | res = filter(doc, '/')
13 | assert_same doc, res
14 |
15 | link = "@kneath"
16 | assert_equal "#{link}: check it out.
",
17 | res.to_html
18 | end
19 |
20 | def test_filtering_plain_text
21 | body = "@kneath: check it out.
"
22 | res = filter(body, '/')
23 |
24 | link = "@kneath"
25 | assert_equal "#{link}: check it out.
",
26 | res.to_html
27 | end
28 |
29 | def test_not_replacing_mentions_in_pre_tags
30 | body = "@kneath: okay
"
31 | assert_equal body, filter(body).to_html
32 | end
33 |
34 | def test_not_replacing_mentions_in_code_tags
35 | body = "@kneath: okay
"
36 | assert_equal body, filter(body).to_html
37 | end
38 |
39 | def test_not_replacing_mentions_in_links
40 | body = "@kneath okay
"
41 | assert_equal body, filter(body).to_html
42 | end
43 |
44 | def test_entity_encoding_and_whatnot
45 | body = "@kneath what's up
"
46 | link = "@kneath"
47 | assert_equal "#{link} what's up
", filter(body, '/').to_html
48 | end
49 |
50 | def test_html_injection
51 | body = "@kneath <script>alert(0)</script>
"
52 | link = "@kneath"
53 | assert_equal "#{link} <script>alert(0)</script>
",
54 | filter(body, '/').to_html
55 | end
56 |
57 | def test_links_to_nothing_when_no_info_url_given
58 | body = "How do I @mention someone?
"
59 | assert_equal "How do I @mention someone?
",
60 | filter(body, '/').to_html
61 | end
62 |
63 | def test_links_to_more_info_when_info_url_given
64 | body = "How do I @mention someone?
"
65 | link = "@mention"
66 | assert_equal "How do I #{link} someone?
",
67 | filter(body, '/', 'https://github.com/blog/821').to_html
68 | end
69 |
70 | MarkdownPipeline =
71 | HTML::Pipeline.new [
72 | HTML::Pipeline::MarkdownFilter,
73 | HTML::Pipeline::MentionFilter
74 | ]
75 |
76 | def mentioned_usernames
77 | result = {}
78 | MarkdownPipeline.call(@body, {}, result)
79 | html = result[:output].to_html
80 | users = html.scan(/user-mention">@(.+?))
81 | users ? users.flatten.uniq : []
82 | end
83 |
84 | def test_matches_usernames_in_body
85 | @body = "@test how are you?"
86 | assert_equal %w[test], mentioned_usernames
87 | end
88 |
89 | def test_matches_usernames_with_dashes
90 | @body = "hi @some-user"
91 | assert_equal %w[some-user], mentioned_usernames
92 | end
93 |
94 | def test_matches_usernames_followed_by_a_single_dot
95 | @body = "okay @some-user."
96 | assert_equal %w[some-user], mentioned_usernames
97 | end
98 |
99 | def test_matches_usernames_followed_by_multiple_dots
100 | @body = "okay @some-user..."
101 | assert_equal %w[some-user], mentioned_usernames
102 | end
103 |
104 | def test_does_not_match_email_addresses
105 | @body = "aman@tmm1.net"
106 | assert_equal [], mentioned_usernames
107 | end
108 |
109 | def test_does_not_match_domain_name_looking_things
110 | @body = "we need a @github.com email"
111 | assert_equal [], mentioned_usernames
112 | end
113 |
114 | def test_does_not_match_organization_team_mentions
115 | @body = "we need to @github/enterprise know"
116 | assert_equal [], mentioned_usernames
117 | end
118 |
119 | def test_matches_colon_suffixed_names
120 | @body = "@tmm1: what do you think?"
121 | assert_equal %w[tmm1], mentioned_usernames
122 | end
123 |
124 | def test_matches_list_of_names
125 | @body = "@defunkt @atmos @kneath"
126 | assert_equal %w[defunkt atmos kneath], mentioned_usernames
127 | end
128 |
129 | def test_matches_list_of_names_with_commas
130 | @body = "/cc @defunkt, @atmos, @kneath"
131 | assert_equal %w[defunkt atmos kneath], mentioned_usernames
132 | end
133 |
134 | def test_matches_inside_brackets
135 | @body = "(@mislav) and [@rtomayko]"
136 | assert_equal %w[mislav rtomayko], mentioned_usernames
137 | end
138 |
139 | def test_doesnt_ignore_invalid_users
140 | @body = "@defunkt @mojombo and @somedude"
141 | assert_equal ['defunkt', 'mojombo', 'somedude'], mentioned_usernames
142 | end
143 |
144 | def test_returns_distinct_set
145 | @body = "/cc @defunkt, @atmos, @kneath, @defunkt, @defunkt"
146 | assert_equal %w[defunkt atmos kneath], mentioned_usernames
147 | end
148 |
149 | def test_does_not_match_inline_code_block_with_multiple_code_blocks
150 | @body = "something\n\n`/cc @defunkt @atmos @kneath` `/cc @atmos/atmos`"
151 | assert_equal %w[], mentioned_usernames
152 | end
153 |
154 | def test_mention_at_end_of_parenthetical_sentence
155 | @body = "(We're talking 'bout @ymendel.)"
156 | assert_equal %w[ymendel], mentioned_usernames
157 | end
158 | end
159 |
--------------------------------------------------------------------------------
/lib/html/pipeline.rb:
--------------------------------------------------------------------------------
1 | require "nokogiri"
2 | require "active_support/xml_mini/nokogiri" # convert Documents to hashes
3 | require "escape_utils"
4 |
5 | module HTML
6 | # GitHub HTML processing filters and utilities. This module includes a small
7 | # framework for defining DOM based content filters and applying them to user
8 | # provided content.
9 | #
10 | # See HTML::Pipeline::Filter for information on building filters.
11 | #
12 | # Construct a Pipeline for running multiple HTML filters. A pipeline is created once
13 | # with one to many filters, and it then can be `call`ed many times over the course
14 | # of its lifetime with input.
15 | #
16 | # filters - Array of Filter objects. Each must respond to call(doc,
17 | # context) and return the modified DocumentFragment or a
18 | # String containing HTML markup. Filters are performed in the
19 | # order provided.
20 | # default_context - The default context hash. Values specified here will be merged
21 | # into values from each individual pipeline run. Can NOT be
22 | # nil. Default: empty Hash.
23 | # result_class - The default Class of the result object for individual
24 | # calls. Default: Hash. Protip: Pass in a Struct to get
25 | # some semblance of type safety.
26 | class Pipeline
27 | autoload :VERSION, 'html/pipeline/version'
28 | autoload :Pipeline, 'html/pipeline/pipeline'
29 | autoload :Filter, 'html/pipeline/filter'
30 | autoload :BodyContent, 'html/pipeline/body_content'
31 | autoload :AutolinkFilter, 'html/pipeline/autolink_filter'
32 | autoload :CamoFilter, 'html/pipeline/camo_filter'
33 | autoload :EmailReplyFilter, 'html/pipeline/email_reply_filter'
34 | autoload :EmojiFilter, 'html/pipeline/emoji_filter'
35 | autoload :HttpsFilter, 'html/pipeline/https_filter'
36 | autoload :ImageMaxWidthFilter, 'html/pipeline/image_max_width_filter'
37 | autoload :MarkdownFilter, 'html/pipeline/markdown_filter'
38 | autoload :MentionFilter, 'html/pipeline/@mention_filter'
39 | autoload :PlainTextInputFilter, 'html/pipeline/plain_text_input_filter'
40 | autoload :SanitizationFilter, 'html/pipeline/sanitization_filter'
41 | autoload :SyntaxHighlightFilter, 'html/pipeline/syntax_highlight_filter'
42 | autoload :TextileFilter, 'html/pipeline/textile_filter'
43 | autoload :TableOfContentsFilter, 'html/pipeline/toc_filter'
44 | autoload :TextFilter, 'html/pipeline/text_filter'
45 |
46 | # Our DOM implementation.
47 | DocumentFragment = Nokogiri::HTML::DocumentFragment
48 |
49 | # Parse a String into a DocumentFragment object. When a DocumentFragment is
50 | # provided, return it verbatim.
51 | def self.parse(document_or_html)
52 | document_or_html ||= ''
53 | if document_or_html.is_a?(String)
54 | DocumentFragment.parse(document_or_html)
55 | else
56 | document_or_html
57 | end
58 | end
59 |
60 | # Public: Returns an Array of Filter objects for this Pipeline.
61 | attr_reader :filters
62 |
63 | def initialize(filters, default_context = {}, result_class = nil)
64 | raise ArgumentError, "default_context cannot be nil" if default_context.nil?
65 | @filters = filters.flatten.freeze
66 | @default_context = default_context.freeze
67 | @result_class = result_class || Hash
68 | end
69 |
70 | # Apply all filters in the pipeline to the given HTML.
71 | #
72 | # html - A String containing HTML or a DocumentFragment object.
73 | # context - The context hash passed to each filter. See the Filter docs
74 | # for more info on possible values. This object MUST NOT be modified
75 | # in place by filters. Use the Result for passing state back.
76 | # result - The result Hash passed to each filter for modification. This
77 | # is where Filters store extracted information from the content.
78 | #
79 | # Returns the result Hash after being filtered by this Pipeline. Contains an
80 | # :output key with the DocumentFragment or String HTML markup based on the
81 | # output of the last filter in the pipeline.
82 | def call(html, context = {}, result = nil)
83 | context = @default_context.merge(context)
84 | context = context.freeze
85 | result ||= @result_class.new
86 | result[:output] = @filters.inject(html) { |doc, filter| filter.call(doc, context, result) }
87 | result
88 | end
89 |
90 | # Like call but guarantee the value returned is a DocumentFragment.
91 | # Pipelines may return a DocumentFragment or a String. Callers that need a
92 | # DocumentFragment should use this method.
93 | def to_document(input, context = {}, result = nil)
94 | result = call(input, context, result)
95 | HTML::Pipeline.parse(result[:output])
96 | end
97 |
98 | # Like call but guarantee the value returned is a string of HTML markup.
99 | def to_html(input, context = {}, result = nil)
100 | result = call(input, context, result = nil)
101 | output = result[:output]
102 | if output.respond_to?(:to_html)
103 | output.to_html
104 | else
105 | output.to_s
106 | end
107 | end
108 | end
109 | end
110 |
111 | # XXX nokogiri monkey patches
112 | class Nokogiri::XML::Node
113 | # Work around an issue with utf-8 encoded data being erroneously converted to
114 | # ... some other shit when replacing text nodes. See 'utf-8 output 2' in
115 | # user_content_test.rb for details.
116 | def replace_with_encoding_fix(replacement)
117 | if replacement.respond_to?(:to_str)
118 | replacement = document.fragment("#{replacement}
").children.first.children
119 | end
120 | replace_without_encoding_fix(replacement)
121 | end
122 |
123 | alias_method :replace_without_encoding_fix, :replace
124 | alias_method :replace, :replace_with_encoding_fix
125 |
126 | def swap(replacement)
127 | replace(replacement)
128 | self
129 | end
130 | end
131 |
--------------------------------------------------------------------------------
/lib/html/pipeline/filter.rb:
--------------------------------------------------------------------------------
1 | module HTML
2 | class Pipeline
3 | # Base class for user content HTML filters. Each filter takes an
4 | # HTML string or Nokogiri::HTML::DocumentFragment, performs
5 | # modifications and/or writes information to the result hash. Filters must
6 | # return a DocumentFragment (typically the same instance provided to the call
7 | # method) or a String with HTML markup.
8 | #
9 | # Example filter that replaces all images with trollface:
10 | #
11 | # class FuuuFilter < HTML::Pipeline::Filter
12 | # def call
13 | # doc.search('img').each do |img|
14 | # img['src'] = "http://paradoxdgn.com/junk/avatars/trollface.jpg"
15 | # end
16 | # end
17 | # end
18 | #
19 | # The context Hash passes options to filters and should not be changed in
20 | # place. A Result Hash allows filters to make extracted information
21 | # available to the caller and is mutable.
22 | #
23 | # Common context options:
24 | # :base_url - The site's base URL
25 | # :repository - A Repository providing context for the HTML being processed
26 | #
27 | # Each filter may define additional options and output values. See the class
28 | # docs for more info.
29 | class Filter
30 | class InvalidDocumentException < StandardError; end
31 |
32 | def initialize(doc, context = nil, result = nil)
33 | if doc.kind_of?(String)
34 | @html = doc.to_str
35 | @doc = nil
36 | else
37 | @doc = doc
38 | @html = nil
39 | end
40 | @context = context || {}
41 | @result = result || {}
42 | validate
43 | end
44 |
45 | # Public: Returns a simple Hash used to pass extra information into filters
46 | # and also to allow filters to make extracted information available to the
47 | # caller.
48 | attr_reader :context
49 |
50 | # Public: Returns a Hash used to allow filters to pass back information
51 | # to callers of the various Pipelines. This can be used for
52 | # #mentioned_users, for example.
53 | attr_reader :result
54 |
55 | # The Nokogiri::HTML::DocumentFragment to be manipulated. If the filter was
56 | # provided a String, parse into a DocumentFragment the first time this
57 | # method is called.
58 | def doc
59 | @doc ||= parse_html(html)
60 | end
61 |
62 | # The String representation of the document. If a DocumentFragment was
63 | # provided to the Filter, it is serialized into a String when this method is
64 | # called.
65 | def html
66 | raise InvalidDocumentException if @html.nil? && @doc.nil?
67 | @html || doc.to_html
68 | end
69 |
70 | # The main filter entry point. The doc attribute is guaranteed to be a
71 | # Nokogiri::HTML::DocumentFragment when invoked. Subclasses should modify
72 | # this document in place or extract information and add it to the context
73 | # hash.
74 | def call
75 | raise NotImplementedError
76 | end
77 |
78 | # Make sure the context has everything we need. Noop: Subclasses can override.
79 | def validate
80 | end
81 |
82 | # The Repository object provided in the context hash, or nil when no
83 | # :repository was specified.
84 | #
85 | # It's assumed that the repository context has already been checked
86 | # for permissions
87 | def repository
88 | context[:repository]
89 | end
90 |
91 | # The User object provided in the context hash, or nil when no user
92 | # was specified
93 | def current_user
94 | context[:current_user]
95 | end
96 |
97 | # Return whether the filter can access a given repo while
98 | # applying a filter
99 | #
100 | # A repo can only be accessed if its pullable by the user who
101 | # submitted the content of this filter, or if it's the same as
102 | # the repository context in which the filter runs
103 | def can_access_repo?(repo)
104 | return false if repo.nil?
105 | return true if repo == repository
106 | repo.pullable_by?(current_user)
107 | end
108 |
109 | # The site's base URL provided in the context hash, or '/' when no
110 | # base URL was specified.
111 | def base_url
112 | context[:base_url] || '/'
113 | end
114 |
115 | # Ensure the passed argument is a DocumentFragment. When a string is
116 | # provided, it is parsed and returned; otherwise, the DocumentFragment is
117 | # returned unmodified.
118 | def parse_html(html)
119 | HTML::Pipeline.parse(html)
120 | end
121 |
122 | # Helper method for filter subclasses used to determine if any of a node's
123 | # ancestors have one of the tag names specified.
124 | #
125 | # node - The Node object to check.
126 | # tags - An array of tag name strings to check. These should be downcase.
127 | #
128 | # Returns true when the node has a matching ancestor.
129 | def has_ancestor?(node, tags)
130 | while node = node.parent
131 | if tags.include?(node.name.downcase)
132 | break true
133 | end
134 | end
135 | end
136 |
137 | # Perform a filter on doc with the given context.
138 | #
139 | # Returns a HTML::Pipeline::DocumentFragment or a String containing HTML
140 | # markup.
141 | def self.call(doc, context = nil, result = nil)
142 | new(doc, context, result).call
143 | end
144 |
145 | # Like call but guarantees that a DocumentFragment is returned, even when
146 | # the last filter returns a String.
147 | def self.to_document(input, context = nil)
148 | html = call(input, context)
149 | HTML::Pipeline::parse(html)
150 | end
151 |
152 | # Like call but guarantees that a string of HTML markup is returned.
153 | def self.to_html(input, context = nil)
154 | output = call(input, context)
155 | if output.respond_to?(:to_html)
156 | output.to_html
157 | else
158 | output.to_s
159 | end
160 | end
161 |
162 | # Validator for required context. This will check that anything passed in
163 | # contexts exists in @contexts
164 | #
165 | # If any errors are found an ArgumentError will be raised with a
166 | # message listing all the missing contexts and the filters that
167 | # require them.
168 | def needs(*keys)
169 | missing = keys.reject { |key| context.include? key }
170 |
171 | if missing.any?
172 | raise ArgumentError,
173 | "Missing context keys for #{self.class.name}: #{missing.map(&:inspect).join ', '}"
174 | end
175 | end
176 | end
177 | end
178 | end
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HTML::Pipeline [Build Status](http://travis-ci.org/jch/html-pipeline)
2 |
3 | GitHub HTML processing filters and utilities. This module includes a small
4 | framework for defining DOM based content filters and applying them to user
5 | provided content.
6 |
7 | ## Installation
8 |
9 | Add this line to your application's Gemfile:
10 |
11 | ```ruby
12 | gem 'html-pipeline'
13 | ```
14 |
15 | And then execute:
16 |
17 | ```sh
18 | $ bundle
19 | ```
20 |
21 | Or install it yourself as:
22 |
23 | ```sh
24 | $ gem install html-pipeline
25 | ```
26 |
27 | ## Usage
28 |
29 | This library provides a handful of chainable HTML filters to transform user
30 | content into markup. A filter takes an HTML string or
31 | `Nokogiri::HTML::DocumentFragment`, optionally manipulates it, and then
32 | outputs the result.
33 |
34 | For example, to transform Markdown source into Markdown HTML:
35 |
36 | ```ruby
37 | require 'html/pipeline'
38 |
39 | filter = HTML::Pipeline::MarkdownFilter.new("Hi **world**!")
40 | filter.call
41 | ```
42 |
43 | Filters can be combined into a pipeline which causes each filter to hand its
44 | output to the next filter's input. So if you wanted to have content be
45 | filtered through Markdown and be syntax highlighted, you can create the
46 | following pipeline:
47 |
48 | ```ruby
49 | pipeline = HTML::Pipeline.new [
50 | HTML::Pipeline::MarkdownFilter,
51 | HTML::Pipeline::SyntaxHighlightFilter
52 | ]
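53 | result = pipeline.call <<-CODE
54 | This is *great*:
55 |
56 | ``` ruby
57 | some_code(:first)
58 | ```
59 |
60 | CODE
61 | result[:output].to_s
62 | ```
63 |
64 | Prints:
65 |
66 | ```html
67 | <p>This is <em>great</em>:</p>
68 |
69 | <div class="highlight">
70 | <pre><span class="n">some_code</span><span class="p">(:</span><span class="ss">first</span><span class="p">)</span>
71 | </pre>
72 | </div>
73 | ```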
74 |
75 | Some filters take an optional **context** and/or **result** hash. These are
76 | used to pass around arguments and metadata between filters in a pipeline. For
77 | example, if you don't want to use GitHub formatted Markdown, you can
78 | pass an option in the context hash:
79 |
80 | ```ruby
81 | filter = HTML::Pipeline::MarkdownFilter.new("Hi **world**!", :gfm => false)
82 | filter.call
83 | ```
84 |
85 | ## Filters
86 |
87 | * `MentionFilter` - replace `@user` mentions with links
88 | * `AutolinkFilter` - auto_linking urls in HTML
89 | * `CamoFilter` - replace http image urls with [camo-fied](https://github.com/atmos/camo) https versions
90 | * `EmailReplyFilter` - util filter for working with emails
91 | * `EmojiFilter` - everyone loves [emoji](http://www.emoji-cheat-sheet.com/)!
92 | * `HttpsFilter` - HTML Filter for replacing http github urls with https versions.
93 | * `ImageMaxWidthFilter` - link to full size image for large images
94 | * `MarkdownFilter` - convert markdown to html
95 | * `PlainTextInputFilter` - html escape text and wrap the result in a div
96 | * `SanitizationFilter` - whitelist sanitize user markup
97 | * `SyntaxHighlightFilter` - code syntax highlighter with [linguist](https://github.com/github/linguist)
98 | * `TextileFilter` - convert textile to html
99 | * `TableOfContentsFilter` - anchor headings with name attributes
100 |
101 | ## Examples
102 |
103 | We define different pipelines for different parts of our app. Here are a few
104 | paraphrased snippets to get you started:
105 |
106 | ```ruby
107 | # The context hash is how you pass options between different filters.
108 | # See individual filter source for explanation of options.
109 | context = {
110 | :asset_root => "http://your-domain.com/where/your/images/live/icons",
111 | :base_url => "http://your-domain.com"
112 | }
113 |
114 | # Pipeline providing sanitization and image hijacking but no mention
115 | # related features.
116 | SimplePipeline = Pipeline.new [
117 | SanitizationFilter,
118 | TableOfContentsFilter, # add 'name' anchors to all headers
119 | CamoFilter,
120 | ImageMaxWidthFilter,
121 | SyntaxHighlightFilter,
122 | EmojiFilter,
123 | AutolinkFilter
124 | ], context, {}
125 |
126 | # Pipeline used for user provided content on the web
127 | MarkdownPipeline = Pipeline.new [
128 | MarkdownFilter,
129 | SanitizationFilter,
130 | CamoFilter,
131 | ImageMaxWidthFilter,
132 | HttpsFilter,
133 | MentionFilter,
134 | EmojiFilter,
135 | SyntaxHighlightFilter
136 | ], context.merge(:gfm => true), {} # enable github formatted markdown
137 |
138 |
139 | # Define a pipeline based on another pipeline's filters
140 | NonGFMMarkdownPipeline = Pipeline.new(MarkdownPipeline.filters,
141 | context.merge(:gfm => false), {})
142 |
143 | # Pipelines aren't limited to the web. You can use them for email
144 | # processing also.
145 | HtmlEmailPipeline = Pipeline.new [
146 | ImageMaxWidthFilter
147 | ], {}, {}
148 |
149 | # Just emoji.
150 | EmojiPipeline = Pipeline.new [
151 | PlainTextInputFilter,
152 | EmojiFilter
153 | ], context, {}
154 | ```
155 |
156 | ## Extending
157 | To write a custom filter, create a class that inherits from
158 | `HTML::Pipeline::Filter` and defines a `call` method.
159 |
160 | For example, this filter adds a base URL to images whose `src` is root-relative:
161 |
162 | ```ruby
163 | require 'uri'
164 |
165 | class RootRelativeFilter < HTML::Pipeline::Filter
166 |
167 | def call
168 | doc.search("img").each do |img|
169 | next if img['src'].nil?
170 | src = img['src'].strip
171 | if src.start_with? '/'
172 | img["src"] = URI.join(context[:base_url], src).to_s
173 | end
174 | end
175 | doc
176 | end
177 |
178 | end
179 | ```
180 |
181 | Now this filter can be used in a pipeline:
182 |
183 | ```ruby
184 | Pipeline.new [ RootRelativeFilter ], { :base_url => 'http://somehost.com' }
185 | ```
186 |
187 | ## Development
188 |
189 | To see what has changed in recent versions, see the [CHANGELOG](https://github.com/jch/html-pipeline/blob/master/CHANGELOG.md).
190 |
191 | ```sh
192 | bundle
193 | rake test
194 | ```
195 |
196 | ## Contributing
197 |
198 | 1. [Fork it](https://help.github.com/articles/fork-a-repo)
199 | 2. Create your feature branch (`git checkout -b my-new-feature`)
200 | 3. Commit your changes (`git commit -am 'Added some feature'`)
201 | 4. Push to the branch (`git push origin my-new-feature`)
202 | 5. Create a new [Pull Request](https://help.github.com/articles/using-pull-requests)
203 |
204 |
205 | ## TODO
206 |
207 | * test whether emoji filter works on heroku
208 | * test whether nokogiri monkey patch is still necessary
209 |
210 | ## Contributors
211 |
212 | * [Aman Gupta](mailto:aman@tmm1.net)
213 | * [Jake Boxer](mailto:jake@github.com)
214 | * [Joshua Peek](mailto:josh@joshpeek.com)
215 | * [Kyle Neath](mailto:kneath@gmail.com)
216 | * [Rob Sanheim](mailto:rsanheim@gmail.com)
217 | * [Simon Rozet](mailto:simon@rozet.name)
218 | * [Vicent Martí](mailto:tanoku@gmail.com)
219 | * [Risk :danger: Olson](mailto:technoweenie@gmail.com)
220 |
221 | This project is a member of the [OSS Manifesto](http://ossmanifesto.org/).
222 |
--------------------------------------------------------------------------------