├── var ├── company ├── created ├── name ├── title ├── version ├── summary ├── authors ├── copyrights ├── repositories ├── requirements ├── description └── resources ├── .ruby ├── Gemfile ├── work └── sample.html ├── bin ├── cssfilter └── htmlfilter ├── test ├── helper.rb ├── test_cssfilter.rb └── test_htmlfilter.rb ├── .gitignore ├── .yardopts ├── .travis.yml ├── MANIFEST ├── NOTICE.md ├── Assembly ├── LICENSE.txt ├── .index ├── HISTORY.md ├── README.md ├── lib ├── cssfilter.rb └── htmlfilter.rb └── htmlfilter.gemspec /var/company: -------------------------------------------------------------------------------- 1 | RubyWorks -------------------------------------------------------------------------------- /var/created: -------------------------------------------------------------------------------- 1 | 2009-06-25 -------------------------------------------------------------------------------- /var/name: -------------------------------------------------------------------------------- 1 | htmlfilter 2 | -------------------------------------------------------------------------------- /var/title: -------------------------------------------------------------------------------- 1 | HTMLFilter -------------------------------------------------------------------------------- /var/version: -------------------------------------------------------------------------------- 1 | 1.3.0 2 | -------------------------------------------------------------------------------- /var/summary: -------------------------------------------------------------------------------- 1 | HTML/CSS Sanity -------------------------------------------------------------------------------- /.ruby: -------------------------------------------------------------------------------- 1 | htmlfilter 1.3.0 2012-12-14 2 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source :rubygems 2 | 
gemspec 3 | -------------------------------------------------------------------------------- /work/sample.html: -------------------------------------------------------------------------------- 1 | 6 | licensed under a Creative Commons Attribution-ShareAlike 2.5 License 7 | 8 | Copyright (c) 2007 Cal Henderson 9 | 10 | [CC-BY-SA](http://creativecommons.org/licenses/by-sa/3.0/.Attribution-ShareAlike 3.0) License 11 | 12 | -------------------------------------------------------------------------------- /Assembly: -------------------------------------------------------------------------------- 1 | --- 2 | github: 3 | gh_pages: web 4 | 5 | gem: 6 | active: true 7 | 8 | dnote: 9 | service : DNote 10 | labels : ~ 11 | output: 12 | - log/NOTES.md 13 | 14 | yard: 15 | active: true 16 | 17 | #ruby-test: 18 | # tests : test/test_*.rb 19 | # loadpath : ~ 20 | # requires : ~ 21 | # active : true 22 | 23 | email: 24 | service : Email 25 | mailto : 26 | - rubyworks-mailinglist@googlegroups.com 27 | - ruby-talk@ruby-lang.org 28 | 29 | vclog: 30 | output: 31 | - log/Changes.md 32 | - log/History.md 33 | 34 | -------------------------------------------------------------------------------- /test/test_cssfilter.rb: -------------------------------------------------------------------------------- 1 | require './test/helper.rb' 2 | 3 | require "cssfilter" 4 | 5 | class TestCSSFilter < MicroTest::TestCase 6 | 7 | def setup 8 | @css = <<-END 9 | * { 10 | margin: 0; 11 | height: 0; 12 | } 13 | 14 | body { 15 | margin: 0; 16 | height: 0; 17 | background: url(http://xzy.org); 18 | } 19 | 20 | h1 { 21 | trythis: url(http://here.org/fun.js); 22 | font-size: 12pt; 23 | } 24 | END 25 | @result = "* {\nmargin: 0;\nheight: 0;\n}\nbody {\nmargin: 0;\nheight: 0;\n}\nh1 {\ntrythis: url(http://here.org/fun.js);\nfont-size: 12pt;\n}" 26 | end 27 | 28 | def test_filter 29 | cssfilter = CSSFilter.new(:allowed_hosts=>["here.org"], :strip_whitespace => true) 30 | csstree = cssfilter.filter(@css) 31 | 32 | 
csstree.to_s.assert == @result 33 | end 34 | 35 | end 36 | 37 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD-2-Clause License 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are 4 | permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this list of 7 | conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 10 | of conditions and the following disclaimer in the documentation and/or other materials 11 | provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY Thomas Sawyer ``AS IS'' AND ANY EXPRESS OR IMPLIED 14 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 15 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Thomas Sawyer OR 16 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 17 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 18 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 19 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 21 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
22 | 23 | -------------------------------------------------------------------------------- /.index: -------------------------------------------------------------------------------- 1 | --- 2 | type: ruby 3 | revision: 2013 4 | sources: 5 | - var 6 | authors: 7 | - name: Thomas Sawyer 8 | email: transfire@gmail.com 9 | organizations: [] 10 | requirements: 11 | - groups: 12 | - build 13 | development: true 14 | name: detroit 15 | - groups: 16 | - test 17 | development: true 18 | name: microtest 19 | - groups: 20 | - test 21 | development: true 22 | name: ae 23 | conflicts: [] 24 | alternatives: [] 25 | resources: 26 | - type: home 27 | uri: http://rubyworks.github.com/htmlfilter 28 | label: Homepage 29 | - type: docs 30 | uri: http://rubydoc.info/gems/htmlfilter 31 | label: Documentation 32 | - type: code 33 | uri: http://github.com/rubyworks/htmlfilter 34 | label: Source Code 35 | - type: mail 36 | uri: http://groups.google.com/group/rubyworks-mailinglist 37 | label: Mailing List 38 | repositories: 39 | - name: upstream 40 | scm: git 41 | uri: git://github.com/rubyworks/htmlfilter.git 42 | categories: [] 43 | load_path: 44 | - lib 45 | copyrights: 46 | - holder: Thomas Sawyer, Rubyworks 47 | year: '2009' 48 | license: BSD-2-Clause 49 | created: '2009-06-25' 50 | summary: HTML/CSS Sanity 51 | title: HTMLFilter 52 | version: 1.3.0 53 | name: htmlfilter 54 | description: ! 'Pure Ruby library to sanitize and sterilize HTML. 55 | 56 | Also includes a CSS filter.' 57 | date: '2012-12-13' 58 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | # RELEASE HISTORY 2 | 3 | ## 1.3.0 | 2012-12-14 4 | 5 | This release provides a fairly comprehensive set of RELAXED html tags/attributes 6 | and allowed html entities. All thanks to David Wright. The release also adds 7 | a basic command line interface. 
8 | 9 | Changes: 10 | 11 | * Make RELAXED set fairly comprehensive. 12 | * Add basic command line executable. 13 | 14 | 15 | ## 1.2.1 | 2011-10-26 16 | 17 | This release is simply a maintenance release to bring the 18 | project's build configuration up to date. 19 | 20 | Changes: 21 | 22 | * Modernize build configuration. 23 | 24 | 25 | ## 1.2.0 | 2010-10-13 26 | 27 | Finally removed the lowercase variations on the class names. 28 | You must use HTMLFilter now and not HtmlFilter. 29 | 30 | Changes: 31 | 32 | * Remove lowercase variations on class names. 33 | * No longer Multiton. 34 | 35 | 36 | ## 1.1.0 | 2009-11-24 37 | 38 | This release adjusts the names of the classes to 39 | be capitalized according to the actual use of the 40 | terms. Some alternate option presets have been added 41 | as well, and this release sheds the Multiton, which 42 | was basically a YAGNI. 43 | 44 | Changes: 45 | 46 | * Renamed HtmlFilter to HTMLFilter. 47 | * Renamed CssFilter to CSSFilter. 48 | * HTMLFilter is no longer a Multiton. 49 | * Old names are still available temporarily. 50 | * Added built-in option constants. 51 | * CssTree is now CSSFilter::Tree. 52 | 53 | 54 | ## 1.0.0 | 2009-06-25 55 | 56 | First stand-alone release. 57 | 58 | Changes: 59 | 60 | * Birthday!
(Spun-off from Ruby Facets) 61 | 62 | -------------------------------------------------------------------------------- /test/test_htmlfilter.rb: -------------------------------------------------------------------------------- 1 | require './test/helper.rb' 2 | 3 | require "htmlfilter" 4 | 5 | class TestHTMLFilter < MicroTest::TestCase 6 | 7 | # core tests 8 | 9 | def test_strip_single 10 | hf = HTMLFilter.new 11 | hf.send(:strip_single,'\"').assert == '"' 12 | hf.send(:strip_single,'\0').assert == "\000" 13 | end 14 | 15 | # functional tests 16 | 17 | def assert_filter(filtered, original) 18 | original.html_filter.assert == filtered 19 | end 20 | 21 | def test_fix_quotes 22 | assert_filter '', "" 23 | end 24 | 25 | def test_basics 26 | assert_filter '', '' 27 | assert_filter 'hello', 'hello' 28 | end 29 | 30 | def test_balancing_tags 31 | assert_filter "hello", "<hello" 32 | assert_filter "hello", ">hello" 33 | assert_filter "hello", "hello<" 34 | assert_filter "hello", "hello>" 35 | assert_filter "", "<>" 36 | end 37 | 38 | def test_tag_completion 39 | assert_filter "hello", "hello" 40 | assert_filter "hello", "hello" 41 | assert_filter "helloworld", "helloworld" 42 | assert_filter "hello", "hello" 43 | assert_filter "hello", "hello" 44 | assert_filter "helloworld", "helloworld" 45 | assert_filter "hello", "hello" 46 | assert_filter "", "" 47 | end 48 | 49 | def test_end_slashes 50 | assert_filter '', '' 51 | assert_filter '', '' 52 | assert_filter '', '' 53 | end 54 | 55 | end 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTMLFilter 2 | 3 | [Website](http://rubyworks.github.com/htmlfilter) / 4 | [Source Code](http://github.com/rubyworks/htmlfilter) 5 | 6 | 7 | ## Description 8 | 9 | HTML Filter library can be used to sanitize and sterilize 10 | HTML. A good idea if you let users submit HTML in comments, 11 | for instance. 
12 | 13 | This library also include CSSFilter. The CSSFilter class will 14 | clean-up a cascading style sheet. It can be used to remove 15 | whitespace and most importantly remove URLs. 16 | 17 | 18 | ## Features 19 | 20 | * Based on well-worn PHP library. 21 | * Regular expression based filtering. 22 | * Very efficient for small snippets, like blog comments. 23 | * Pure-Ruby and no dependencies. 24 | * Also has library to clean and compact cascading stylesheets. 25 | 26 | 27 | ## Synopsis 28 | 29 | Via the class. 30 | 31 | html = "hello" 32 | 33 | HTMLFilter.new(options).filter(html) 34 | 35 | Or using the String extension. 36 | 37 | html.html_filter(options) #=> "hello" 38 | 39 | See API documentation for more information. 40 | 41 | 42 | ## Installation 43 | 44 | Of course, RubyGems is the answer: 45 | 46 | $ gem install htmlfilter 47 | 48 | 49 | ## Development 50 | 51 | HTMLFilter is hosted on [GitHub](http://github.com/rubyworks/htmlfilter). 52 | 53 | HTMLFilter is a [Rubyworks](http://rubyworks.github.com) project. 54 | 55 | 56 | ## Acknowledgements 57 | 58 | Thanks to Jang Kim for adding support for single quoted attributes. 59 | 60 | HtmlFilter is a port of lib_filter.php, v1.15 by Cal Henderson . 61 | This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License. 62 | See http://creativecommons.org/licenses/by-sa/2.5/. 63 | 64 | 65 | ## Copyrights 66 | 67 | * Copyright (c) 2009 Rubyworks (BSD-2-Clause) 68 | * Copyright (c) 2007 Cal Henderson (CC-BY-SA) 69 | 70 | See LICENSE.txt and NOTICE.md for details. 71 | 72 | -------------------------------------------------------------------------------- /lib/cssfilter.rb: -------------------------------------------------------------------------------- 1 | #require 'htmlfilter/uri' 2 | require 'uri' 3 | 4 | # = CSS Filter 5 | # 6 | # The CSSFilter class will clean up a cascading style sheet. 7 | # It can be used to remove whitespace and most importantly 8 | # remove urls. 
9 | # 10 | # == Issues 11 | # 12 | # TODO: Allow urls to be specified per attribute type. 13 | # 14 | # == Copying 15 | # 16 | # Copyright (c) 2007 Thomas Sawyer 17 | # 18 | # Creative Commons Attribution-ShareAlike 3.0 License 19 | # 20 | # See http://creativecommons.org/licenses/by-sa/3.0/ 21 | 22 | class CSSFilter 23 | 24 | # Library version. 25 | VERSION = "1.3.0" 26 | 27 | # should we remove comments? (true, false) 28 | attr_accessor :strip_comments 29 | 30 | # should we remove urls? (true, false) 31 | attr_accessor :strip_urls 32 | 33 | # url schemes which will be allowed (http, ftp, mailto) 34 | attr_accessor :allowed_scheme 35 | 36 | # alias for allowed_scheme 37 | alias_method :allowed_protocols, :allowed_scheme 38 | alias_method :allowed_protocols=, :allowed_scheme= 39 | 40 | # url hosts which will be allowed. 41 | attr_accessor :allowed_hosts 42 | 43 | # urls which will be allowed. (NOT YET USED) 44 | attr_accessor :allowed_urls 45 | 46 | # substitue urls (NOT YET USED) 47 | attr_accessor :substitute_urls 48 | 49 | # remove blank lines. 50 | attr_accessor :strip_whitespace 51 | 52 | # remove blank lines. 53 | attr_accessor :strip_blanklines 54 | 55 | # Complete parse and rewrite of CSS document. 56 | # This does a complete "cleaning" but note that 57 | # is not yet a perfect parser. 58 | attr_accessor :rewrite 59 | 60 | # CssFilter option defaults. 
# Default value for every option. Keys are the option names as strings;
# #initialize merges caller-supplied options over these.
DEFAULT = {
  'strip_comments'   => true,
  'strip_urls'       => true,
  'allowed_urls'     => [],
  'allowed_hosts'    => [],
  'allowed_scheme'   => [],
  'strip_whitespace' => false,
  'strip_blanklines' => true,
  'rewrite'          => false,
  'substitute_urls'  => {}
}

# Create a new filter.
#
# options - optional Hash of settings. Keys may be strings or symbols and
#           must name one of the option writers; any option not given
#           takes its value from DEFAULT.
#
# Raises NoMethodError if a key does not correspond to an option writer.

def initialize(options=nil)
  settings = {}
  # Copy the container defaults so that per-instance changes (e.g. via
  # #accept_host) never leak into DEFAULT or into other filters.
  DEFAULT.each do |key, value|
    settings[key] = (value.is_a?(Array) || value.is_a?(Hash)) ? value.dup : value
  end
  options.each { |key, value| settings[key.to_s] = value } if options

  settings.each { |key, value| public_send("#{key}=", value) }
end

# Add +host+ to the list of hosts whose urls survive #remove_urls.
# Returns the updated list.

def accept_host(host)
  allowed_hosts << host
end

# Run +css+ through every enabled cleaning step and return the result.
#
# Steps, in order: comments, urls, properties left without a value,
# whitespace, blank lines and finally the optional full rewrite.

def filter(css)
  css = remove_comments(css) if strip_comments
  css = remove_urls(css) if strip_urls

  # Stripping a url leaves "property: ;" behind, so always sweep those up.
  css = remove_nullvalues(css)

  css = remove_whitespace(css) if strip_whitespace
  css = remove_blanklines(css) if strip_blanklines

  css = parse(css).to_css if rewrite
  css
end

# Return +data+ with every /* ... */ comment removed, including comments
# that span several lines.

def remove_comments(data)
  data.gsub(%r{/\*.*?\*/}m, '')
end

# Return a copy of +data+ in which every url(...) whose host is not in
# allowed_hosts and whose scheme is not in allowed_scheme has been removed.
# A url that cannot be parsed is treated as not allowed. The argument
# itself is left untouched.
#
# TODO: allowed_urls

def remove_urls(data)
  cleaned = data.dup
  candidates = data.scan(/url\((.*?)\)/).flatten
  candidates.map { |text| URI.extract(text) }.flatten.each do |found|
    permitted =
      begin
        uri = URI.parse(found)
        allowed_hosts.include?(uri.host) || allowed_scheme.include?(uri.scheme)
      rescue URI::InvalidURIError
        false
      end
    cleaned.sub!(found.to_s, '') unless permitted
  end
  # Drop the now-empty url() wrappers.
  cleaned.gsub(/url\(\s*\)/, '')
end

# Return +data+ with leading and trailing whitespace removed from each line.

def remove_whitespace(data)
  data.gsub(/^\s*/,'').gsub(/\s*$/,'')
end

# Return +data+ with lines that contain only whitespace removed.

def remove_blanklines(data)
  data.gsub(/^\s*\n/,'')
end

# Return +data+ with every "property: ;" declaration (a property with no
# value) removed. Hyphenated property names are removed whole.

def remove_nullvalues(data)
  data.gsub(/[\w-]+:\s*;/,'')
end

# Breaks a css document up into a hash. This can be used
# for completely rewriting the css.
#
# TODO: Not complete, does not work with "@xxx foo;" for example.
#
# css - String holding the stylesheet.
#
# Returns a Tree mapping each selector to a Hash of property => value.
# When a selector occurs more than once its properties are merged, later
# declarations winning.

def parse(css)
  tree = Tree.new
  css.scan(/^(.*?)\{(.*?)\}/m).each do |selector, body|
    rules = (tree[selector.strip] ||= {})
    declarations = clean_properties(body).scan(/(.*?)[:](.*?)([;]|\s*\Z)/)
    declarations.each do |key, val, _terminator|
      rules[key.strip] = clean_value(val)
    end
  end
  tree
end

# Takes a css entry and ensures it is valid (as best it can).
# It will fix trivial mistakes, and raise an error when it is
# beyond repair.
#
# TODO: So far this does absolutely nothing!

def clean_properties(atts)
  atts
end

# Return +val+ stripped of surrounding whitespace, with every url listed in
# substitute_urls replaced by its substitute.
#
# NOTE(review): this previously tested an undefined name (+urls+) and so
# raised NameError for every property. It is assumed substitute_urls maps
# an original url string to its replacement -- TODO confirm. With the
# default (empty) map the value is only stripped.

def clean_value(val)
  val = val.strip

  (substitute_urls || {}).each do |original, replacement|
    # Block form keeps backslashes in the replacement literal.
    val = val.gsub(original.to_s) { replacement.to_s }
  end

  val
end

# CSS parse tree. This is for a "deep filtering".
#
# A Hash of selector => { property => value }.

class Tree < Hash

  # options - optional Hash, stored but not otherwise used by this class.

  def initialize(options=nil)
    @options = options || {}
    super()
  end

  # Re-output the CSS, all tidy ;)
  #
  # Returns one "selector{key:value;...}" line per selector.

  def to_css
    map do |selector, entries|
      declarations = entries.map { |key, value| "#{key}:#{value};" }.join
      "#{selector}{#{declarations}}\n"
    end.join
  end

end

# Simple Command line interface for CSSFilter.
#
# It can be configured via a YAML file.
223 | # 224 | class CLI 225 | def self.run 226 | new.run 227 | end 228 | 229 | attr_reader :config_file 230 | 231 | attr_reader :options 232 | 233 | def initialize 234 | require 'optparse' 235 | @config_file = nil 236 | @options = {} 237 | end 238 | 239 | def parser 240 | OptionParser.new do |opt| 241 | opt.on('--config ', 'filter with custom configuration'){ |file| @config_file = file } 242 | opt.on('--debug', 'run in debug mode to see error details'){ $DEBUG = true } 243 | end 244 | end 245 | 246 | def options 247 | if config_file 248 | raise "configuration file not found" unless File.exist?(config_file) 249 | @options = YAML.load_file(config_file) 250 | end 251 | end 252 | 253 | def run 254 | parser.parse! 255 | begin 256 | files = ARGV 257 | files.each do |f| 258 | raise "cssfilter: file not found -- #{f}" unless File.exist?(f) 259 | end 260 | files.each do |file| 261 | css = File.read(file) 262 | puts CSSFilter.new(options).filter(css) 263 | end 264 | rescue => error 265 | raise error if $DEBUG 266 | $stderr.puts error 267 | end 268 | end 269 | end 270 | 271 | end 272 | 273 | -------------------------------------------------------------------------------- /htmlfilter.gemspec: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'yaml' 4 | require 'pathname' 5 | 6 | module Indexer 7 | 8 | # Convert index data into a gemspec. 9 | # 10 | # Notes: 11 | # * Assumes all executables are in bin/. 12 | # * Does not yet handle default_executable setting. 13 | # * Does not yet handle platform setting. 14 | # * Does not yet handle required_ruby_version. 15 | # * Support for rdoc entries is weak. 16 | # 17 | class GemspecExporter 18 | 19 | # File globs to include in package --unless a manifest file exists. 20 | FILES = ".index .yardopts alt bin data demo ext features lib man spec test try* [A-Z]*.*" unless defined?(FILES) 21 | 22 | # File globs to omit from FILES. 
OMIT = "Config.rb" unless defined?(OMIT)

# Standard file patterns.
PATTERNS = {
  :root => '{.index,Gemfile}',
  :bin  => 'bin/*',
  :lib  => 'lib/{**/}*', #.rb',
  :ext  => 'ext/{**/}extconf.rb',
  :doc  => '*.{txt,rdoc,md,markdown,tt,textile}',
  :test => '{test,spec}/{**/}*.rb'
} unless defined?(PATTERNS)

# For which revision of indexer spec is this converter intended?
REVISION = 2013 unless defined?(REVISION)

# Build a gemspec from the index metadata of the surrounding project.
def self.gemspec
  new.to_gemspec
end

# The index metadata (a Hash with string keys).
attr :metadata

# metadata - optional Hash of index metadata. A +:root+ entry, if present,
#            is removed from the hash and used as the project root. When
#            no metadata remains, the +.index+ file in the root is loaded.
#
# Warns when the metadata revision differs from REVISION.
def initialize(metadata=nil)
  @root_check = false

  if metadata
    root_dir = metadata.delete(:root)
    if root_dir
      @root = root_dir
      @root_check = true
    end
    metadata = nil if metadata.empty?
  end

  @metadata = metadata || YAML.load_file(root + '.index')

  if @metadata['revision'].to_i != REVISION
    warn "This gemspec exporter was not designed for this revision of index metadata."
  end
end

# Is a project root directory known?
def has_root?
  root ? true : false
end

# Project root directory, looked up at most once. Returns nil when it
# could not be found.
def root
  return @root if @root || @root_check
  @root_check = true
  @root = find_root
end

# Path of the project's manifest file (matched case-insensitively), or nil.
def manifest
  return nil unless root
  @manifest ||= Dir.glob(root + 'manifest{,.txt}', File::FNM_CASEFOLD).first
end

# Version control system in use, as a Symbol (:git or :hg), or nil when
# neither marker directory exists in the root.
def scm
  return nil unless root
  @scm ||= begin
    marker = %w{git hg}.find{ |m| (root + ".#{m}").directory? }
    marker && marker.to_sym
  end
end

# Files to package, relative to the project root: the manifest entries
# when a manifest exists (blank lines and lines starting with '#' are
# skipped), otherwise everything matched by FILES minus everything matched
# by OMIT. Entries that are not regular files are dropped.
def files
  return [] unless root
  @files ||= begin
    list =
      if manifest
        File.readlines(manifest).
          map{ |line| line.strip }.
          reject{ |line| line.empty? || line[0,1] == '#' }
      else
        Dir.chdir(root) do
          wanted  = FILES.split(/\s+/).map{ |pattern| glob(pattern) }.flatten
          omitted = OMIT.split(/\s+/).map{ |pattern| glob(pattern) }.flatten
          wanted - omitted
        end
      end
    # The entries are relative to the root, so test them from there rather
    # than from whatever the current directory happens to be.
    Dir.chdir(root) do
      list.select{ |path| File.file?(path) }.uniq
    end
  end
end

# Packaged files (see #files) that match the glob +pattern+.
def glob_files(pattern)
  return [] unless root
  Dir.chdir(root) do
    Dir.glob(pattern).select do |path|
      File.file?(path) && files.include?(path)
    end
  end
end

# Table of standard file patterns.
def patterns
  PATTERNS
end

# Names of the executables found in bin/.
def executables
  @executables ||= \
    glob_files(patterns[:bin]).map do |path|
      File.basename(path)
    end
end

# Paths of the extconf.rb files found under ext/, relative to the root.
# RubyGems expects the paths here, not just the file names.
def extensions
  @extensions ||= glob_files(patterns[:ext])
end

# Gem name: the metadata name, else one derived from the title.
def name
  metadata['name'] || metadata['title'].downcase.gsub(/\W+/,'_')
end

# URI of the first resource that looks like a home page, or false.
def homepage
  page = (
    metadata['resources'].find{ |r| r['type'] =~ /^home/i } ||
    metadata['resources'].find{ |r| r['name'] =~ /^home/i } ||
    metadata['resources'].find{ |r| r['name'] =~ /^web/i }
  )
  page ? page['uri'] : false
end

# License identifiers taken from the copyright entries.
def licenses
  metadata['copyrights'].map{ |c| c['license'] }.compact
end

# Load paths for the gem, defaulting to lib.
def require_paths
  metadata['load_path'] || ['lib']
end

#
# Convert to gemspec.
#
# The path-based settings degrade to empty lists when there is no root,
# so a single code path serves both cases.
#
def to_gemspec
  Gem::Specification.new do |gemspec|
    to_gemspec_data(gemspec)
    to_gemspec_paths(gemspec)
  end
end

#
# Convert pure data settings.
180 | # 181 | def to_gemspec_data(gemspec) 182 | gemspec.name = name 183 | gemspec.version = metadata['version'] 184 | gemspec.summary = metadata['summary'] 185 | gemspec.description = metadata['description'] 186 | 187 | metadata['authors'].each do |author| 188 | gemspec.authors << author['name'] 189 | 190 | if author.has_key?('email') 191 | if gemspec.email 192 | gemspec.email << author['email'] 193 | else 194 | gemspec.email = [author['email']] 195 | end 196 | end 197 | end 198 | 199 | gemspec.licenses = licenses 200 | 201 | requirements = metadata['requirements'] || [] 202 | requirements.each do |req| 203 | next if req['optional'] 204 | next if req['external'] 205 | 206 | name = req['name'] 207 | groups = req['groups'] || [] 208 | 209 | version = gemify_version(req['version']) 210 | 211 | if groups.empty? or groups.include?('runtime') 212 | # populate runtime dependencies 213 | if gemspec.respond_to?(:add_runtime_dependency) 214 | gemspec.add_runtime_dependency(name,*version) 215 | else 216 | gemspec.add_dependency(name,*version) 217 | end 218 | else 219 | # populate development dependencies 220 | if gemspec.respond_to?(:add_development_dependency) 221 | gemspec.add_development_dependency(name,*version) 222 | else 223 | gemspec.add_dependency(name,*version) 224 | end 225 | end 226 | end 227 | 228 | # convert external dependencies into gemspec requirements 229 | requirements.each do |req| 230 | next unless req['external'] 231 | gemspec.requirements << ("%s-%s" % req.values_at('name', 'version')) 232 | end 233 | 234 | gemspec.homepage = homepage 235 | gemspec.require_paths = require_paths 236 | gemspec.post_install_message = metadata['install_message'] 237 | end 238 | 239 | # 240 | # Set gemspec settings that require a root directory path. 241 | # 242 | def to_gemspec_paths(gemspec) 243 | gemspec.files = files 244 | gemspec.extensions = extensions 245 | gemspec.executables = executables 246 | 247 | if Gem::VERSION < '1.7.' 
248 | gemspec.default_executable = gemspec.executables.first 249 | end 250 | 251 | gemspec.test_files = glob_files(patterns[:test]) 252 | 253 | unless gemspec.files.include?('.document') 254 | gemspec.extra_rdoc_files = glob_files(patterns[:doc]) 255 | end 256 | end 257 | 258 | # 259 | # Return a copy of this file. This is used to generate a local 260 | # .gemspec file that can automatically read the index file. 261 | # 262 | def self.source_code 263 | File.read(__FILE__) 264 | end 265 | 266 | private 267 | 268 | def find_root 269 | root_files = patterns[:root] 270 | if Dir.glob(root_files).first 271 | Pathname.new(Dir.pwd) 272 | elsif Dir.glob("../#{root_files}").first 273 | Pathname.new(Dir.pwd).parent 274 | else 275 | #raise "Can't find root of project containing `#{root_files}'." 276 | warn "Can't find root of project containing `#{root_files}'." 277 | nil 278 | end 279 | end 280 | 281 | def glob(pattern) 282 | if File.directory?(pattern) 283 | Dir.glob(File.join(pattern, '**', '*')) 284 | else 285 | Dir.glob(pattern) 286 | end 287 | end 288 | 289 | def gemify_version(version) 290 | case version 291 | when /^(.*?)\+$/ 292 | ">= #{$1}" 293 | when /^(.*?)\-$/ 294 | "< #{$1}" 295 | when /^(.*?)\~$/ 296 | "~> #{$1}" 297 | else 298 | version 299 | end 300 | end 301 | 302 | end 303 | 304 | end 305 | 306 | Indexer::GemspecExporter.gemspec -------------------------------------------------------------------------------- /lib/htmlfilter.rb: -------------------------------------------------------------------------------- 1 | # = HTML Filter 2 | # 3 | # HTML Filter library can be used to sanitize and sterilize 4 | # HTML. A good idea if you let users submit HTML in comments, 5 | # for instance. 6 | # 7 | # HtmlFilter is a port of lib_filter.php, v1.15 by Cal Henderson 8 | # licensed under a Creative Commons Attribution-ShareAlike 2.5 License 9 | # http://creativecommons.org/licenses/by-sa/3.0/. 
10 | # 11 | # == Usage 12 | # 13 | # hf = HTMLFilter.new 14 | # hf.filter("Bold Action") #=> "Bold Action" 15 | # 16 | # == Reference 17 | # 18 | # * http://iamcal.com/publish/articles/php/processing_html/ 19 | # * http://iamcal.com/publish/articles/php/processing_html_part_2/ 20 | # 21 | # == Issues 22 | # 23 | # * The built in option constants could use some refinement. 24 | # 25 | # == Copying 26 | # 27 | # Copyright (c) 2009 Thomas Sawyer, Rubyworks (BSD-2-Clause) 28 | # 29 | # Thanks to Jang Kim for adding support for single quoted attributes. 30 | 31 | class HTMLFilter 32 | 33 | # Library version. 34 | VERSION = "1.3.0" 35 | 36 | # tags and attributes that are allowed 37 | # 38 | # Eg. 39 | # 40 | # { 41 | # 'a' => ['href', 'target'], 42 | # 'b' => [], 43 | # 'img' => ['src', 'width', 'height', 'alt'] 44 | # } 45 | attr_accessor :allowed 46 | 47 | # tags which should always be self-closing (e.g. "") 48 | attr_accessor :no_close 49 | 50 | # tags which must always have seperate opening and closing 51 | # tags (e.g. "") 52 | attr_accessor :always_close 53 | 54 | # attributes which should be checked for valid protocols 55 | # (src,href) 56 | attr_accessor :protocol_attributes 57 | 58 | # protocols which are allowed (http, ftp, mailto) 59 | attr_accessor :allowed_protocols 60 | 61 | # tags which should be removed if they contain no content 62 | # (e.g. "" or "") 63 | attr_accessor :remove_blanks 64 | 65 | # should we remove comments? (true, false) 66 | attr_accessor :strip_comments 67 | 68 | # should we try and make a tag out of "b>" (true, false) 69 | attr_accessor :always_make_tags 70 | 71 | # entity control option (true, false) 72 | attr_accessor :allow_numbered_entities 73 | 74 | # entity control option (amp, gt, lt, quot, etc.) 
75 | attr_accessor :allowed_entities 76 | 77 | ## max number of text characters at which to truncate (leave as +nil+ for no truncation) 78 | #attr_accessor :truncate 79 | 80 | # Default settings 81 | DEFAULT = { 82 | 'allowed' => { 83 | 'a' => ['href', 'target'], 84 | 'img' => ['src', 'width', 'height', 'alt'], 85 | 'b' => [], 86 | 'i' => [], 87 | 'em' => [], 88 | 'tt' => [], 89 | }, 90 | 'no_close' => ['img', 'br', 'hr'], 91 | 'always_close' => ['a', 'b'], 92 | 'protocol_attributes' => ['src', 'href'], 93 | 'allowed_protocols' => ['http', 'ftp', 'mailto'], 94 | 'remove_blanks' => ['a', 'b'], 95 | 'strip_comments' => true, 96 | 'always_make_tags' => true, 97 | 'allow_numbered_entities' => true, 98 | 'allowed_entities' => ['amp', 'gt', 'lt', 'quot'] 99 | } 100 | 101 | # Basic settings are simlialr to DEFAULT but do not allow any type 102 | # of links, neither a href or img. 103 | BASIC = { 104 | 'allowed' => { 105 | 'b' => [], 106 | 'i' => [], 107 | 'em' => [], 108 | 'tt' => [], 109 | }, 110 | 'no_close' => ['img', 'br', 'hr'], 111 | 'always_close' => ['a', 'b'], 112 | 'protocol_attributes' => ['src', 'href'], 113 | 'allowed_protocols' => ['http', 'ftp', 'mailto'], 114 | 'remove_blanks' => ['a', 'b'], 115 | 'strip_comments' => true, 116 | 'always_make_tags' => true, 117 | 'allow_numbered_entities' => true, 118 | 'allowed_entities' => ['amp', 'gt', 'lt', 'quot'] 119 | } 120 | 121 | # Strict settings do not allow any tags. 122 | STRICT = { 123 | 'allowed' => {}, 124 | 'no_close' => ['img', 'br', 'hr'], 125 | 'always_close' => ['a', 'b'], 126 | 'protocol_attributes' => ['src', 'href'], 127 | 'allowed_protocols' => ['http', 'ftp', 'mailto'], 128 | 'remove_blanks' => ['a', 'b'], 129 | 'strip_comments' => true, 130 | 'always_make_tags' => true, 131 | 'allow_numbered_entities' => true, 132 | 'allowed_entities' => ['amp', 'gt', 'lt', 'quot'] 133 | } 134 | 135 | # Relaxed settings allows a great deal of HTML spec. 
#
# Here is a very comprehensive set of tags with attributes.
#
# Keys of 'allowed' are tag names; each value is the list of attribute
# names permitted on that tag. The remaining keys are filter settings
# that #initialize assigns through the writer of the same name.
#
RELAXED = {
  'allowed' => {
    'a'          => %w[class href target name id style title],
    'abbr'       => %w[class dir lang id style title],
    'acronym'    => %w[class dir lang id style title],
    'address'    => %w[class dir lang id style title],
    #'applet'    => %w[class dir lang id style title],
    'area'       => %w[shape coords type nohref href class id style title],
    'b'          => %w[class id style title],
    'base'       => %w[target type href],   # NO class, id, style, title
    'basefont'   => %w[color face size],    # NO class, id, style, title
    'bdo'        => %w[class dir lang id style title],
    'bgsound'    => %w[loop src],
    'big'        => %w[class dir lang id style title],
    'blockquote' => %w[class id style title],
    'body'       => %w[background bgcolor text link vlink class id style title],
    'button'     => %w[disabled name type value accesskey class id style title],
    'br'         => %w[clear class id style title],   # <br> or <br />
    'caption'    => %w[class align valign id style title],
    'center'     => %w[class id style title],
    'cite'       => %w[class id style title],
    'code'       => %w[class id style title],
    'col'        => %w[char charoff span class width align valign id style title],
    'colgroup'   => %w[char charoff span class width align valign id style title],
    'div'        => %w[class align style id title],
    'dl'         => %w[class id style title],
    'dt'         => %w[class id style title],
    'dd'         => %w[class id style title],
    'em'         => %w[class id style title],
    'frameset'   => %w[cols rows class id style title],
    'frame'      => %w[src name noresize scroll marginwidth marginheight class id style title],
    'form'       => %w[method action class id style title],
    'font'       => %w[face size color class id style title],
    'head'       => [],   # NO class, id, style, title
    'html'       => [],   # NO class, id, style, title
    'h1'         => %w[align class id style title],
    'h2'         => %w[align class id style title],
    'h3'         => %w[align class id style title],
    'h4'         => %w[align class id style title],
    'h5'         => %w[align class id style title],
    'h6'         => %w[align class id style title],
    'hr'         => %w[width size noshade class id style title],   # <hr> or <hr />
    'i'          => %w[class id style title],
    'iframe'     => %w[src name noresize scroll marginwidth marginheight class id style title],
    # NOTE(review): ISMAP/USEMAP are upper-case here; confirm that attribute
    # names are compared case-insensitively before relying on them.
    'img'        => %w[src align width height alt border ISMAP class USEMAP id style title],
    'input'      => %w[name type class id style title],
    'li'         => %w[type start class id style title],
    'link'       => %w[rel type href class id style title],
    'map'        => %w[name class id style title],
    'meta'       => %w[http-equiv content name],   # NO class, id, style, title
    'noframes'   => [],
    'option'     => %w[class id style title],
    'ol'         => %w[type start class id style title],
    'p'          => %w[align class id style title],
    'param'      => [],   # NO class, id, style, title
    'pre'        => %w[class id style title],
    's'          => %w[class id style title],
    'select'     => %w[name size class id style title],
    #'script'    => '',   # not this for sure
    'span'       => %w[class id style title],
    'strong'     => %w[class id style title],
    'style'      => %w[type],   # NO class, id, style, title
    'table'      => %w[class border width height cellpadding cellspacing bgcolor background id style title],
    'tbody'      => %w[class align valign id style title],
    'td'         => %w[class nowrap width align valign colspan rowspan bgcolor id style title],
    'textarea'   => %w[name rows cols class id style title],
    'tfoot'      => %w[class align valign id style title],
    'th'         => %w[class nowrap width align valign colspan rowspan bgcolor id style title],
    'thead'      => %w[class align valign id style title],
    'title'      => [],   # NO class, id, style, title
    'tr'         => %w[class align valign bgcolor id style title],
    'tt'         => %w[class id style title],
    'u'          => %w[class id style title],
    'ul'         => %w[type class id style title],
  },
  #'body', 'div', 'span', 'br', 'hr', 'p', 'b', 'i', 'tt', 'h1', 'h2', 'h3',
  #'h4', 'h5', 'h6', 'font', 'blockquote', 'ul', 'ol', 'li', 'dl', 'dt', 'dd',
  #'a', 'img', 'map', 'area', 'table', 'tr', 'td', 'th', 'thead', 'tfoot',
  #'tbody', 'caption', 'frameset', 'frame', 'noframes', 'form', 'input',
  #'select', 'option', 'textarea', 'link', 'col', 'colgroup', 'u', 's',
  #'strong', 'em', 'base', 'html', 'head', 'title', 'param', 'script', 'meta',
  #'style'
  'no_close'                => %w[img br hr],
  'always_close'            => %w[a b],
  'protocol_attributes'     => %w[src href],
  'allowed_protocols'       => %w[http ftp mailto https sftp],
  'remove_blanks'           => %w[a b],
  'strip_comments'          => false,   # comments?
  'always_make_tags'        => false,
  'allow_numbered_entities' => true,
  # The list repeats a few names; uniq keeps the first occurrence of each.
  'allowed_entities'        => %w[
    amp cent copy deg gt lt nbsp #174 #153 pound ndash #8211 mdash #8212
    iexcl #161 iquest #191 quot #34 ldquo #8220 rdquo #8221 #39
    lsquo #8216 rsquo #8217 laquo raquo #171 #187 nbsp #160 amp #38
    cent #162 copy #169 divide #247 gt #62 lt #60 micro #181 middot
    para #182 plusmn euro #8364 pound #163 reg #174 sect #167
    trade #153 yen #165
    aacute Aacute #225 #193 agrave Agrave #224 #192
    acirc Acirc #226 #194 aring Aring #229 #197
    atilde Atilde #227 #195 auml Auml #228 #196
    aelig AElig #230 #198 ccedil Ccedil #231 #199
    eacute Eacute #233 #201 egrave Egrave #232 #200
    ecirc Ecirc #234 #202 euml Euml #235 #203
    iacute Iacute #237 #205 igrave Igrave #236 #204
    icirc Icirc #238 #206 iuml Iuml #239 #207
    ntilde Ntilde #241 #209 oacute Oacute #243 #211
    ograve Ograve #242 #210 ocirc Ocirc #244 #212
    oslash Oslash #248 #216 otilde Otilde #245 #213
    ouml Ouml #246 #214 szlig #223
    uacute Uacute #250 #218 ugrave Ugrave #249 #217
    ucirc Ucirc #251 #219 uuml Uuml #252 #220
    yuml #255 #180 #96
  ].uniq
}


# New html filter.
#
# Provide custom +options+, or use one of the built-in options
# constants.
#
#   hf = HTMLFilter.new(HTMLFilter::RELAXED)
#   hf.filter(htmlstr)
#
# Given options are layered over a copy of DEFAULT (keys may be strings
# or symbols), and every resulting key is assigned through the writer
# method of the same name.
#
def initialize(options=nil)
  settings = DEFAULT.dup
  (options || {}).each { |key, value| settings[key.to_s] = value }
  settings.each { |key, value| send("#{key}=", value) }
end

# Filter html string.
249 | # 250 | def filter(html) 251 | @tag_counts = {} 252 | html = escape_comments(html) 253 | html = balance_html(html) 254 | html = check_tags(html) 255 | html = process_remove_blanks(html) 256 | html = validate_entities(html) 257 | #html = truncate_html(html) 258 | html 259 | end 260 | 261 | private 262 | 263 | # 264 | # internal tag counter 265 | # 266 | 267 | def tag_counts ; @tag_counts; end 268 | 269 | # 270 | # 271 | # 272 | 273 | def escape_comments(data) 274 | data = data.gsub(//s) do 275 | '' 276 | end 277 | 278 | return data 279 | end 280 | 281 | # 282 | # 283 | # 284 | 285 | def balance_html(data) 286 | data = data.dup 287 | 288 | if always_make_tags 289 | # try and form html 290 | data.gsub!(/>>+/, '>') 291 | data.gsub!(/<<+/, '<') 292 | data.gsub!(/^>/, '') 293 | data.gsub!(/<([^>]*?)(?=<|$)/, '<\1>') 294 | data.gsub!(/(^|>)([^<]*?)(?=>)/, '\1<\2') 295 | else 296 | # escape stray brackets 297 | data.gsub!(/<([^>]*?)(?=<|$)/, '<\1') 298 | data.gsub!(/(^|>)([^<]*?)(?=>)/, '\1\2><') 299 | # the last regexp causes '<>' entities to appear 300 | # (we need to do a lookahead assertion so that the last bracket 301 | # can be used in the next pass of the regexp) 302 | data.gsub!('<>', '') 303 | end 304 | 305 | return data 306 | end 307 | 308 | # 309 | # 310 | # 311 | 312 | def check_tags(data) 313 | data = data.dup 314 | 315 | data.gsub!(/<(.*?)>/s){ 316 | process_tag(strip_single($1)) 317 | } 318 | 319 | tag_counts.each do |tag, cnt| 320 | cnt.times{ data << "" } 321 | end 322 | 323 | return data 324 | end 325 | 326 | # 327 | # 328 | # 329 | 330 | def process_tag(data) 331 | 332 | # ending tags 333 | 334 | re = /^\/([a-z0-9]+)/si 335 | 336 | if matches = re.match(data) 337 | name = matches[1].downcase 338 | if allowed.key?(name) 339 | unless no_close.include?(name) 340 | if tag_counts[name] 341 | tag_counts[name] -= 1 342 | return "" 343 | end 344 | end 345 | else 346 | return '' 347 | end 348 | end 349 | 350 | # starting tags 351 | 352 | re = 
/^([a-z0-9]+)(.*?)(\/?)$/si 353 | 354 | if matches = re.match(data) 355 | name = matches[1].downcase 356 | body = matches[2] 357 | ending = matches[3] 358 | 359 | if allowed.key?(name) 360 | params = "" 361 | 362 | matches_2 = body.scan(/([a-z0-9]+)=(["'])(.*?)\2/si) # 363 | matches_1 = body.scan(/([a-z0-9]+)(=)([^"\s']+)/si) # 364 | matches_3 = body.scan(/([a-z0-9]+)=(["'])([^"']*?)\s*$/si) # ' 395 | else 396 | return '' 397 | end 398 | end 399 | 400 | # comments 401 | if /^!--(.*)--$/si =~ data 402 | if strip_comments 403 | return '' 404 | else 405 | return '<' + data + '>' 406 | end 407 | end 408 | 409 | # garbage, ignore it 410 | return '' 411 | end 412 | 413 | # 414 | # 415 | # 416 | 417 | def process_param_protocol(data) 418 | data = decode_entities(data) 419 | 420 | re = /^([^:]+)\:/si 421 | 422 | if matches = re.match(data) 423 | unless allowed_protocols.include?(matches[1]) 424 | #data = '#'.substr(data, strlen(matches[1])+1) 425 | data = '#' + data[0..matches[1].size+1] 426 | end 427 | end 428 | 429 | return data 430 | end 431 | 432 | # 433 | # 434 | # 435 | 436 | def process_remove_blanks(data) 437 | data = data.dup 438 | 439 | remove_blanks.each do |tag| 440 | data.gsub!(/<#{tag}(\s[^>]*)?><\/#{tag}>/, '') 441 | data.gsub!(/<#{tag}(\s[^>]*)?\/>/, '') 442 | end 443 | 444 | return data 445 | end 446 | 447 | # 448 | # 449 | # 450 | 451 | def fix_case(data) 452 | data_notags = strip_tags(data) 453 | data_notags = data_notags.gsub(/[^a-zA-Z]/, '') 454 | 455 | if data_notags.size < 5 456 | return data 457 | end 458 | 459 | if /[a-z]/ =~ data_notags 460 | return data 461 | end 462 | 463 | data = data.gsub(/(>|^)([^<]+?)(<|$)/s){ 464 | strip_single($1) + 465 | fix_case_inner(strip_single($2)) + 466 | strip_single($3) 467 | } 468 | 469 | return data 470 | end 471 | 472 | # 473 | # 474 | # 475 | 476 | def fix_case_inner(data) 477 | data = data.dup 478 | 479 | data.downcase! 
480 | 481 | data.gsub!(/(^|[^\w\s\';,\\-])(\s*)([a-z])/){ 482 | strip_single("#{$1}#{$2}") + strip_single($3).upcase 483 | } 484 | 485 | return data 486 | end 487 | 488 | # 489 | # 490 | # 491 | 492 | def validate_entities(data) 493 | data = data.dup 494 | 495 | # validate entities throughout the string 496 | data.gsub!(%r!&([^&;]*)(?=(;|&|$))!){ 497 | check_entity(strip_single($1), strip_single($2)) 498 | } 499 | 500 | # validate quotes outside of tags 501 | data.gsub!(/(>|^)([^<]+?)(<|$)/s){ 502 | m1, m2, m3 = $1, $2, $3 503 | strip_single(m1) + 504 | strip_single(m2).gsub('\"', '"') + 505 | strip_single(m3) 506 | } 507 | 508 | return data 509 | end 510 | 511 | # 512 | # 513 | # 514 | 515 | def check_entity(preamble, term) 516 | if term != ';' 517 | return '&' + preamble 518 | end 519 | 520 | if is_valid_entity(preamble) 521 | return '&' + preamble 522 | end 523 | 524 | return '&' + preamble 525 | end 526 | 527 | # 528 | # 529 | # 530 | 531 | def is_valid_entity(entity) 532 | re = /^#([0-9]+)$/i 533 | 534 | if md = re.match(entity) 535 | if (md[1].to_i > 127) 536 | return true 537 | end 538 | return allow_numbered_entities 539 | end 540 | 541 | if allowed_entities.include?(entity) 542 | return true 543 | end 544 | 545 | return nil 546 | end 547 | 548 | # within attributes, we want to convert all hex/dec/url 549 | # escape sequences into their raw characters so that we can 550 | # check we don't get stray quotes/brackets inside strings. 

# Decodes decimal (&#NN;), hexadecimal (&#xHH;) and URL-style (%HH)
# escape sequences in +data+, then re-validates the remaining entities.
#
# @param data [String] attribute value to decode
# @return [String] a new string; +data+ itself is not modified
def decode_entities(data)
  decoded = data.dup

  decoded.gsub!(/(&)#(\d+);?/) { decode_dec_entity($1, $2) }
  decoded.gsub!(/(&)#x([0-9a-f]+);?/i) { decode_hex_entity($1, $2) }
  decoded.gsub!(/(%)([0-9a-f]{2});?/i) { decode_hex_entity($1, $2) }

  validate_entities(decoded)
end

# Decodes one hexadecimal escape.
#
# Called as (type, digits), where type is '&' or '%'. Only the last two
# arguments are read, so a leading full-match argument is tolerated.
#
# @return [String] replacement text for the escape
def decode_hex_entity(*m)
  orig_type, digits = m.last(2)
  # String#hex parses base 16; the value is handed on as an Integer.
  decode_num_entity(orig_type, digits.hex)
end

# Decodes one decimal escape.
#
# Called as (type, digits); only the last two arguments are read.
#
# @return [String] replacement text for the escape
def decode_dec_entity(*m)
  orig_type, digits = m.last(2)
  decode_num_entity(orig_type, digits)
end

# Turns a numeric character code back into text.
#
# @param orig_type [String] '%' for a URL escape, '&' for an entity
# @param d [Integer, String] character code (converted with to_i)
# @return [String] codes above 127 are re-emitted in their original
#   notation; anything else becomes the raw character, passed through
#   escape_special_chars
def decode_num_entity(orig_type, d)
  code = d.to_i
  code = 32 if code < 0 # space

  # don't mess with high chars
  if code > 127
    return '%' + code.to_s(16) if orig_type == '%'
    # "##" is deliberate: a literal '#' followed by the interpolation.
    return "&##{code};" if orig_type == '&'
  end

  escape_special_chars(code.chr)
end

# Undoes backslash escaping: a backslash-quote pair becomes a plain
# double quote and a backslash-zero pair becomes a NUL character.
#
# @param data [String]
# @return [String] a new string
def strip_single(data)
  data.gsub('\"', '"').gsub('\0', 0.chr)
end

# Certain characters have special significance in HTML, and
# should be represented by HTML entities if they are to
# preserve their meanings. This function returns a string
# with some of these conversions made; the translations made
# are those most useful for everyday web programming.
610 | 611 | def escape_special_chars(data) 612 | data = data.dup 613 | data.gsub!( /&/n , '&' ) 614 | data.gsub!( /\"/n , '"' ) 615 | data.gsub!( />/n , '>' ) 616 | data.gsub!( /} 623 | # 624 | ## HTML tag regular expression 625 | #TAG_RE = %r{\s]+))?)+\s*|\s*)/?>} #' 626 | # 627 | ## 628 | #def truncate_html(html) 629 | # return html unless truncate 630 | # # default settings 631 | # limit = truncate 632 | # 633 | # mask = html.gsub(REM_RE){ |m| "\0" * m.size } 634 | # mask = mask.gsub(TAG_RE){ |m| "\0" * m.size } 635 | # 636 | # i, x = 0, 0 637 | # 638 | # while i < mask.size && x < limit 639 | # x += 1 if mask[i] != "\0" 640 | # i += 1 641 | # end 642 | # 643 | # while x > 0 && mask[x,1] == "\0" 644 | # x -= 1 645 | # end 646 | # 647 | # return html[0..x] 648 | #end 649 | 650 | # Simple Command line interface for HTMLFilter. 651 | # 652 | # It can be configured via a YAML file. 653 | # 654 | class CLI 655 | def self.run 656 | new.run 657 | end 658 | 659 | attr_reader :config_file 660 | 661 | attr_reader :options 662 | 663 | def initialize 664 | require 'optparse' 665 | @config_file = nil 666 | @options = {} 667 | end 668 | 669 | def parser 670 | OptionParser.new do |opt| 671 | opt.on('--config ', 'filter with custom configuration'){ |file| @config_file = file } 672 | opt.on('--debug', 'run in debug mode to see error details'){ $DEBUG = true } 673 | end 674 | end 675 | 676 | def options 677 | if config_file 678 | raise "configuration file not found" unless File.exist?(config_file) 679 | @options = YAML.load_file(config_file) 680 | end 681 | end 682 | 683 | def run 684 | parser.parse! 
685 | begin 686 | files = ARGV 687 | files.each do |f| 688 | raise "htmlfilter: file not found -- #{f}" unless File.exist?(f) 689 | end 690 | files.each do |file| 691 | html = File.read(file) 692 | puts HTMLFilter.new(options).filter(html) 693 | end 694 | rescue => error 695 | raise error if $DEBUG 696 | $stderr.puts error 697 | end 698 | end 699 | end 700 | 701 | end 702 | 703 | # Overload the standard String class for extra convienience. 704 | 705 | class String 706 | def html_filter(*opts) 707 | HTMLFilter.new(*opts).filter(self) 708 | end 709 | end 710 | 711 | --------------------------------------------------------------------------------