├── .gemtest ├── test ├── data │ ├── news │ │ ├── ryzom-naissance-du-projet-libre-ryzom-forge.md │ │ └── index.html │ ├── validator.nu-success.json │ ├── assets │ │ └── application-92f19110a9d47a56d2ebe744e15af301.css │ ├── validator.nu-failure.json │ ├── xhtml1-strict.html │ ├── html5.html │ ├── html5-fail.html │ └── html4-strict.html ├── core_test.rb ├── test_helper.rb ├── example │ └── ruby smalltalk │ │ └── blockcamp-paris-le-28-novembre.html ├── webmock_helper.rb ├── static_test.rb ├── validator_test.rb └── crawler_test.rb ├── validate-website.png ├── lib ├── validate_website.rb └── validate_website │ ├── version.rb │ ├── validator_class_methods.rb │ ├── colorful_messages.rb │ ├── runner.rb │ ├── static_link.rb │ ├── utils.rb │ ├── static.rb │ ├── crawl.rb │ ├── validator.rb │ ├── core.rb │ └── option_parser.rb ├── .gitignore ├── Gemfile ├── bin ├── validate-website └── validate-website-static ├── .rubocop.yml ├── data └── schemas │ ├── xhtml2.xsd │ ├── xhtml-inlstyle-1.xsd │ ├── frameset.dtd │ ├── xhtml-inputmode-1.xsd │ ├── xhtml-copyright-1.xsd │ ├── xml-events-copyright-1.xsd │ ├── xml-events-copyright-2.xsd │ ├── xhtml-base-1.xsd │ ├── xhtml-charent-1.xsd │ ├── xhtml-metaAttributes-1.xsd │ ├── xhtml-ssismap-1.xsd │ ├── xhtml-target-1.xsd │ ├── xhtml-nameident-1.xsd │ ├── xml-events-attribs-1.xsd │ ├── xml-events-1.xsd │ ├── xml-events-2.xsd │ ├── xml-events-attribs-2.xsd │ ├── xhtml-ruby-basic-1.xsd │ ├── xhtml-notations-1.xsd │ ├── xhtml-special.ent │ ├── xml-handlers-1.xsd │ ├── xhtml-events-1.xsd │ ├── xframes-1.xsd │ ├── xhtml-datatypes-1.xsd │ ├── xhtml-basic11.dtd │ ├── xml.xsd │ ├── xhtml-lat1.ent │ └── xhtml-symbol.ent ├── .github └── workflows │ └── ci.yml ├── Rakefile ├── .gitlab-ci.yml ├── LICENSE ├── validate-website.gemspec ├── doc ├── validate-website-static.adoc └── validate-website.adoc ├── man └── man1 │ ├── validate-website-static.1 │ └── validate-website.1 ├── README.md └── History.md /.gemtest: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/data/news/ryzom-naissance-du-projet-libre-ryzom-forge.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /validate-website.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spk/validate-website/HEAD/validate-website.png -------------------------------------------------------------------------------- /test/data/validator.nu-success.json: -------------------------------------------------------------------------------- 1 | {"url":"https://example.org/","messages":[],"language":"fr"} 2 | -------------------------------------------------------------------------------- /lib/validate_website.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'validate_website/core' 4 | require 'validate_website/version' 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pkg 2 | tags 3 | Gemfile.lock 4 | *.gem 5 | man/man1/validate-website-static.xml 6 | man/man1/validate-website.xml 7 | coverage 8 | -------------------------------------------------------------------------------- /test/data/assets/application-92f19110a9d47a56d2ebe744e15af301.css: -------------------------------------------------------------------------------- 1 | .t { background-image: url(/image/42.png) } 2 | /**/ .foo {} #{bar {} 3 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source 
'https://rubygems.org' 4 | 5 | gemspec 6 | 7 | gem 'simplecov', require: false 8 | 9 | # vim: syntax=ruby filetype=ruby 10 | -------------------------------------------------------------------------------- /bin/validate-website: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'validate_website/runner' 5 | exit_status = ValidateWebsite::Runner.run_crawl(ARGV) 6 | exit(exit_status) 7 | -------------------------------------------------------------------------------- /bin/validate-website-static: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'validate_website/runner' 5 | exit_status = ValidateWebsite::Runner.run_static(ARGV) 6 | exit(exit_status) 7 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | --- 2 | AllCops: 3 | TargetRubyVersion: 2.7 4 | Naming/FileName: 5 | Enabled: false 6 | Lint/MissingCopEnableDirective: 7 | Enabled: false 8 | Lint/InterpolationCheck: 9 | Enabled: false 10 | Lint/UriEscapeUnescape: 11 | Enabled: false 12 | -------------------------------------------------------------------------------- /lib/validate_website/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # Version file for ValidateWebsite 4 | module ValidateWebsite 5 | VERSION = '1.12.0' 6 | 7 | def self.jruby? 
# :nodoc: 8 | defined?(RUBY_ENGINE) && RUBY_ENGINE == 'jruby' 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /test/core_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require File.expand_path('test_helper', __dir__) 4 | 5 | describe ValidateWebsite::Core do 6 | describe 'invalid options' do 7 | it 'raise ArgumentError on wrong validation_type' do 8 | _(proc { ValidateWebsite::Core.new({ color: false }, :fail) }) 9 | .must_raise ArgumentError 10 | end 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | begin 4 | require 'simplecov' 5 | SimpleCov.start 6 | rescue LoadError 7 | warn 'simplecov not loaded' 8 | end 9 | 10 | require 'minitest/autorun' 11 | require 'spidr' 12 | 13 | require 'validate_website/core' 14 | 15 | require File.expand_path('webmock_helper', __dir__) 16 | 17 | TEST_DOMAIN = 'http://www.example.com/' 18 | ENV['LC_ALL'] = 'C.UTF-8' if defined?(RUBY_ENGINE) && RUBY_ENGINE == 'jruby' 19 | -------------------------------------------------------------------------------- /test/data/news/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | title 6 | 7 | 8 | 9 | 10 | 11 |
12 | 17 |
18 | 19 | 20 | -------------------------------------------------------------------------------- /lib/validate_website/validator_class_methods.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'tidy_ffi' 4 | 5 | # Validator Class Methods 6 | module ValidatorClassMethods 7 | def validator_uri 8 | @validator_uri ||= 9 | ENV['VALIDATOR_NU_URL'] || @html5_validator_service_url 10 | end 11 | 12 | def tidy 13 | return @tidy if defined?(@tidy) 14 | 15 | @lib_tidy = TidyFFI::LibTidy 16 | @tidy = TidyFFI::Tidy 17 | rescue TidyFFI::LibTidyNotInstalled 18 | @tidy = nil 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /test/data/validator.nu-failure.json: -------------------------------------------------------------------------------- 1 | {"url":"https://www.rust-lang.org/en-US/","messages":[{"type":"error","lastLine":113,"lastColumn":6,"firstColumn":1,"message":"End tag “pre” seen, but there were open elements.","extract":"}\n }\n}\n\n\"Hola\", , \"Hola\"","hiliteStart":10,"hiliteLength":21}]} 2 | -------------------------------------------------------------------------------- /data/schemas/xhtml2.xsd: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | A minimal XML Schema for XHTML 2.0 7 | $Id: xhtml2.xsd,v 1.4 2005/06/14 15:28:27 mimasa Exp $ 8 | 9 | 10 | 11 | 12 | 13 | 14 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: CI 3 | 4 | on: 5 | push: 6 | branches: [master] 7 | pull_request: 8 | branches: [master] 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | ruby: ['2.7', '3.0', '3.1', jruby-9.4] 17 | steps: 18 | - uses: actions/checkout@v2 
19 | - name: Set up Ruby 20 | uses: ruby/setup-ruby@v1 21 | with: 22 | bundler-cache: true 23 | ruby-version: ${{ matrix.ruby }} 24 | - name: Install libtidy-dev 25 | run: sudo apt-get install libtidy-dev 26 | - name: Install dependencies 27 | run: bundle install -j 3 28 | - name: Run tests 29 | run: bundle exec rake 30 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'rake/testtask' 4 | require 'rubocop/rake_task' 5 | require 'asciidoctor' 6 | 7 | default = %i[test rubocop] 8 | task default: default 9 | 10 | desc 'Update manpage from asciidoc file' 11 | task :manpage do 12 | Dir.glob('doc/*.adoc').each do |adoc| 13 | Asciidoctor.convert_file adoc, to_file: true, 14 | backend: 'manpage', 15 | to_dir: 'man/man1' 16 | end 17 | end 18 | 19 | Rake::TestTask.new do |t| 20 | t.pattern = 'test/**/*_test.rb' 21 | end 22 | task spec: :test 23 | 24 | desc 'Execute rubocop' 25 | RuboCop::RakeTask.new(:rubocop) do |t| 26 | t.options = ['--display-cop-names', '--display-style-guide'] 27 | t.fail_on_error = true 28 | end 29 | -------------------------------------------------------------------------------- /lib/validate_website/colorful_messages.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'paint' 4 | 5 | module ValidateWebsite 6 | # Internal helper for colorful messages 7 | module ColorfulMessages 8 | def color(type, message, colored = true) 9 | return message unless colored 10 | 11 | send(type, message) 12 | end 13 | 14 | def error(message) 15 | Paint[message, :red] 16 | end 17 | 18 | def warning(message) 19 | Paint[message, :yellow] 20 | end 21 | 22 | def success(message) 23 | Paint[message, :green] 24 | end 25 | 26 | alias message success 27 | 28 | def note(message) 29 | Paint[message, :magenta] 30 | end 31 | 32 | 
def info(message) 33 | Paint[message, :blue] 34 | end 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /lib/validate_website/runner.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'validate_website/core' 4 | 5 | module ValidateWebsite 6 | # Runner for command line use and clean exit on ctrl-c 7 | class Runner 8 | def self.trap_interrupt 9 | trap('INT') do 10 | warn "\nExiting..." 11 | exit!(1) 12 | end 13 | end 14 | 15 | def self.run_crawl(args) 16 | trap_interrupt 17 | validate_website = ValidateWebsite::Crawl.new(args) 18 | validate_website.crawl 19 | validate_website.exit_status 20 | end 21 | 22 | def self.run_static(args) 23 | trap_interrupt 24 | validate_website = ValidateWebsite::Static.new(args) 25 | validate_website.crawl 26 | validate_website.exit_status 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | --- 2 | stages: 3 | - test 4 | - coverage 5 | 6 | default: 7 | before_script: 8 | - apt-get update -qy 9 | - apt-get install -y libtidy-dev 10 | - ruby -v 11 | - which ruby 12 | - gem install bundler --no-document 13 | - bundle install --jobs $(nproc) "${FLAGS[@]}" 14 | 15 | .tests: 16 | script: 17 | - bundle exec rake 18 | stage: test 19 | 20 | test:2.7: 21 | extends: .tests 22 | image: 'ruby:2.7' 23 | 24 | test:3.0: 25 | extends: .tests 26 | image: 'ruby:3.0' 27 | 28 | test:3.1: 29 | extends: .tests 30 | image: 'ruby:3.1' 31 | 32 | test:jruby: 33 | extends: .tests 34 | image: 'jruby:9.4-jre' 35 | 36 | coverage: 37 | image: 'ruby:3.1' 38 | script: 39 | - bundle exec rake test 40 | artifacts: 41 | paths: 42 | - coverage/ 43 | stage: coverage 44 | -------------------------------------------------------------------------------- /test/data/xhtml1-strict.html: 
-------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | title 7 | 8 | 9 |

Title 1

10 |

Paragraphe.

11 | 12 |

Title 2

13 | 18 |

non local img

19 |

local img with absolute uri

20 |

local img with non absolute uri

21 | 22 | 23 | -------------------------------------------------------------------------------- /test/example/ruby smalltalk/blockcamp-paris-le-28-novembre.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | title 7 | 8 | 9 |

Title 1

10 |

Paragraphe.

11 | 12 |

Title 2

13 | 18 |

non local img

19 |

local img with absolute uri

20 |

local img with non absolute uri

21 | 22 | 23 | -------------------------------------------------------------------------------- /data/schemas/xhtml-inlstyle-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Inline Style module 7 | This is the XML Schema Inline Style module for XHTML 8 | 9 | * styloe attribute 10 | 11 | This module declares the 'style' attribute, used to support inline 12 | style markup. 13 | 14 | $Id: xhtml-inlstyle-1.xsd,v 1.2 2005/09/26 22:54:53 ahby Exp $ 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /test/webmock_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'webmock/minitest' 4 | 5 | # FakePage html helper for webmock 6 | class FakePage 7 | include WebMock::API 8 | 9 | attr_accessor :links 10 | attr_accessor :hrefs 11 | attr_accessor :body 12 | 13 | def initialize(name = '', options = {}) 14 | @name = name 15 | @links = [options[:links]].flatten if options.key?(:links) 16 | @hrefs = [options[:hrefs]].flatten if options.key?(:hrefs) 17 | @content_type = options[:content_type] || 'text/html' 18 | @body = options[:body] 19 | 20 | create_body unless @body 21 | add_to_webmock 22 | end 23 | 24 | def url 25 | TEST_DOMAIN + @name 26 | end 27 | 28 | private 29 | 30 | def create_body 31 | @body = '' 32 | @links&.each { |l| @body += "" } 33 | @hrefs&.each { |h| @body += "" } 34 | @body += '' 35 | end 36 | 37 | def add_to_webmock 38 | options = { body: @body, headers: { 'Content-Type' => @content_type } } 39 | stub_request(:get, url).to_return(options) 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2009-2022 Laurent Arnoud 4 | 5 | Permission is hereby granted, free of 
charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | 'Software'), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /data/schemas/frameset.dtd: -------------------------------------------------------------------------------- 1 | 20 | 25 | 26 | 27 | ... 28 | 29 | 30 | ... 31 | 32 | 33 | --> 34 | 35 | 36 | 37 | %HTML4.dtd; -------------------------------------------------------------------------------- /test/data/html5.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | title 6 | 7 | 8 | 9 | 10 | 11 |
12 | 19 |
20 | 21 |
22 |
23 |

article title

24 |

texte de my article

25 |
26 |

article subtitle

27 |

text

28 |
29 | my image 30 |
31 |
32 |
33 |
34 | 35 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /data/schemas/xhtml-inputmode-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 7 | 9 | 10 | 11 | This is the XML Schema inputmode module for XHTML 12 | $Id: xhtml-inputmode-1.xsd,v 1.1 2008/05/29 19:43:21 smccarro Exp $ 13 | 14 | 15 | 16 | 17 | 18 | 19 | InputMode 20 | 21 | * inputmode 22 | 23 | This module declares the 'inputmode' attribute used for giving hints about how to deal with input 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /data/schemas/xhtml-copyright-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 7 | 9 | 10 | 11 | This is XHTML, a reformulation of HTML as a modular XML application 12 | The Extensible HyperText Markup Language (XHTML) 13 | Copyright ©1998-2005 World Wide Web Consortium 14 | (Massachusetts Institute of Technology, European Research Consortium 15 | for Informatics and Mathematics, Keio University). 16 | All Rights Reserved. 17 | 18 | Permission to use, copy, modify and distribute the XHTML Schema 19 | modules and their accompanying xs:documentation for any purpose 20 | and without fee is hereby granted in perpetuity, provided that the above 21 | copyright notice and this paragraph appear in all copies. 22 | The copyright holders make no representation about the suitability of 23 | these XML Schema modules for any purpose. 24 | 25 | They are provided "as is" without expressed or implied warranty. 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /data/schemas/xml-events-copyright-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 10 | 11 | 12 | 13 | This is XML Events, a generalized event model for XML-based 14 | markup languages. 
15 | 16 | Copyright 2001-2003 World Wide Web Consortium 17 | (Massachusetts Institute of Technology, European Research 18 | Consortium for Informatics and Mathematics, Keio University). 19 | All Rights Reserved. 20 | 21 | Permission to use, copy, modify and distribute the 22 | XML Events Schema modules and their accompanying xs:documentation 23 | for any purpose and without fee is hereby granted in perpetuity, 24 | provided that the above copyright notice and this paragraph appear 25 | in all copies. 26 | 27 | The copyright holders make no representation about the suitability of 28 | these XML Schema modules for any purpose. 29 | 30 | They are provided "as is" without expressed or implied warranty. 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /data/schemas/xml-events-copyright-2.xsd: -------------------------------------------------------------------------------- 1 | 2 | 10 | 11 | 12 | 13 | This is XML Events, a generalized event model for XML-based 14 | markup languages. 15 | 16 | Copyright 2001-2007 World Wide Web Consortium 17 | (Massachusetts Institute of Technology, European Research 18 | Consortium for Informatics and Mathematics, Keio University). 19 | All Rights Reserved. 20 | 21 | Permission to use, copy, modify and distribute the 22 | XML Events Schema modules and their accompanying xs:documentation 23 | for any purpose and without fee is hereby granted in perpetuity, 24 | provided that the above copyright notice and this paragraph appear 25 | in all copies. 26 | 27 | The copyright holders make no representation about the suitability of 28 | these XML Schema modules for any purpose. 29 | 30 | They are provided "as is" without expressed or implied warranty. 
31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /data/schemas/xhtml-base-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Base element 8 | This is the XML Schema Base Element module for XHTML 9 | 10 | * base 11 | 12 | This module declares the base element type and its attributes, 13 | used to define a base URI against which relative URIs in the 14 | document will be resolved. 15 | 16 | $Id: xhtml-base-1.xsd,v 1.2 2005/09/26 22:54:53 ahby Exp $ 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /lib/validate_website/static_link.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'uri' 4 | require 'validate_website/utils' 5 | require 'validate_website/static' 6 | require 'spidr' 7 | 8 | # rubocop:disable Metrics/BlockLength 9 | StaticLink = Struct.new(:link, :site) do 10 | def link_uri 11 | @link_uri = URI.parse(WEBrick::HTTPUtils.escape(link)) 12 | @link_uri = URI.join(site, @link_uri) if @link_uri.host.nil? 13 | @link_uri 14 | end 15 | 16 | def in_static_domain? 17 | URI.parse(site).host == link_uri.host 18 | end 19 | 20 | def content_types 21 | if css? 
22 | ['text/css'] 23 | else 24 | ValidateWebsite::Static::CONTENT_TYPES 25 | end 26 | end 27 | 28 | def body 29 | if File.exist?(link) 30 | File.open(link).read 31 | else 32 | File.open(file_path).read 33 | end 34 | end 35 | 36 | def response 37 | @response ||= ValidateWebsite::Static.fake_httpresponse( 38 | body, 39 | content_types 40 | ) 41 | end 42 | 43 | def page 44 | @page ||= Spidr::Page.new(link_uri, response) 45 | end 46 | 47 | def extract_urls_from_fake_css_response 48 | ValidateWebsite::Utils.extract_urls_from_css(page) 49 | end 50 | 51 | def file_path 52 | @file_path ||= URI.parse( 53 | File.join(Dir.getwd, link_uri.path || '/') 54 | ).path 55 | end 56 | 57 | def extname 58 | @extname ||= File.extname(file_path) 59 | end 60 | 61 | def css? 62 | extname == '.css' 63 | end 64 | 65 | def check? 66 | !link.include?('#') && in_static_domain? 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /data/schemas/xhtml-charent-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 5 | 7 | 10 | %HTMLlat1; 11 | 12 | 15 | %HTMLsymbol; 16 | 17 | 20 | %HTMLspecial; 21 | ]> 22 | 24 | 25 | 26 | Character Entities for XHTML 27 | This is the XML Schema Character Entities module for XHTML 28 | 29 | This module declares the set of character entities for XHTML, 30 | including the Latin 1, Symbol and Special character collections. 31 | XML Schema does not support Entities, hence Entities are enable 32 | through an Internal DTD Subset. 
33 | 34 | $Id: xhtml-charent-1.xsd,v 1.3 2005/09/26 22:54:53 ahby Exp $ 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /data/schemas/xhtml-metaAttributes-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | This is the XML Schema Metainformation Attributes module for XHTML 7 | 8 | $Id: xhtml-metaAttributes-1.xsd,v 1.6 2008/07/05 04:15:30 smccarro Exp $ 9 | 10 | 11 | 12 | 13 | 14 | 15 | XHTML Metainformation Attributes 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /lib/validate_website/utils.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # Base module ValidateWebsite 4 | module ValidateWebsite 5 | # Utils class for CSS helpers 6 | class Utils 7 | # Extract urls from CSS page 8 | # 9 | # @param [Spidr::Page] a Spidr::Page object 10 | # @return [Set] absolute urls found in the stylesheet (empty when page or its body is nil) 11 | # 12 | def self.extract_urls_from_css(page) 13 | return Set[] unless page 14 | return Set[] if page.body.nil? 15 | 16 | nodes = Crass::Parser.parse_stylesheet(page.body) 17 | extract_urls_from_nodes nodes, page 18 | end 19 | 20 | # Return urls as absolute from Crass nodes 21 | # 22 | # @param [Hash] node from Crass 23 | # @param [Spidr::Page] a Spidr::Page object 24 | # @return [Array<String>, Set] absolute urls for a url node; an empty Set for any other node 25 | def self.urls_to_absolute(node, page) 26 | if node[:node] == :function && node[:name] == 'url' || node[:node] == :url 27 | Array(node[:value]).map do |v| 28 | url = v.is_a?(String) ?
v : v[:value] 29 | page.to_absolute(url).to_s 30 | end 31 | else 32 | Set.new 33 | end 34 | end 35 | 36 | # Extract urls from Crass nodes, descending into each node's :children 37 | # @param [Array] Array of nodes from Crass 38 | # @param [Spidr::Page] a Spidr::Page object 39 | # @param [Set] memo accumulating urls across the recursive calls 40 | # @return [Set] list of urls 41 | def self.extract_urls_from_nodes(nodes, page, memo = Set[]) 42 | nodes.each_with_object(memo) do |node, result| 43 | result.merge urls_to_absolute(node, page) 44 | if node[:children] 45 | extract_urls_from_nodes node.delete(:children), page, result # NOTE: delete removes :children from the node hash 46 | end 47 | result 48 | end 49 | end 50 | end 51 | end 52 | -------------------------------------------------------------------------------- /validate-website.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require File.expand_path('lib/validate_website/version', __dir__) 4 | 5 | Gem::Specification.new do |s| 6 | s.author = 'Laurent Arnoud' 7 | s.email = 'laurent@spkdev.net' 8 | s.homepage = 'http://github.com/spk/validate-website' 9 | s.platform = Gem::Platform::RUBY 10 | s.summary = 'Web crawler for checking the validity of your documents' 11 | s.name = 'validate-website' 12 | s.version = ValidateWebsite::VERSION 13 | s.license = 'MIT' 14 | s.required_ruby_version = '>= 2.7' 15 | s.add_dependency 'crass', '~> 1' 16 | s.add_dependency 'nokogiri', '~> 1.15' 17 | s.add_dependency 'paint', '~> 2' 18 | s.add_dependency 'slop', '~> 4.6' 19 | s.add_dependency 'spidr', '~> 0.7' 20 | s.add_dependency 'tidy_ffi', '~> 1.0' 21 | s.add_dependency 'w3c_validators', '~> 1.3' 22 | s.add_dependency 'webrick', '~> 1' 23 | s.add_development_dependency 'asciidoctor', '~> 1.5' 24 | s.add_development_dependency 'minitest', '~> 5.0' 25 | s.add_development_dependency 'rake', '~> 12' 26 | s.add_development_dependency 'rubocop', '~> 0.76.0' 27 | s.add_development_dependency 'webmock', '~> 3.4' 28 | s.require_path = 'lib' 29 | s.bindir = 'bin' 30 |
s.executables << 'validate-website' 31 | s.executables << 'validate-website-static' 32 | s.files = Dir['README.md', 'Rakefile', 'LICENSE', 'History.md', 33 | 'bin', 34 | 'lib/**/*.rb', 35 | 'man/**/*', 36 | 'test/**/*', 37 | 'data/**/*'] 38 | s.description = %(validate-website is a web crawler for checking the markup \ 39 | validity with XML Schema / DTD and not found urls.) 40 | s.test_files = Dir.glob('test/**/*_test.rb') 41 | end 42 | -------------------------------------------------------------------------------- /data/schemas/xhtml-ssismap-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | This is the XML Schema Server-side Image Maps module for XHTML 7 | $Id: xhtml-ssismap-1.xsd,v 1.3 2005/09/26 22:54:53 ahby Exp $ 8 | 9 | 10 | 11 | 12 | 13 | Server-side Image Maps 14 | 15 | This adds the 'ismap' attribute to the img element to 16 | support server-side processing of a user selection. 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /data/schemas/xhtml-target-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 7 | 9 | 10 | 11 | This is the XML Schema Target module for XHTML 12 | $Id: xhtml-target-1.xsd,v 1.3 2007/04/03 18:27:01 ahby Exp $ 13 | 14 | 15 | 16 | 17 | 18 | 19 | Target 20 | 21 | * target 22 | 23 | This module declares the 'target' attribute used for opening windows 24 | 25 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /doc/validate-website-static.adoc: -------------------------------------------------------------------------------- 1 | validate-website-static(1) 2 | ========================== 3 | 4 | NAME 5 | ---- 6 | validate-website-static - check the 
validity of your documents 7 | 8 | SYNOPSIS 9 | -------- 10 | *validate-website-static* ['OPTIONS'] 11 | 12 | DESCRIPTION 13 | ----------- 14 | validate-website-static checks the markup validity of your local documents with 15 | XML Schema / DTD. 16 | HTML5 support with Validator.nu Web Service. 17 | 18 | OPTIONS 19 | ------- 20 | *-s*, *--site* 'SITE':: 21 | Where static files will be hosted (Default: http://www.example.com/) 22 | *-p*, *--pattern* 'PATTERN':: 23 | Change filenames pattern (Default: \*\*/*.html) 24 | *-e*, *--exclude* 'EXCLUDE':: 25 | Url to exclude (ex: 'redirect|news') 26 | *-i*, *--ignore* 'IGNORE':: 27 | Ignore certain validation errors (ex: 'autocorrect') 28 | *-m*, *--[no-]markup*:: 29 | Markup validation (Default: true) 30 | *--css-syntax*:: 31 | Css validation (Default: false) 32 | *-n*, *--not-found*:: 33 | Log files not on filesystem, pwd considered as root « / » (Default: false) 34 | *--[no-]color*:: 35 | Show colored output (Default: true) 36 | *-x*, *--html5-validator* 'VALIDATOR':: 37 | Change default html5 validator engine (tidy/nu/nokogiri) 38 | *-5*, *--html5-validator-service-url* 'URL':: 39 | Change default html5 validator service URL for "nu" engine 40 | *-v*, *--verbose*:: 41 | Show detail of validator errors (Default: false). 42 | *-h*, *--help*:: 43 | Show help message and exit. 44 | 45 | EXIT STATUS 46 | ----------- 47 | 0:: 48 | Markup is valid. 49 | 64:: 50 | Invalid markup found. 51 | 65:: 52 | There are pages not found. 53 | 66:: 54 | Both invalid markup and pages not found were detected.
55 | 56 | AUTHOR 57 | ------ 58 | Laurent Arnoud mailto:laurent@spkdev.net[Laurent Arnoud] 59 | 60 | LICENSE 61 | ------- 62 | The MIT License 63 | 64 | Copyright (c) 2009-2019 mailto:laurent@spkdev.net[Laurent Arnoud] 65 | 66 | // vim: set syntax=asciidoc: 67 | -------------------------------------------------------------------------------- /doc/validate-website.adoc: -------------------------------------------------------------------------------- 1 | validate-website(1) 2 | =================== 3 | 4 | NAME 5 | ---- 6 | validate-website - Web crawler for checking the validity of your documents 7 | 8 | SYNOPSIS 9 | -------- 10 | *validate-website* ['OPTIONS'] 11 | 12 | DESCRIPTION 13 | ----------- 14 | validate-website is a web crawler for checking the markup validity with XML 15 | Schema / DTD and not found urls. 16 | HTML5 support with Validator.nu Web Service. 17 | 18 | OPTIONS 19 | ------- 20 | *-s*, *--site* 'SITE':: 21 | Website to crawl (Default: http://localhost:3000/) 22 | *-u*, *--user-agent* 'USERAGENT':: 23 | Change user agent (Default: Spidr.user_agent) 24 | *-e*, *--exclude* 'EXCLUDE':: 25 | Url to exclude (ex: 'redirect|news') 26 | *-i*, *--ignore* 'IGNORE':: 27 | Ignore certain validation errors (ex: 'autocorrect') 28 | *-c*, *--cookies* 'COOKIES':: 29 | Set default cookies 30 | *-m*, *--[no-]markup*:: 31 | Markup validation (Default: true) 32 | *--css-syntax*:: 33 | Css validation (Default: false) 34 | *-n*, *--not-found*:: 35 | Log not found urls (Default: false) 36 | *--[no-]color*:: 37 | Show colored output (Default: true) 38 | *-x*, *--html5-validator* 'VALIDATOR':: 39 | Change default html5 validator engine (tidy/nu/nokogiri) 40 | *-5*, *--html5-validator-service-url* 'URL':: 41 | Change default html5 validator service URL for "nu" engine 42 | *-v*, *--verbose*:: 43 | Show detail of validator errors (Default: false). 44 | *-h*, *--help*:: 45 | Show help message and exit.
46 | 47 | EXIT STATUS 48 | ----------- 49 | 0:: 50 | Markup is valid and no 404 found. 51 | 64:: 52 | Not valid markup found. 53 | 65:: 54 | There are pages not found. 55 | 66:: 56 | There are not valid markup and pages not found. 57 | 58 | AUTHOR 59 | ------ 60 | Laurent Arnoud mailto:laurent@spkdev.net[Laurent Arnoud] 61 | 62 | LICENSE 63 | ------- 64 | The MIT License 65 | 66 | Copyright (c) 2009-2019 mailto:laurent@spkdev.net[Laurent Arnoud] 67 | 68 | // vim: set syntax=asciidoc: 69 | -------------------------------------------------------------------------------- /data/schemas/xhtml-nameident-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 7 | 9 | 10 | 11 | This is the XML Schema Name Identifier module for XHTML 12 | $Id: xhtml-nameident-1.xsd,v 1.2 2005/09/26 22:54:53 ahby Exp $ 13 | 14 | 15 | 16 | 17 | 18 | 19 | Name Identifier 20 | 21 | * 'name' attribute on form, img, a, map, applet, frame, iframe 22 | 23 | This module declares the 'name' attribute on element types when 24 | it is used as a node identifier for legacy linking and scripting 25 | support. This does not include those instances when 'name' is used 26 | as a container for form control, property or metainformation names. 27 | 28 | This module should be instantiated following all modules it modifies. 
29 | 30 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /data/schemas/xml-events-attribs-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 13 | 14 | 15 | 16 | This is the XML Schema for XML Events global attributes 17 | 18 | URI: http://www.w3.org/MarkUp/SCHEMA/xml-events-attribs-1.xsd 19 | $Id: xml-events-attribs-1.xsd,v 1.7 2004/11/22 17:09:15 ahby Exp $ 20 | 21 | 22 | 23 | 24 | 25 | 26 | XML Event Attributes 27 | 28 | These "global" event attributes are defined in "Attaching 29 | Attributes Directly to the Observer Element" of the XML 30 | Events specification. 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /data/schemas/xml-events-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 13 | 14 | 15 | 16 | This is the XML Schema for XML Events 17 | 18 | URI: http://www.w3.org/MarkUp/SCHEMA/xml-events-1.xsd 19 | $Id: xml-events-1.xsd,v 1.8 2004/11/22 17:09:15 ahby Exp $ 20 | 21 | 22 | 23 | 24 | 25 | 26 | XML Events element listener 27 | 28 | This module defines the listener element for XML Events. 29 | This element can be used to define event listeners. This 30 | module relies upon the XmlEvents.attlist attribute group 31 | defined in xml-events-attribs-1.xsd. 
32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /data/schemas/xml-events-2.xsd: -------------------------------------------------------------------------------- 1 | 2 | 13 | 14 | 15 | 16 | This is the XML Schema for XML Events 17 | 18 | URI: http://www.w3.org/MarkUp/SCHEMA/xml-events-2.xsd 19 | $Id: xml-events-2.xsd,v 1.2 2008/06/25 14:36:17 smccarro Exp $ 20 | 21 | 22 | 23 | 24 | 25 | 26 | XML Events element listener 27 | 28 | This module defines the listener element for XML Events. 29 | This element can be used to define event listeners. This 30 | module relies upon the XmlEvents.attlist attribute group 31 | defined in xml-events-attribs-2.xsd. 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /data/schemas/xml-events-attribs-2.xsd: -------------------------------------------------------------------------------- 1 | 2 | 13 | 14 | 15 | 16 | This is the XML Schema for XML Events global attributes 17 | 18 | URI: http://www.w3.org/MarkUp/SCHEMA/xml-events-attribs-2.xsd 19 | $Id: xml-events-attribs-2.xsd,v 1.2 2008/06/25 14:36:21 smccarro Exp $ 20 | 21 | 22 | 23 | 24 | 25 | 26 | XML Event Attributes 27 | 28 | These "global" event attributes are defined in "Attaching 29 | Attributes Directly to the Observer Element" of the XML 30 | Events specification. 
31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /lib/validate_website/static.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'validate_website/core' 4 | require 'validate_website/utils' 5 | 6 | module ValidateWebsite 7 | # Class for validation Static website 8 | class Static < Core 9 | CONTENT_TYPES = ['text/html', 'text/xhtml+xml'].freeze 10 | START_MESSAGE_TYPE = 'files' 11 | 12 | attr_reader :history_count 13 | 14 | def initialize(options = {}, validation_type = :static) 15 | @history_count = 0 16 | super 17 | start_message("#{START_MESSAGE_TYPE} in #{Dir.pwd} (#{@site} as site)") 18 | end 19 | 20 | # @param [Hash] options 21 | # 22 | def crawl(options = {}) 23 | @options = @options.merge(options) 24 | @site = @options[:site] 25 | 26 | files = Dir.glob(@options[:pattern]) 27 | files.each do |file| 28 | next unless File.file?(file) 29 | next if @options[:exclude]&.match(file) 30 | 31 | @history_count += 1 32 | check_static_file(file) 33 | end 34 | print_status_line(files.size, 0, @not_founds_count, @errors_count) 35 | end 36 | 37 | # Fake http response for Spidr static crawling 38 | # see https://github.com/ruby/ruby/blob/trunk/lib/net/http/response.rb 39 | # 40 | # @param [String] response body 41 | # @param [Array] content types 42 | # @return [Net::HTTPResponse] fake http response 43 | def self.fake_httpresponse(body, content_types = CONTENT_TYPES) 44 | response = Net::HTTPResponse.new '1.1', 200, 'OK' 45 | response.instance_variable_set(:@read, true) 46 | response.body = body 47 | content_types.each do |c| 48 | response.add_field('content-type', c) 49 | end 50 | response 51 | end 52 | 53 | private 54 | 55 | def 
check_static_file(file) 56 | page = StaticLink.new(file, @site).page 57 | check_page(file, page) 58 | check_css_syntax(page) if page.css? && options[:css_syntax] 59 | end 60 | 61 | def check_page(file, page) 62 | if page.html? && options[:markup] 63 | keys = %i[ignore html5_validator] 64 | slice = options.slice(*keys) 65 | validate(page.doc, page.body, file, slice) 66 | end 67 | check_static_not_found(page.links, page.url.to_s) if options[:not_found] 68 | end 69 | 70 | # check files linked on static document 71 | # see lib/validate_website/runner.rb 72 | def check_static_not_found(links, site = @site) 73 | static_links = links.map { |l| StaticLink.new(l, site) } 74 | static_links.each do |static_link| 75 | next unless static_link.check? 76 | 77 | unless File.exist?(static_link.file_path) 78 | not_found_error(static_link.file_path) 79 | next 80 | end 81 | next unless static_link.css? 82 | 83 | check_static_not_found static_link.extract_urls_from_fake_css_response 84 | end 85 | end 86 | end 87 | end 88 | -------------------------------------------------------------------------------- /lib/validate_website/crawl.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'validate_website/core' 4 | require 'validate_website/utils' 5 | 6 | module ValidateWebsite 7 | # Class for http website validation 8 | class Crawl < Core 9 | attr_reader :crawler 10 | 11 | def initialize(options = {}, validation_type = :crawl) 12 | super 13 | start_message(@site) 14 | end 15 | 16 | def history_count 17 | crawler.history.size 18 | end 19 | 20 | # @param [Hash] options 21 | # :color [Boolean] color output (true, false) 22 | # :exclude [String] a String used by Regexp.new 23 | # :markup [Boolean] Check the markup validity 24 | # :not_found [Boolean] Check for not found page (404) 25 | # 26 | def crawl(options = {}) 27 | @options = @options.merge(options) 28 | @options[:ignore_links] = @options[:exclude] if 
@options[:exclude] 29 | 30 | @crawler = spidr_crawler(@site, @options) 31 | print_status_line(@crawler.history.size, 32 | @crawler.failures.size, 33 | @not_founds_count, 34 | @errors_count) 35 | end 36 | 37 | private 38 | 39 | # Extract imgs urls from page 40 | # 41 | # @param [Spidr::Page] an Spidr::Page object 42 | # @return [Array] Lists of urls 43 | # 44 | def extract_imgs_from_page(page) 45 | return Set[] if page.is_redirect? 46 | 47 | page.doc.search('//img[@src]').reduce(Set[]) do |result, elem| 48 | u = elem.attributes['src'].content 49 | result << page.to_absolute(URI.parse(WEBrick::HTTPUtils.escape(u))) 50 | end 51 | end 52 | 53 | def spidr_crawler(site, options) 54 | @host = URI(site).host 55 | Spidr.site(site, **options.slice(:user_agent, :ignore_links)) do |crawler| 56 | crawler.cookies[@host] = default_cookies if options[:cookies] 57 | on_every_css_page(crawler) 58 | on_every_html_page(crawler) 59 | on_every_failed_url(crawler) if options[:not_found] 60 | end 61 | end 62 | 63 | def on_every_css_page(crawler) 64 | crawler.every_css_page do |page| 65 | check_css_syntax(page) if options[:css_syntax] 66 | ValidateWebsite::Utils.extract_urls_from_css(page).each do |u| 67 | crawler.enqueue(u) 68 | end 69 | end 70 | end 71 | 72 | def validate?(page) 73 | options[:markup] && page.html? && !page.is_redirect? 
74 | end 75 | 76 | def on_every_html_page(crawler) 77 | crawler.every_html_page do |page| 78 | extract_imgs_from_page(page).each do |i| 79 | crawler.enqueue(i) 80 | end 81 | 82 | if validate?(page) 83 | keys = %i[ignore html5_validator] 84 | validate(page.doc, page.body, page.url, options.slice(keys)) 85 | end 86 | end 87 | end 88 | 89 | def on_every_failed_url(crawler) 90 | crawler.every_failed_url do |url| 91 | not_found_error(url) 92 | end 93 | end 94 | end 95 | end 96 | -------------------------------------------------------------------------------- /man/man1/validate-website-static.1: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: validate-website-static 3 | .\" Author: [see the "AUTHOR(S)" section] 4 | .\" Generator: Asciidoctor 2.0.18 5 | .\" Date: 2022-05-02 6 | .\" Manual: \ \& 7 | .\" Source: \ \& 8 | .\" Language: English 9 | .\" 10 | .TH "VALIDATE\-WEBSITE\-STATIC" "1" "2022-05-02" "\ \&" "\ \&" 11 | .ie \n(.g .ds Aq \(aq 12 | .el .ds Aq ' 13 | .ss \n[.ss] 0 14 | .nh 15 | .ad l 16 | .de URL 17 | \fI\\$2\fP <\\$1>\\$3 18 | .. 19 | .als MTO URL 20 | .if \n[.g] \{\ 21 | . mso www.tmac 22 | . am URL 23 | . ad l 24 | . . 25 | . am MTO 26 | . ad l 27 | . . 28 | . LINKSTYLE blue R < > 29 | .\} 30 | .SH "NAME" 31 | validate-website-static \- check the validity of your documents 32 | .SH "SYNOPSIS" 33 | .sp 34 | \fBvalidate\-website\-static\fP [\fIOPTIONS\fP] 35 | .SH "DESCRIPTION" 36 | .sp 37 | validate\-website\-static check the markup validity of your local documents with 38 | XML Schema / DTD. 39 | HTML5 support with Validator.nu Web Service. 
40 | .SH "OPTIONS" 41 | .sp 42 | \fB\-s\fP, \fB\-\-site\fP \fISITE\fP 43 | .RS 4 44 | Where static files will be hosted (Default: \c 45 | .URL "http://www.example.com/" "" ")" 46 | .RE 47 | .sp 48 | \fB\-p\fP, \fB\-\-pattern\fP \fIPATTERN\fP 49 | .RS 4 50 | Change filenames pattern (Default: *\(rs*/*.html) 51 | .RE 52 | .sp 53 | \fB\-e\fP, \fB\-\-exclude\fP \fIEXCLUDE\fP 54 | .RS 4 55 | Url to exclude (ex: \fIredirect|news\fP) 56 | .RE 57 | .sp 58 | \fB\-i\fP, \fB\-\-ignore\fP \fIIGNORE\fP 59 | .RS 4 60 | Ignore certain validation errors (ex: \fIautocorrect\fP) 61 | .RE 62 | .sp 63 | \fB\-m\fP, \fB\-\-[no\-]markup\fP 64 | .RS 4 65 | Markup validation (Default: true) 66 | .RE 67 | .sp 68 | \fB\-\-css\-syntax\fP 69 | .RS 4 70 | Css validation (Default: false) 71 | .RE 72 | .sp 73 | \fB\-n\fP, \fB\-\-not\-found\fP 74 | .RS 4 75 | Log files not on filesystem, pwd considered as root « / » (Default: false) 76 | .RE 77 | .sp 78 | \fB\-\-[no\-]color\fP 79 | .RS 4 80 | Show colored output (Default: true) 81 | .RE 82 | .sp 83 | \fB\-x\fP, \fB\-\-html5\-validator\fP \fIVALIDATOR\fP 84 | .RS 4 85 | Change default html5 validator engine (tidy/nu/nokogiri) 86 | .RE 87 | .sp 88 | \fB\-5\fP, \fB\-\-html5\-validator\-service\-url\fP \fIURL\fP 89 | .RS 4 90 | Change default html5 validator service URL for "nu" engine 91 | .RE 92 | .sp 93 | \fB\-v\fP, \fB\-\-verbose\fP 94 | .RS 4 95 | Show detail of validator errors (Default: false). 96 | .RE 97 | .sp 98 | \fB\-h\fP, \fB\-\-help\fP 99 | .RS 4 100 | Show help message and exit. 101 | .RE 102 | .SH "EXIT STATUS" 103 | .sp 104 | 0 105 | .RS 4 106 | Markup is valid. 107 | .RE 108 | .sp 109 | 64 110 | .RS 4 111 | Not valid markup found. 112 | .RE 113 | .sp 114 | 65 115 | .RS 4 116 | There are pages not found. 117 | .RE 118 | .sp 119 | 66 120 | .RS 4 121 | There are not valid markup and pages not found. 
122 | .RE 123 | .SH "AUTHOR" 124 | .sp 125 | Laurent Arnoud \c 126 | .MTO "laurent\(atspkdev.net" "Laurent Arnoud" "" 127 | .SH "LICENSE" 128 | .sp 129 | The MIT License 130 | .sp 131 | Copyright (c) 2009\-2019 \c 132 | .MTO "laurent\(atspkdev.net" "Laurent Arnoud" "" -------------------------------------------------------------------------------- /man/man1/validate-website.1: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: validate-website 3 | .\" Author: [see the "AUTHOR(S)" section] 4 | .\" Generator: Asciidoctor 2.0.18 5 | .\" Date: 2022-05-02 6 | .\" Manual: \ \& 7 | .\" Source: \ \& 8 | .\" Language: English 9 | .\" 10 | .TH "VALIDATE\-WEBSITE" "1" "2022-05-02" "\ \&" "\ \&" 11 | .ie \n(.g .ds Aq \(aq 12 | .el .ds Aq ' 13 | .ss \n[.ss] 0 14 | .nh 15 | .ad l 16 | .de URL 17 | \fI\\$2\fP <\\$1>\\$3 18 | .. 19 | .als MTO URL 20 | .if \n[.g] \{\ 21 | . mso www.tmac 22 | . am URL 23 | . ad l 24 | . . 25 | . am MTO 26 | . ad l 27 | . . 28 | . LINKSTYLE blue R < > 29 | .\} 30 | .SH "NAME" 31 | validate-website \- Web crawler for checking the validity of your documents 32 | .SH "SYNOPSIS" 33 | .sp 34 | \fBvalidate\-website\fP [\fIOPTIONS\fP] 35 | .SH "DESCRIPTION" 36 | .sp 37 | validate\-website is a web crawler for checking the markup validity with XML 38 | Schema / DTD and not found urls. 39 | HTML5 support with Validator.nu Web Service. 
40 | .SH "OPTIONS" 41 | .sp 42 | \fB\-s\fP, \fB\-\-site\fP \fISITE\fP 43 | .RS 4 44 | Website to crawl (Default: \c 45 | .URL "http://localhost:3000/" "" ")" 46 | .RE 47 | .sp 48 | \fB\-u\fP, \fB\-\-user\-agent\fP \fIUSERAGENT\fP 49 | .RS 4 50 | Change user agent (Default: Spidr.user_agent) 51 | .RE 52 | .sp 53 | \fB\-e\fP, \fB\-\-exclude\fP \fIEXCLUDE\fP 54 | .RS 4 55 | Url to exclude (ex: \fIredirect|news\fP) 56 | .RE 57 | .sp 58 | \fB\-i\fP, \fB\-\-ignore\fP \fIIGNORE\fP 59 | .RS 4 60 | Ignore certain validation errors (ex: \fIautocorrect\fP) 61 | .RE 62 | .sp 63 | \fB\-c\fP, \fB\-\-cookies\fP \fICOOKIES\fP 64 | .RS 4 65 | Set defaults cookies 66 | .RE 67 | .sp 68 | \fB\-m\fP, \fB\-\-[no\-]markup\fP 69 | .RS 4 70 | Markup validation (Default: true) 71 | .RE 72 | .sp 73 | \fB\-\-css\-syntax\fP 74 | .RS 4 75 | Css validation (Default: false) 76 | .RE 77 | .sp 78 | \fB\-n\fP, \fB\-\-not\-found\fP 79 | .RS 4 80 | Log not found url (Default: false) 81 | .RE 82 | .sp 83 | \fB\-\-[no\-]color\fP 84 | .RS 4 85 | Show colored output (Default: true) 86 | .RE 87 | .sp 88 | \fB\-x\fP, \fB\-\-html5\-validator\fP \fIVALIDATOR\fP 89 | .RS 4 90 | Change default html5 validator engine (tidy/nu/nokogiri) 91 | .RE 92 | .sp 93 | \fB\-5\fP, \fB\-\-html5\-validator\-service\-url\fP \fIURL\fP 94 | .RS 4 95 | Change default html5 validator service URL for "nu" engine 96 | .RE 97 | .sp 98 | \fB\-v\fP, \fB\-\-verbose\fP 99 | .RS 4 100 | Show detail of validator errors (Default: false). 101 | .RE 102 | .sp 103 | \fB\-h\fP, \fB\-\-help\fP 104 | .RS 4 105 | Show help message and exit. 106 | .RE 107 | .SH "EXIT STATUS" 108 | .sp 109 | 0 110 | .RS 4 111 | Markup is valid and no 404 found. 112 | .RE 113 | .sp 114 | 64 115 | .RS 4 116 | Not valid markup found. 117 | .RE 118 | .sp 119 | 65 120 | .RS 4 121 | There are pages not found. 122 | .RE 123 | .sp 124 | 66 125 | .RS 4 126 | There are not valid markup and pages not found. 
127 | .RE 128 | .SH "AUTHOR" 129 | .sp 130 | Laurent Arnoud \c 131 | .MTO "laurent\(atspkdev.net" "Laurent Arnoud" "" 132 | .SH "LICENSE" 133 | .sp 134 | The MIT License 135 | .sp 136 | Copyright (c) 2009\-2019 \c 137 | .MTO "laurent\(atspkdev.net" "Laurent Arnoud" "" -------------------------------------------------------------------------------- /test/static_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require File.expand_path('test_helper', __dir__) 4 | 5 | # rubocop:disable Metrics/BlockLength 6 | describe ValidateWebsite::Static do 7 | before do 8 | _out, _err = capture_io do 9 | @validate_website = ValidateWebsite::Static.new(color: false) 10 | end 11 | end 12 | 13 | it 'exclude directories' do 14 | pattern = File.join(File.dirname(__FILE__), '**/*.html') 15 | _out, _err = capture_io do 16 | @validate_website.crawl(pattern: pattern, 17 | site: 'http://spkdev.net/', 18 | markup: false, 19 | not_found: false, 20 | exclude: /data|example/) 21 | end 22 | _(@validate_website.history_count).must_equal 0 23 | end 24 | 25 | it 'no space in directory name' do 26 | pattern = File.join(File.dirname(__FILE__), 'example/**/*.html') 27 | _out, _err = capture_io do 28 | @validate_website.crawl(pattern: pattern, 29 | site: 'http://dev.af83.com/', 30 | markup: false, 31 | not_found: false) 32 | end 33 | _(@validate_website.not_founds_count).must_equal 0 34 | end 35 | 36 | it 'not found' do 37 | Dir.chdir('test/data') do 38 | _out, _err = capture_io do 39 | @validate_website.crawl(pattern: '**/*.html', 40 | site: 'https://linuxfr.org/', 41 | markup: false, 42 | not_found: true) 43 | end 44 | _(@validate_website.not_founds_count).must_equal 210 45 | end 46 | end 47 | 48 | it 'can change validator' do 49 | validator_res = File.join('test', 'data', 'validator.nu-failure.json') 50 | stub_request(:any, 51 | /#{ValidateWebsite::Validator.html5_validator_service_url}/) 52 | .to_return(body: 
File.open(validator_res).read) 53 | pattern = File.join(File.dirname(__FILE__), 'data', 54 | 'html5-fail.html') 55 | Dir.chdir('test/data') do 56 | _out, _err = capture_io do 57 | @validate_website.crawl(pattern: pattern, 58 | site: 'http://w3.org/', 59 | ignore: /Warning/, 60 | html5_validator: :nu) 61 | end 62 | _(@validate_website.errors_count).must_equal 1 63 | end 64 | end 65 | 66 | it 'ignore' do 67 | pattern = File.join(File.dirname(__FILE__), 'data', 68 | 'w3.org-xhtml1-strict-errors.html') 69 | Dir.chdir('test/data') do 70 | _out, _err = capture_io do 71 | @validate_website.crawl(pattern: pattern, 72 | site: 'http://w3.org/', 73 | ignore: /height|width|Length/) 74 | end 75 | _(@validate_website.errors_count).must_equal 0 76 | end 77 | end 78 | 79 | describe 'css' do 80 | it 'validate' do 81 | pattern = File.join(File.dirname(__FILE__), '**/*.{html,css}') 82 | Dir.chdir('test/data') do 83 | _out, _err = capture_io do 84 | @validate_website.crawl(pattern: pattern, 85 | site: 'https://linuxfr.org/', 86 | markup: false, 87 | css_syntax: true) 88 | end 89 | _(@validate_website.errors_count).must_equal 1 90 | end 91 | end 92 | end 93 | end 94 | -------------------------------------------------------------------------------- /data/schemas/xhtml-ruby-basic-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | This is the XML Schema module for Ruby Basic. 7 | $Id: xhtml-ruby-basic-1.xsd,v 1.6 2005/09/26 22:54:53 ahby Exp $ 8 | 9 | 10 | 11 | 12 | 13 | "Ruby" are short runs of text alongside the base text, typically 14 | used in East Asian documents to indicate pronunciation or to 15 | provide a short annotation. 
The full specification for Ruby is here: 16 | 17 | http://www.w3.org/TR/2001/REC-ruby-20010531/ 18 | 19 | This module defines "Ruby Basic" or "simple Ruby" as described 20 | in the specification: 21 | 22 | http://www.w3.org/TR/ruby/#simple-ruby1 23 | 24 | This module declares the elements and their attributes used to 25 | support simple ruby annotation markup. Elements defined here are 26 | * ruby, rb, rt, rp 27 | Ruby Basic does not use the rbc or rtc elements. 28 | The content of the ruby element for Ruby Basic 29 | uses the rp element for fallback purposes. 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 49 | 50 | 51 | 52 | 53 | 54 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 66 | 67 | 68 | 69 | 72 | 73 | 74 | 75 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /data/schemas/xhtml-notations-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 7 | 9 | 10 | 11 | Notations module 12 | This is the XML Schema module for data type notations for XHTML 13 | $Id: xhtml-notations-1.xsd,v 1.5 2005/09/26 22:54:53 ahby Exp $ 14 | 15 | 16 | 17 | 18 | 19 | Notations module 20 | Defines the XHTML notations, many of these imported from 21 | other specifications and standards. When an existing FPI is 22 | known, it is incorporated here. 
23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /data/schemas/xhtml-special.ent: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 15 | 16 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 34 | 35 | 36 | 38 | 40 | 42 | 43 | 44 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 54 | 55 | 56 | 57 | 58 | 59 | 61 | 63 | 64 | 66 | 68 | 69 | 70 | 71 | 72 | 74 | 75 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /lib/validate_website/validator.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'uri' 4 | 5 | require 'nokogiri' unless ValidateWebsite.jruby? 
require 'w3c_validators'

require 'validate_website/validator_class_methods'

module ValidateWebsite
  # Document validation from DTD or XSD (webservice for html5)
  class Validator
    extend ValidatorClassMethods

    @html5_validator_service_url = 'https://validator.nu/'
    XHTML_SCHEMA_PATH = File.expand_path('../../data/schemas', __dir__)
    # Guards schema loading, which temporarily changes the working directory.
    @mutex = Mutex.new

    class << self
      attr_accessor :html5_validator_service_url

      # Load the XML Schema bundled for +namespace+.
      # http://www.w3.org/TR/xhtml1-schema/
      #
      # @param [String, nil] namespace schema base name (file name without
      #   the `.xsd` extension) looked up inside XHTML_SCHEMA_PATH
      # @return [Nokogiri::XML::Schema, nil] nil when no such `.xsd` exists
      def schema(namespace)
        @mutex.synchronize do
          Dir.chdir(XHTML_SCHEMA_PATH) do
            xsd_file = "#{namespace}.xsd"
            Nokogiri::XML::Schema(File.read(xsd_file)) if File.exist?(xsd_file)
          end
        end
      end

      alias xsd schema
    end

    attr_reader :original_doc, :body, :dtd, :doc, :namespace, :html5_validator

    ##
    # @param [Nokogiri::HTML::Document] original_doc
    # @param [String] The raw HTTP response body of the page
    # @param [Regexp] Errors to ignore
    # @param [Symbol] html5_validator default offline :tidy
    #                 fallback webservice :nu
    def initialize(original_doc, body, ignore: nil, html5_validator: :tidy)
      @errors = []
      @document, @dtd_uri = nil
      @original_doc = original_doc
      @body = body
      @ignore = ignore
      @html5_validator = html5_validator
      @dtd = @original_doc.internal_subset
      @namespace = find_namespace(@dtd)
    end

    ##
    # Run the validation and report whether any non-ignored error remains.
    #
    # @return [Boolean]
    def valid?
      find_errors
      errors.empty?
    end

    # Errors as strings, minus those matching the `ignore` pattern.
    #
    # @return [Array] of errors
    def errors
      @errors.map!(&:to_s)
      @ignore ? @errors.reject { |e| @ignore =~ e } : @errors
    end

    private

    # Derive the schema/DTD base name from the doctype system identifier and
    # remember the parsed identifier in @dtd_uri.
    # http://www.w3.org/TR/xhtml1/#dtds
    #
    # @param [Nokogiri::XML::DTD, nil] dtd internal subset of the document
    # @return [String, nil] e.g. "xhtml1-strict", or nil without a usable id
    def find_namespace(dtd)
      # A document without a doctype has no internal subset at all.
      return unless dtd&.system_id

      dtd_uri = URI.parse(dtd.system_id)
      return unless dtd_uri.path

      @dtd_uri = dtd_uri
      File.basename(@dtd_uri.path, '.dtd')
    end

    # Body to validate, with the remote DTD URI rewritten to the local file
    # name so that it resolves inside XHTML_SCHEMA_PATH.
    #
    # @return [String]
    def document
      return @document if @document

      dtd_location = @dtd_uri.to_s
      # Literal containment test: the URI must not be interpreted as a
      # regular expression (`sub` below also treats it as a plain string).
      @document = if @dtd_uri && @body.include?(dtd_location)
                    @body.sub(dtd_location, "#{@namespace}.dtd")
                  else
                    @body
                  end
    end

    # Pick the validation strategy: HTML5 engine for `<!DOCTYPE html>`
    # documents, bundled XSD when one exists, DTD parsing otherwise.
    #
    # @return [Array] contain result errors
    def validate
      if document =~ /^\<!DOCTYPE html\>/i
        html5_validate
      elsif (xsd = self.class.schema(@namespace))
        # Reuse the loaded schema instead of reading and parsing it twice.
        xsd.validate(xhtml_doc)
      else
        # dont have xsd fall back to dtd
        Dir.chdir(XHTML_SCHEMA_PATH) do
          Nokogiri::HTML.parse(document)
        end.errors
      end
    end

    # Store the validation result; a syntax error raised while parsing is
    # recorded as an error instead of propagating.
    # http://nokogiri.org/tutorials/ensuring_well_formed_markup.html
    def find_errors
      @errors = validate
    rescue Nokogiri::XML::SyntaxError => e
      @errors << e
    end

    # Dispatch to the configured HTML5 engine; anything other than an
    # available :tidy or :nu falls back to Nokogiri's HTML5 parser.
    def html5_validate
      engine = html5_validator.to_sym
      if engine == :tidy && self.class.tidy
        tidy_validate
      elsif engine == :nu
        nu_validate
      else
        Nokogiri::HTML5(document, max_errors: -1).errors
      end
    end

    # Validate with tidy; its report is split into one error per line.
    def tidy_validate
      results = self.class.tidy.new(document)
      if results.errors
        errors.concat(results.errors.split("\n"))
      else
        []
      end
    end

    # Validate through the Validator.nu web service.
    def nu_validate
      validator = W3CValidators::NuValidator.new(
        validator_uri: self.class.validator_uri
      )
      results = validator.validate_text(document)
      errors.concat(results.errors)
    end

    # Parse the document as XML with DTD loading/validation enabled and
    # network access disabled, resolving the DTD from XHTML_SCHEMA_PATH.
    def xhtml_doc
      Dir.chdir(XHTML_SCHEMA_PATH) do
        Nokogiri::XML(document) { |cfg| cfg.nonoent.dtdload.dtdvalid.nonet }
      end
    end
  end
end
-------------------------------------------------------------------------------- /lib/validate_website/core.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'set' 4 | require 'open-uri' 5 | require 'webrick/cookie' 6 | 7 | require 'validate_website/option_parser' 8 | require 'validate_website/validator' 9 | require 'validate_website/colorful_messages' 10 | 11 | require 'spidr' 12 | require 'crass' 13 | 14 | # Base module ValidateWebsite 15 | module ValidateWebsite 16 | autoload :Crawl, 'validate_website/crawl' 17 | autoload :Static, 'validate_website/static' 18 | autoload :StaticLink, 'validate_website/static_link' 19 | 20 | # Core class for static or website validation 21 | class Core 22 | attr_accessor :site 23 | attr_reader :options, :host, :errors_count, :not_founds_count 24 | 25 | include ColorfulMessages 26 | 27 | EXIT_SUCCESS = 0 28 | EXIT_FAILURE_MARKUP = 64 29 | EXIT_FAILURE_NOT_FOUND = 65 30 | EXIT_FAILURE_MARKUP_NOT_FOUND = 66 31 | START_MESSAGE = 'Validating' 32 | 33 | # Initialize core ValidateWebsite class 34 | # @example 35 | # new({ site: "https://example.com/" }, :crawl) 36 | # @param [Hash] options 37 | # @param [Symbol] validation_type `crawl` for web or `static` for local 38 | # @return [NilClass] 39 | def initialize(options, validation_type) 40 | @not_founds_count = 0 41 | @errors_count = 0 42 | @options = Parser.parse(options, validation_type).to_h 43 | @site = @options[:site] 44 | @service_url = @options[:html5_validator_service_url] 45 | Validator.html5_validator_service_url = @service_url if @service_url 46 | end 47 | 48 | def errors? 49 | @errors_count.positive? 50 | end 51 | 52 | def not_founds? 53 | @not_founds_count.positive? 54 | end 55 | 56 | def exit_status 57 | if errors? && not_founds? 58 | EXIT_FAILURE_MARKUP_NOT_FOUND 59 | elsif errors? 60 | EXIT_FAILURE_MARKUP 61 | elsif not_founds? 
62 | EXIT_FAILURE_NOT_FOUND 63 | else 64 | EXIT_SUCCESS 65 | end 66 | end 67 | 68 | def default_cookies 69 | WEBrick::Cookie.parse(@options[:cookies]).each_with_object({}) do |c, h| 70 | h[c.name] = c.value 71 | h 72 | end 73 | end 74 | 75 | private 76 | 77 | def start_message(type) 78 | puts color(:note, "#{START_MESSAGE} #{type}\n", @options[:color]) 79 | end 80 | 81 | def check_css_syntax(page) 82 | nodes = Crass::Parser.parse_stylesheet(page.body) 83 | return unless any_css_errors?(nodes) 84 | 85 | handle_validation_error(page.url) 86 | end 87 | 88 | def any_css_errors?(nodes) 89 | nodes.any? do |node| 90 | if node[:children] 91 | any_css_errors? node.delete(:children) 92 | elsif node[:tokens] 93 | any_css_errors? node.delete(:tokens) 94 | else 95 | node[:node] == :error || node[:error] == true 96 | end 97 | end 98 | end 99 | 100 | def print_status_line(total, failures, not_founds, errors) 101 | puts "\n\n" 102 | puts color(:info, ["#{total} visited", 103 | "#{failures} failures", 104 | "#{not_founds} not founds", 105 | "#{errors} errors"].join(', '), options[:color]) 106 | end 107 | 108 | def not_found_error(location) 109 | puts "\n" 110 | puts color(:error, "#{location} linked but not exist", options[:color]) 111 | @not_founds_count += 1 112 | end 113 | 114 | ## 115 | # @param [Nokogiri::HTML::Document] original_doc 116 | # @param [String] The raw HTTP response body of the page 117 | # @param [String] url 118 | # @param [Hash] Validator options 119 | # 120 | def validate(doc, body, url, options) 121 | validator = Validator.new(doc, body, **options) 122 | if validator.valid? 
123 | print color(:success, '.', options[:color]) # rspec style 124 | else 125 | handle_html_validation_error(validator, url) 126 | end 127 | end 128 | 129 | def handle_html_validation_error(validator, url) 130 | handle_validation_error(url) 131 | return unless options[:verbose] 132 | 133 | puts color(:error, validator.errors.join(', '), options[:color]) 134 | end 135 | 136 | def handle_validation_error(url) 137 | @errors_count += 1 138 | puts "\n" 139 | puts color(:error, "* #{url}", options[:color]) 140 | end 141 | end 142 | end 143 | -------------------------------------------------------------------------------- /lib/validate_website/option_parser.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'slop' 4 | require File.expand_path('version', __dir__) 5 | 6 | module ValidateWebsite 7 | # Internal class for parse command line args 8 | class Parser 9 | VALID_TYPES = %i[crawl static].freeze 10 | 11 | DEFAULT_OPTIONS = { 12 | site: 'http://localhost/', 13 | pattern: '**/*.html', 14 | exclude: nil, 15 | user_agent: nil, 16 | markup: true, 17 | css_syntax: false, 18 | # crawler: log not found url (404 status code) 19 | # static: log not found url (not on filesystem, `pwd` considered 20 | # as root " / ") 21 | not_found: false, 22 | file: nil, 23 | # regex to ignore certain validation errors 24 | ignore: nil, 25 | color: true, 26 | html5_validator: 'tidy', 27 | # internal verbose for ValidateWebsite 28 | verbose: false 29 | }.freeze 30 | 31 | # Generic parse method for crawl or static options 32 | def self.parse(options, type) 33 | raise ArgumentError unless VALID_TYPES.include?(type) 34 | 35 | # We are in command line (ARGV) 36 | if options.is_a?(Array) 37 | send("command_line_parse_#{type}", options) 38 | else 39 | # for testing or Ruby usage with a Hash 40 | DEFAULT_OPTIONS.merge(options) 41 | end 42 | end 43 | 44 | def self.default_args 45 | Slop.parse do |opt| 46 | yield opt if 
block_given? 47 | markup_syntax(opt) 48 | boolean_options(opt) 49 | ignore_html5_options(opt) 50 | verbose_option(opt) 51 | version_help(opt) 52 | end 53 | end 54 | 55 | def self.ignore_html5_options(opt) 56 | opt.regexp('-i', '--ignore', 57 | 'Validation errors to ignore (ex: "valign|autocorrect")') 58 | opt.string('-x', '--html5-validator', 59 | 'Change default html5 validator engine (tidy/nu/nokogiri)', 60 | default: DEFAULT_OPTIONS[:html5_validator]) 61 | opt.string('-5', '--html5-validator-service-url', 62 | 'Change default html5 validator service URL for "nu" engine') 63 | end 64 | 65 | def self.markup_syntax(opt) 66 | opt.bool('-m', '--markup', 67 | "Markup validation (default: #{DEFAULT_OPTIONS[:markup]})", 68 | default: DEFAULT_OPTIONS[:markup]) 69 | opt.bool('--css-syntax', 70 | "Css validation (default: #{DEFAULT_OPTIONS[:css_syntax]})", 71 | default: DEFAULT_OPTIONS[:css_syntax]) 72 | end 73 | 74 | def self.boolean_options(opt) 75 | opt.bool('-n', '--not-found', 76 | "Log not found url (default: #{DEFAULT_OPTIONS[:not_found]})", 77 | default: DEFAULT_OPTIONS[:not_found]) 78 | opt.bool('--color', 79 | "Show colored output (default: #{DEFAULT_OPTIONS[:color]})", 80 | default: DEFAULT_OPTIONS[:color]) 81 | end 82 | 83 | def self.verbose_option(opt) 84 | opt.bool('-v', '--verbose', 85 | "Show validator errors (default: #{DEFAULT_OPTIONS[:verbose]})", 86 | default: DEFAULT_OPTIONS[:verbose]) 87 | end 88 | 89 | def self.version_help(opt) 90 | opt.on('--version', 'Display version.') do 91 | puts ValidateWebsite::VERSION 92 | exit 93 | end 94 | opt.on('-h', '--help', 'Display this help message.') do 95 | puts opt 96 | exit 97 | end 98 | end 99 | 100 | # Parse command line for validate-website bin 101 | # @params [ARGV] 102 | # @return [Hash] 103 | def self.command_line_parse_crawl(_args) 104 | default_args do |opt| 105 | opt.string('-s', '--site', 106 | "Website to crawl (default: #{DEFAULT_OPTIONS[:site]})", 107 | default: DEFAULT_OPTIONS[:site]) 108 | 
opt.string('-u', '--user-agent', 109 | 'Change user agent', 110 | default: DEFAULT_OPTIONS[:user_agent]) 111 | opt.regexp('-e', '--exclude', 'Url to exclude (ex: "redirect|news")') 112 | opt.string('-c', '--cookies', 'Set defaults cookies') 113 | end 114 | end 115 | 116 | # Parse command line for validate-website-static bin 117 | # @params [ARGV] 118 | # @return [Hash] 119 | def self.command_line_parse_static(_args) 120 | default_args do |opt| 121 | opt.string('-s', '--site', 122 | "Website to crawl (default: #{DEFAULT_OPTIONS[:site]})", 123 | default: DEFAULT_OPTIONS[:site]) 124 | opt.string('-p', '--pattern', 125 | "Filename pattern (default: #{DEFAULT_OPTIONS[:pattern]})", 126 | default: DEFAULT_OPTIONS[:pattern]) 127 | opt.regexp('-e', '--exclude', 'Url to exclude (ex: "redirect|news")') 128 | end 129 | end 130 | end 131 | end 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # validate-website 2 | 3 | ## Description 4 | 5 | Web crawler for checking the validity of your documents 6 | 7 | ![validate website](https://raw.github.com/spk/validate-website/master/validate-website.png) 8 | 9 | ## Installation 10 | 11 | ### Debian 12 | 13 | ``` 14 | apt install ruby-dev libxslt1-dev libxml2-dev 15 | ``` 16 | 17 | If you want complete local validation look [tidy 18 | packages](https://binaries.html-tidy.org/) 19 | 20 | ### RubyGems 21 | 22 | ``` 23 | gem install validate-website 24 | ``` 25 | 26 | ## Synopsis 27 | 28 | ``` 29 | validate-website [OPTIONS] 30 | validate-website-static [OPTIONS] 31 | ``` 32 | 33 | ## Examples 34 | 35 | ``` 36 | validate-website -v -s https://www.ruby-lang.org/ 37 | validate-website -v -x tidy -s https://www.ruby-lang.org/ 38 | validate-website -v -x nu -s https://www.ruby-lang.org/ 39 | validate-website -h 40 | ``` 41 | 42 | ## Description 43 | 44 | validate-website is a web crawler for checking the markup 
validity with XML 45 | Schema / DTD and not found urls (more info [doc/validate-website.adoc](https://github.com/spk/validate-website/blob/master/doc/validate-website.adoc)). 46 | 47 | validate-website-static checks the markup validity of your local documents with 48 | XML Schema / DTD (more info [doc/validate-website-static.adoc](https://github.com/spk/validate-website/blob/master/doc/validate-website-static.adoc)). 49 | 50 | HTML5 support with [libtidy5](http://www.html-tidy.org/) or [Validator.nu Web 51 | Service](https://checker.html5.org/). 52 | 53 | ## Exit status 54 | 55 | * 0: Markup is valid and no 404 found. 56 | * 64: Invalid markup found. 57 | * 65: There are pages not found. 58 | * 66: There are both invalid markup and pages not found. 59 | 60 | ## On your application 61 | 62 | ``` ruby 63 | require 'validate_website/validator' 64 | body = '' 65 | v = ValidateWebsite::Validator.new(Nokogiri::HTML(body), body) 66 | v.valid? # => false 67 | ``` 68 | 69 | ## Jekyll static site validation 70 | 71 | You can add this Rake task to validate a 72 | [jekyll](https://github.com/jekyll/jekyll) site: 73 | 74 | ``` ruby 75 | desc 'validate _site with validate website' 76 | task validate: :build do 77 | Dir.chdir("_site") do 78 | system("validate-website-static", 79 | "--verbose", 80 | "--exclude", "examples", 81 | "--site", HTTP_URL) 82 | exit($?.exitstatus) 83 | end 84 | end 85 | end 86 | ``` 87 | 88 | ## More info 89 | 90 | ### HTML5 91 | 92 | #### Tidy5 93 | 94 | If libtidy5 is found on your system, it will be the default used to validate 95 | your HTML5 documents. This does not depend on a third-party service; everything is done 96 | locally. 97 | 98 | #### nokogiri 99 | 100 | nokogiri can validate HTML5 documents without a third-party service but reports fewer 101 | errors than tidy. 
102 | 103 | #### Validator.nu web service 104 | 105 | When the `--html5-validator nu` option is used, HTML5 support is provided by the 106 | Validator.nu Web Service, so the content of your webpage is logged by a third party. 107 | This is not the case for the other validations because validate-website uses the XML 108 | Schema or DTD stored in the data/ directory. 109 | 110 | Please read <http://about.validator.nu/#tos> for more info on the HTML5 111 | validation service. 112 | 113 | ##### Use validator standalone web server locally 114 | 115 | You can download the [validator](https://github.com/validator/validator) jar and 116 | start it with: 117 | 118 | ``` 119 | java -cp PATH_TO/vnu.jar nu.validator.servlet.Main 8888 120 | ``` 121 | 122 | Then you can use the validate-website option: 123 | 124 | ``` 125 | --html5-validator-service-url http://localhost:8888/ 126 | # or 127 | export VALIDATOR_NU_URL="http://localhost:8888/" 128 | ``` 129 | 130 | This will prevent you from being blacklisted by the validator web service. 131 | 132 | ## Tests 133 | 134 | With standard environment: 135 | 136 | ``` 137 | bundle exec rake 138 | ``` 139 | 140 | ## Credits 141 | 142 | * Thanks tenderlove for Nokogiri, this tool is inspired by markup_validity. 143 | * And Chris Kite for Anemone web-spider framework and postmodern for Spidr. 144 | 145 | ## Contributors 146 | 147 | See [GitHub](https://github.com/spk/validate-website/graphs/contributors). 
148 | 149 | ## License 150 | 151 | The MIT License 152 | 153 | Copyright (c) 2009-2022 Laurent Arnoud 154 | 155 | --- 156 | [![Build](https://img.shields.io/gitlab/pipeline/spkdev/validate-website/master)](https://gitlab.com/spkdev/validate-website/-/commits/master) 157 | [![Coverage](https://gitlab.com/spkdev/validate-website/badges/master/coverage.svg)](https://gitlab.com/spkdev/validate-website/-/commits/master) 158 | [![Version](https://img.shields.io/gem/v/validate-website.svg)](https://rubygems.org/gems/validate-website) 159 | [![Documentation](https://img.shields.io/badge/doc-rubydoc-blue.svg)](http://www.rubydoc.info/gems/validate-website) 160 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](http://opensource.org/licenses/MIT "MIT") 161 | [![Inline docs](https://inch-ci.org/github/spk/validate-website.svg?branch=master)](http://inch-ci.org/github/spk/validate-website) 162 | -------------------------------------------------------------------------------- /data/schemas/xml-handlers-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 14 | 15 | 16 | 17 | This is the XML Schema for XML Handlers 18 | 19 | URI: http://www.w3.org/MarkUp/SCHEMA/xml-handlers-1.xsd 20 | $Id: xml-handlers-1.xsd,v 1.1 2008/06/25 14:36:29 smccarro Exp $ 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 
-------------------------------------------------------------------------------- /test/data/html5-fail.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | The Rust Programming Language 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 | 36 |
37 | 38 |
39 |
40 |

41 | Rust is a systems programming language 42 | that runs blazingly fast, 43 | prevents segfaults, 44 | and guarantees thread safety. 45 |
46 | See who's using Rust. 47 |

48 |
49 |
50 | 51 |
Install Rust 1.16.0
52 |
53 |
March 16, 2017
54 |
55 |
56 | 57 |
58 |
59 |

Featuring

60 |
    61 |
  • zero-cost abstractions
  • 62 |
  • move semantics
  • 63 |
  • guaranteed memory safety
  • 64 |
  • threads without data races
  • 65 |
  • trait-based generics
  • 66 |
  • pattern matching
  • 67 |
  • type inference
  • 68 |
  • minimal runtime
  • 69 |
  • efficient C bindings
  • 70 |
71 |
72 |
73 |
74 | 75 |
fn main() { 76 | let greetings = ["Hello", "Hola", "Bonjour", 77 | "こんにちは", "您好"]; 78 | 79 | for (num, greeting) in greetings.iter().enumerate() { 80 | println!("{}", greeting); 81 | match num { 82 | 0 => println!("This code is editable and runnable!"), 83 | 1 => println!("Este código es editable y ejecutable!"), 84 | 2 => println!("Ce code est modifiable et exécutable!"), 85 | 3 => println!("このコードは編集して実行出来ます!"), 86 | 4 => println!("这个代码是可以编辑并且能够运行的!"), 87 | _ => {}, 88 | } 89 | } 90 | } 91 |
92 |
93 | 94 |
95 |
96 |
 97 | fn main() {
 98 |     let greetings = ["Hello", "Hola", "Bonjour",
 99 |                      "こんにちは", "您好"];
100 | 
101 |     for (num, greeting) in greetings.iter().enumerate() {
102 |         println!("{}", greeting);
103 |         match num {
104 |             0 =>  println!("This code is editable and runnable!"),
105 |             1 =>  println!("Este código es editable y ejecutable!"),
106 |             2 =>  println!("Ce code est modifiable et exécutable!"),
107 |             3 =>  println!("このコードは編集して実行出来ます!"),
108 |             4 =>  println!("这个代码是可以编辑并且能够运行的!"),
109 |             _ =>  {},
110 |         }
111 |     }
112 | }
113 | 
114 |
115 |
116 | More examples 117 |
118 |
119 |
120 | 121 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /data/schemas/xhtml-events-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 7 | 9 | 10 | 11 | 12 | This is the XML Schema Intrinsic Events module for XHTML 13 | $Id: xhtml-events-1.xsd,v 1.4 2005/09/26 22:54:53 ahby Exp $ 14 | 15 | 16 | 17 | 18 | 19 | Intrinsic Event Attributes 20 | These are the event attributes defined in HTML 4, 21 | Section 18.2.3 "Intrinsic Events". 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 40 | 41 | 42 | 43 | 44 | 47 | 48 | 49 | 50 | 51 | 54 | 55 | 56 | 57 | 58 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 70 | 71 | 72 | 73 | 74 | 75 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 87 | 88 | 89 | 90 | 91 | 94 | 95 | 96 | 97 | 98 | 101 | 102 | 103 | 104 | 105 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /data/schemas/xframes-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 8 | 9 | 10 | 11 | This is XFrames - an XML application for composing documents together. 12 | URI: http://www.w3.org/MarkUp/SCHEMA/xframes-1.xsd 13 | 14 | Copyright ©2002-2005 W3C (MIT, ERCIM, Keio), All Rights Reserved. 15 | 16 | Editor: Masayasu Ishikawa (mimasa@w3.org) 17 | Revision: $Id: xframes-1.xsd,v 1.9 2005/10/05 23:56:45 mimasa Exp $ 18 | 19 | Permission to use, copy, modify and distribute this XML Schema for 20 | XFrames and its accompanying documentation for any purpose and without 21 | fee is hereby granted in perpetuity, provided that the above copyright 22 | notice and this paragraph appear in all copies. The copyright holders 23 | make no representation about the suitability of this XML Schema 24 | for any purpose. 
25 | 26 | It is provided "as is" without expressed or implied warranty. 27 | 28 | 29 | 30 | 32 | 33 | 34 | Get access to the XML namespace 35 | 36 | 37 | 38 | 39 | 40 | 41 | Datatypes 42 | 43 | 44 | 45 | 46 | 47 | 48 | media type, as per [RFC2045] 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | A comma-separated list of media descriptors as described by [CSS2]. 58 | The default is all. 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | An [XMLNS]-qualified name. 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | An Internationalized Resource Identifier Reference, as defined 79 | by [IRI]. 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | Common attributes 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /test/validator_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require File.expand_path('test_helper', __dir__) 4 | 5 | # rubocop:disable Metrics/BlockLength 6 | describe ValidateWebsite::Validator do 7 | let(:subject) { ValidateWebsite::Validator } 8 | 9 | before do 10 | WebMock.reset! 
11 | @http = Spidr::Agent.new 12 | end 13 | 14 | describe('xhtml1') do 15 | it 'can ignore' do 16 | name = 'w3.org-xhtml1-strict-errors' 17 | file = File.join('test', 'data', "#{name}.html") 18 | page = FakePage.new(name, 19 | body: File.open(file).read, 20 | content_type: 'text/html') 21 | @xhtml1_page = @http.get_page(page.url) 22 | ignore = /width|height|Length/ 23 | validator = subject.new(@xhtml1_page.doc, 24 | @xhtml1_page.body, 25 | ignore: ignore) 26 | _(validator.valid?).must_equal true 27 | _(validator.errors).must_equal [] 28 | end 29 | 30 | it 'xhtml1-strict should be valid' do 31 | name = 'xhtml1-strict' 32 | dtd_uri = 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd' 33 | file = File.join('test', 'data', "#{name}.html") 34 | page = FakePage.new(name, 35 | body: File.open(file).read, 36 | content_type: 'text/html') 37 | @xhtml1_page = @http.get_page(page.url) 38 | ignore = /width|height|Length/ 39 | validator = subject.new(@xhtml1_page.doc, 40 | @xhtml1_page.body, 41 | ignore: ignore) 42 | _(validator.dtd.system_id).must_equal dtd_uri 43 | _(validator.namespace).must_equal name 44 | _(validator.valid?).must_equal true 45 | _(validator.errors).must_equal [] 46 | end 47 | end 48 | 49 | describe('html5') do 50 | describe('when valid') do 51 | before do 52 | validator_res = File.join('test', 'data', 'validator.nu-success.json') 53 | stub_request(:any, /#{subject.html5_validator_service_url}/) 54 | .to_return(body: File.open(validator_res).read) 55 | end 56 | it 'html5 should be valid' do 57 | name = 'html5' 58 | file = File.join('test', 'data', "#{name}.html") 59 | page = FakePage.new(name, 60 | body: File.open(file).read, 61 | content_type: 'text/html') 62 | @html5_page = @http.get_page(page.url) 63 | validator = subject.new(@html5_page.doc, 64 | @html5_page.body) 65 | _(validator.valid?).must_equal true 66 | end 67 | end 68 | 69 | describe('when not valid') do 70 | before do 71 | validator_res = File.join('test', 'data', 'validator.nu-failure.json') 72 
| stub_request(:any, /#{subject.html5_validator_service_url}/) 73 | .to_return(body: File.open(validator_res).read) 74 | name = 'html5-fail' 75 | file = File.join('test', 'data', "#{name}.html") 76 | page = FakePage.new(name, 77 | body: File.open(file).read, 78 | content_type: 'text/html') 79 | @html5_page = @http.get_page(page.url) 80 | end 81 | 82 | describe('with nu') do 83 | it 'should have an array of errors' do 84 | validator = subject.new(@html5_page.doc, 85 | @html5_page.body, 86 | html5_validator: :nu) 87 | _(validator.valid?).must_equal false 88 | _(validator.errors.size).must_equal 3 89 | end 90 | 91 | it 'should exclude errors ignored by :ignore option' do 92 | ignore = /Unclosed element/ 93 | validator = subject.new(@html5_page.doc, 94 | @html5_page.body, 95 | ignore: ignore, 96 | html5_validator: :nu) 97 | _(validator.valid?).must_equal false 98 | _(validator.errors.size).must_equal 1 99 | end 100 | end 101 | 102 | describe('with nokogiri') do 103 | it 'have an array of errors' do 104 | skip('nokogiri dont support jruby') if ValidateWebsite.jruby? 105 | validator = subject.new(@html5_page.doc, 106 | @html5_page.body, 107 | html5_validator: :nokogiri) 108 | _(validator.valid?).must_equal false 109 | _(validator.errors.size).must_equal 1 110 | end 111 | 112 | it 'exclude errors ignored by :ignore option' do 113 | skip('nokogiri dont support jruby') if ValidateWebsite.jruby? 
114 | ignore = /End tag 'pre' isn't allowed here/ 115 | validator = subject.new(@html5_page.doc, 116 | @html5_page.body, 117 | ignore: ignore, 118 | html5_validator: :nokogiri) 119 | _(validator.valid?).must_equal true 120 | _(validator.errors.size).must_equal 0 121 | end 122 | end 123 | 124 | describe('with tidy') do 125 | it 'should have an array of errors' do 126 | skip('tidy is not installed') unless ValidateWebsite::Validator.tidy 127 | validator = subject.new(@html5_page.doc, 128 | @html5_page.body) 129 | _(validator.valid?).must_equal false 130 | _(validator.errors.size).must_equal 3 131 | end 132 | 133 | it 'should exclude errors ignored by :ignore option' do 134 | skip('tidy is not installed') unless ValidateWebsite::Validator.tidy 135 | ignore = /letter not allowed here|trimming empty/ 136 | validator = subject.new(@html5_page.doc, 137 | @html5_page.body, 138 | ignore: ignore) 139 | _(validator.valid?).must_equal false 140 | _(validator.errors.size).must_equal 2 141 | end 142 | end 143 | end 144 | end 145 | 146 | describe('html4') do 147 | it 'should validate html4' do 148 | name = 'html4-strict' 149 | file = File.join('test', 'data', "#{name}.html") 150 | page = FakePage.new(name, 151 | body: File.open(file).read, 152 | content_type: 'text/html') 153 | @html4_strict_page = @http.get_page(page.url) 154 | validator = subject.new(@html4_strict_page.doc, 155 | @html4_strict_page.body) 156 | validator.valid? 157 | _(validator.errors).must_equal [] 158 | end 159 | end 160 | end 161 | -------------------------------------------------------------------------------- /data/schemas/xhtml-datatypes-1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 11 | XHTML Datatypes 12 | This is the XML Schema datatypes module for XHTML 13 | 14 | Defines containers for the XHTML datatypes, many of 15 | these imported from other specifications and standards. 
16 | 17 | $Id: xhtml-datatypes-1.xsd,v 1.11 2008/10/07 16:57:03 ahby Exp $ 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /test/crawler_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require File.expand_path('test_helper', __dir__) 4 | 5 | # rubocop:disable Metrics/BlockLength 6 | describe ValidateWebsite::Crawl do 7 | before do 8 | WebMock.reset! 
9 | stub_request(:get, /#{TEST_DOMAIN}/).to_return(status: 200) 10 | _out, _err = capture_io do 11 | @validate_website = ValidateWebsite::Crawl.new(color: false) 12 | end 13 | end 14 | 15 | def validator 16 | ValidateWebsite::Validator 17 | end 18 | 19 | describe 'options' do 20 | it 'can change user-agent' do 21 | ua = %{Linux / Firefox 29: Mozilla/5.0 (X11; Linux x86_64; rv:29.0) \ 22 | Gecko/20100101 Firefox/29.0} 23 | _out, _err = capture_io do 24 | v = ValidateWebsite::Crawl.new(site: TEST_DOMAIN, user_agent: ua) 25 | v.crawl 26 | _(v.crawler.user_agent).must_equal ua 27 | end 28 | end 29 | 30 | it 'can change html5 validator service url' do 31 | original = validator.html5_validator_service_url 32 | new = 'http://localhost:8888/' 33 | _out, _err = capture_io do 34 | ValidateWebsite::Crawl.new(site: TEST_DOMAIN, 35 | html5_validator_service_url: new) 36 | _(validator.html5_validator_service_url).must_equal new 37 | validator.html5_validator_service_url = original 38 | end 39 | end 40 | end 41 | 42 | describe('cookies') do 43 | it 'can set cookies' do 44 | cookies = 'tz=Europe%2FBerlin; guid=ZcpBshbtStgl9VjwTofq' 45 | _out, _err = capture_io do 46 | v = ValidateWebsite::Crawl.new(site: TEST_DOMAIN, cookies: cookies) 47 | v.crawl 48 | _(v.crawler.cookies.cookies_for_host(v.host)) 49 | .must_equal v.default_cookies 50 | end 51 | end 52 | end 53 | 54 | describe('html') do 55 | it 'extract url' do 56 | name = 'xhtml1-strict' 57 | file = File.join('test', 'data', "#{name}.html") 58 | page = FakePage.new(name, 59 | body: File.open(file).read, 60 | content_type: 'text/html') 61 | @validate_website.site = page.url 62 | _out, _err = capture_io do 63 | @validate_website.crawl 64 | end 65 | _(@validate_website.history_count).must_equal 5 66 | end 67 | 68 | it 'extract link' do 69 | name = 'html4-strict' 70 | file = File.join('test', 'data', "#{name}.html") 71 | page = FakePage.new(name, 72 | body: File.open(file).read, 73 | content_type: 'text/html') 74 | 
@validate_website.site = page.url 75 | _out, _err = capture_io do 76 | @validate_website.crawl 77 | end 78 | _(@validate_website.history_count).must_equal 98 79 | end 80 | 81 | it 'can change validator' do 82 | name = 'html5-fail' 83 | file = File.join('test', 'data', "#{name}.html") 84 | page = FakePage.new(name, 85 | body: File.open(file).read, 86 | content_type: 'text/html') 87 | validator_res = File.join('test', 'data', 'validator.nu-failure.json') 88 | stub_request(:any, /#{validator.html5_validator_service_url}/) 89 | .to_return(body: File.open(validator_res).read) 90 | @validate_website.site = page.url 91 | _out, _err = capture_io do 92 | @validate_website.crawl(html5_validator: :nu, ignore: /Warning/) 93 | end 94 | _(@validate_website.errors_count).must_equal 1 95 | end 96 | 97 | it 'crawl when URLs are not ascii only' do 98 | name = 'cozy-community' 99 | file = File.join('test', 'data', "#{name}.html") 100 | page = FakePage.new(name, 101 | body: File.open(file).read, 102 | content_type: 'text/html') 103 | validator_res = File.join('test', 'data', 'validator.nu-failure.json') 104 | stub_request(:any, /#{validator.html5_validator_service_url}/) 105 | .to_return(body: File.open(validator_res).read) 106 | @validate_website.site = page.url 107 | _out, _err = capture_io do 108 | @validate_website.crawl 109 | end 110 | end 111 | 112 | it 'dont try to extract imgs for redirect' do 113 | url = 'https://wordpress.org/support/bb-login.php' 114 | stub_request(:get, url).to_return( 115 | status: 302, 116 | headers: { 117 | 'Location' => 'https://login.wordpress.org/', 118 | 'Content-Type' => 'text/html; charset=UTF-8' 119 | } 120 | ) 121 | @validate_website.site = url 122 | _out, _err = capture_io do 123 | @validate_website.crawl 124 | end 125 | end 126 | end 127 | 128 | describe('css') do 129 | describe 'extract urls' do 130 | it 'crawl css and extract url' do 131 | page = FakePage.new('test.css', 132 | body: '.t {background-image: url(pouet);} 133 | .t 
{background-image: url(/image/pouet.png)} 134 | .t {background-image: url(/image/pouet_42.png)} 135 | .t {background-image: url(/image/pouet)}', 136 | content_type: 'text/css') 137 | @validate_website.site = page.url 138 | _out, _err = capture_io do 139 | @validate_website.crawl 140 | end 141 | _(@validate_website.history_count).must_equal 5 142 | end 143 | 144 | it 'should extract url with single quote' do 145 | page = FakePage.new('test.css', 146 | body: ".test {background-image: url('pouet');}", 147 | content_type: 'text/css') 148 | @validate_website.site = page.url 149 | _out, _err = capture_io do 150 | @validate_website.crawl 151 | end 152 | _(@validate_website.history_count).must_equal 2 153 | end 154 | 155 | it 'should extract url with double quote' do 156 | page = FakePage.new('test.css', 157 | body: '.test {background-image: url("pouet");}', 158 | content_type: 'text/css') 159 | @validate_website.site = page.url 160 | _out, _err = capture_io do 161 | @validate_website.crawl 162 | end 163 | _(@validate_website.history_count).must_equal 2 164 | end 165 | 166 | it 'should extract url with params' do 167 | page = FakePage.new('test.css', 168 | body: '.test {background-image: url(/t?size=s);}', 169 | content_type: 'text/css') 170 | @validate_website.site = page.url 171 | _out, _err = capture_io do 172 | @validate_website.crawl 173 | end 174 | _(@validate_website.history_count).must_equal 2 175 | end 176 | 177 | it 'should not extract invalid urls' do 178 | page = FakePage.new('test.css', 179 | body: '.test {background-image: url(/test.png");}', 180 | content_type: 'text/css') 181 | @validate_website.site = page.url 182 | _out, _err = capture_io do 183 | @validate_website.crawl 184 | end 185 | _(@validate_website.history_count).must_equal 1 186 | end 187 | end 188 | 189 | describe 'validate css syntax' do 190 | before do 191 | _out, _err = capture_io do 192 | @validate_website = ValidateWebsite::Crawl.new(color: false, 193 | css_syntax: true) 194 | end 195 | end 
196 | it 'should be invalid with bad urls' do 197 | page = FakePage.new('test.css', 198 | body: '.test {background-image: url(/test.png");}', 199 | content_type: 'text/css') 200 | @validate_website.site = page.url 201 | _out, _err = capture_io do 202 | @validate_website.crawl 203 | end 204 | _(@validate_website.errors_count).must_equal 1 205 | end 206 | 207 | it 'should be invalid with syntax error' do 208 | page = FakePage.new('test.css', 209 | body: ' /**/ .foo {} #{bar {}', 210 | content_type: 'text/css') 211 | @validate_website.site = page.url 212 | _out, _err = capture_io do 213 | @validate_website.crawl 214 | end 215 | _(@validate_website.errors_count).must_equal 1 216 | end 217 | end 218 | end 219 | end 220 | -------------------------------------------------------------------------------- /data/schemas/xhtml-basic11.dtd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 28 | 35 | 36 | 37 | 44 | 45 | 46 | 47 | 49 | 50 | 51 | 52 | 64 | 65 | 66 | 67 | 68 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 87 | %xhtml-inlstyle.mod;]]> 88 | 89 | 90 | 93 | 94 | 97 | %xhtml-framework.mod; 98 | 99 | 106 | 107 | 110 | %xhtml-text.mod; 111 | 112 | 115 | %xhtml-hypertext.mod; 116 | 117 | 120 | %xhtml-list.mod; 121 | 122 | 123 | 126 | 127 | 128 | 129 | 130 | 131 | 135 | %xhtml-script.mod;]]> 136 | 137 | 138 | 139 | 143 | %xhtml-style.mod;]]> 144 | 145 | 146 | 147 | 151 | %xhtml-image.mod;]]> 152 | 153 | 154 | 155 | 159 | %xhtml-table.mod;]]> 160 | 161 | 162 | 163 | 167 | %xhtml-form.mod;]]> 168 | 169 | 170 | 171 | 175 | %xhtml-pres.mod;]]> 176 | 177 | 178 | 179 | 183 | %xhtml-link.mod;]]> 184 | 185 | 186 | 187 | 191 | %xhtml-meta.mod;]]> 192 | 193 | 194 | 195 | 199 | %xhtml-base.mod;]]> 200 | 201 | 202 | 203 | 207 | %xhtml-param.mod;]]> 208 | 209 | 210 | 211 | 215 | %xhtml-object.mod;]]> 216 | 217 | 218 | 219 | 223 | %xhtml-inputmode.mod;]]> 224 | 225 | 226 | 227 | 231 | %xhtml-target.mod;]]> 232 | 233 | 234 | 237 
| %xhtml-struct.mod; 238 | 239 | 240 | -------------------------------------------------------------------------------- /data/schemas/xml.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | 8 | 9 | 10 |
11 |

About the XML namespace

12 | 13 |
14 |

15 | This schema document describes the XML namespace, in a form 16 | suitable for import by other schema documents. 17 |

18 |

19 | See 20 | http://www.w3.org/XML/1998/namespace.html and 21 | 22 | http://www.w3.org/TR/REC-xml for information 23 | about this namespace. 24 |

25 |

26 | Note that local names in this namespace are intended to be 27 | defined only by the World Wide Web Consortium or its subgroups. 28 | The names currently defined in this namespace are listed below. 29 | They should not be used with conflicting semantics by any Working 30 | Group, specification, or document instance. 31 |

32 |

33 | See further below in this document for more information about how to refer to this schema document from your own 35 | XSD schema documents and about the 36 | namespace-versioning policy governing this schema document. 37 |

38 |
39 |
40 |
41 |
42 | 43 | 44 | 45 | 46 |
47 | 48 |

lang (as an attribute name)

49 |

50 | denotes an attribute whose value 51 | is a language code for the natural language of the content of 52 | any element; its value is inherited. This name is reserved 53 | by virtue of its definition in the XML specification.

54 | 55 |
56 |
57 |

Notes

58 |

59 | Attempting to install the relevant ISO 2- and 3-letter 60 | codes as the enumerated possible values is probably never 61 | going to be a realistic possibility. 62 |

63 |

64 | See BCP 47 at 65 | http://www.rfc-editor.org/rfc/bcp/bcp47.txt 66 | and the IANA language subtag registry at 67 | 68 | http://www.iana.org/assignments/language-subtag-registry 69 | for further information. 70 |

71 |

72 | The union allows for the 'un-declaration' of xml:lang with 73 | the empty string. 74 |

75 |
76 |
77 |
78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 |
88 | 89 | 90 | 91 | 92 |
93 | 94 |

space (as an attribute name)

95 |

96 | denotes an attribute whose 97 | value is a keyword indicating what whitespace processing 98 | discipline is intended for the content of the element; its 99 | value is inherited. This name is reserved by virtue of its 100 | definition in the XML specification.

101 | 102 |
103 |
104 |
105 | 106 | 107 | 108 | 109 | 110 | 111 |
112 | 113 | 114 | 115 |
116 | 117 |

base (as an attribute name)

118 |

119 | denotes an attribute whose value 120 | provides a URI to be used as the base for interpreting any 121 | relative URIs in the scope of the element on which it 122 | appears; its value is inherited. This name is reserved 123 | by virtue of its definition in the XML Base specification.

124 | 125 |

126 | See http://www.w3.org/TR/xmlbase/ 128 | for information about this attribute. 129 |

130 |
131 |
132 |
133 |
134 | 135 | 136 | 137 | 138 |
139 | 140 |

id (as an attribute name)

141 |

142 | denotes an attribute whose value 143 | should be interpreted as if declared to be of type ID. 144 | This name is reserved by virtue of its definition in the 145 | xml:id specification.

146 | 147 |

148 | See http://www.w3.org/TR/xml-id/ 150 | for information about this attribute. 151 |

152 |
153 |
154 |
155 |
156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 |
167 | 168 |

Father (in any context at all)

169 | 170 |
171 |

172 | denotes Jon Bosak, the chair of 173 | the original XML Working Group. This name is reserved by 174 | the following decision of the W3C XML Plenary and 175 | XML Coordination groups: 176 |

177 |
178 |

179 | In appreciation for his vision, leadership and 180 | dedication the W3C XML Plenary on this 10th day of 181 | February, 2000, reserves for Jon Bosak in perpetuity 182 | the XML name "xml:Father". 183 |

184 |
185 |
186 |
187 |
188 |
189 | 190 | 191 | 192 |
193 |

About this schema document

194 | 195 |
196 |

197 | This schema defines attributes and an attribute group suitable 198 | for use by schemas wishing to allow xml:base, 199 | xml:lang, xml:space or 200 | xml:id attributes on elements they define. 201 |

202 |

203 | To enable this, such a schema must import this schema for 204 | the XML namespace, e.g. as follows: 205 |

206 |
207 |           <schema . . .>
208 |            . . .
209 |            <import namespace="http://www.w3.org/XML/1998/namespace"
210 |                       schemaLocation="http://www.w3.org/2001/xml.xsd"/>
211 |      
212 |

213 | or 214 |

215 |
216 |            <import namespace="http://www.w3.org/XML/1998/namespace"
217 |                       schemaLocation="http://www.w3.org/2009/01/xml.xsd"/>
218 |      
219 |

220 | Subsequently, qualified reference to any of the attributes or the 221 | group defined below will have the desired effect, e.g. 222 |

223 |
224 |           <type . . .>
225 |            . . .
226 |            <attributeGroup ref="xml:specialAttrs"/>
227 |      
228 |

229 | will define a type which will schema-validate an instance element 230 | with any of those attributes. 231 |

232 |
233 |
234 |
235 |
236 | 237 | 238 | 239 |
240 |

Versioning policy for this schema document

241 |
242 |

243 | In keeping with the XML Schema WG's standard versioning 244 | policy, this schema document will persist at 245 | 246 | http://www.w3.org/2009/01/xml.xsd. 247 |

248 |

249 | At the date of issue it can also be found at 250 | 251 | http://www.w3.org/2001/xml.xsd. 252 |

253 |

254 | The schema document at that URI may however change in the future, 255 | in order to remain compatible with the latest version of XML 256 | Schema itself, or with the XML namespace itself. In other words, 257 | if the XML Schema or XML namespaces change, the version of this 258 | document at 259 | http://www.w3.org/2001/xml.xsd 260 | 261 | will change accordingly; the version at 262 | 263 | http://www.w3.org/2009/01/xml.xsd 264 | 265 | will not change. 266 |

267 |

268 | Previous dated (and unchanging) versions of this schema 269 | document are at: 270 |

271 | 281 |
282 |
283 |
284 |
285 | 286 |
-------------------------------------------------------------------------------- /data/schemas/xhtml-lat1.ent: -------------------------------------------------------------------------------- 1 | 6 | 12 | 13 | 15 | 16 | 17 | 18 | 19 | 20 | 22 | 23 | 25 | 26 | 27 | 29 | 31 | 33 | 35 | 37 | 38 | 40 | 42 | 44 | 46 | 47 | 49 | 51 | 52 | 54 | 56 | 58 | 60 | 62 | 64 | 66 | 69 | 71 | 73 | 75 | 77 | 80 | 83 | 85 | 87 | 89 | 91 | 93 | 95 | 97 | 99 | 101 | 102 | 104 | 106 | 108 | 110 | 112 | 114 | 115 | 118 | 120 | 122 | 124 | 126 | 128 | 130 | 132 | 135 | 137 | 139 | 141 | 143 | 146 | 148 | 150 | 152 | 154 | 156 | 158 | 160 | 162 | 164 | 166 | 167 | 169 | 171 | 173 | 175 | 177 | 179 | 180 | 183 | 185 | 187 | 189 | 191 | 193 | 195 | 197 | -------------------------------------------------------------------------------- /data/schemas/xhtml-symbol.ent: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 15 | 16 | 23 | 24 | 25 | 27 | 28 | 29 | 30 | 31 | 33 | 35 | 36 | 37 | 38 | 40 | 41 | 42 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 53 | 54 | 56 | 58 | 59 | 61 | 63 | 64 | 66 | 67 | 69 | 71 | 73 | 74 | 75 | 77 | 78 | 80 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 90 | 92 | 93 | 95 | 96 | 97 | 98 | 100 | 102 | 104 | 105 | 106 | 107 | 109 | 110 | 112 | 113 | 115 | 117 | 118 | 119 | 120 | 122 | 124 | 126 | 127 | 129 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 140 | 141 | 144 | 145 | 147 | 150 | 151 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 161 | 162 | 163 | 164 | 166 | 168 | 169 | 171 | 172 | 173 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 186 | 188 | 189 | 191 | 192 | 193 | 194 | 196 | 197 | 198 | 199 | 200 | 202 | 204 | 206 | 208 | 209 | 210 | 211 | 212 | 214 | 215 | 217 | 218 | 220 | 222 | 224 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 235 | 237 | 238 | -------------------------------------------------------------------------------- /test/data/html4-strict.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debian -- Le système d'exploitation universel 6 | 7 | 8 | 9 | 10 | 11 | 12 | 14 | 16 | 17 | 18 | 19 | 20 | 41 | 42 |
43 |
44 |
45 | 46 | 94 |
95 |

96 | 97 |

98 |
99 |

100 | Le site de notre sponsor 101 |

102 |

103 | Valid HTML 4.01! 104 |

105 |

106 | Valid CSS! 108 |

109 | 110 |
111 |
112 | Debian 5.0 - Le système d'exploitation universel 113 |

Qu'est-ce que Debian ?

114 |

Debian est un système d'exploitation 115 | libre pour votre ordinateur. Un système d'exploitation 116 | est la suite des programmes de base et des utilitaires qui permettent à un 117 | ordinateur de fonctionner. Debian utilise le noyau 118 | Linux (le cœur d'un système d'exploitation), 119 | mais la plupart des outils de base du système proviennent du 120 | projet GNU ; d'où le nom GNU/Linux.

121 |

Debian GNU/Linux est bien plus qu'un simple système d'exploitation : 122 | il contient plus de 25000 123 | paquets ; les paquets sont des composants 124 | logiciels précompilés conçus pour s'installer facilement sur votre machine.

125 |

Suite...

126 |
127 |

Pour commencer

128 |

La dernière version stable de Debian est 129 | la 5.0. La dernière mise à jour de cette version a été publiée 130 | le 4 septembre 2010. Vous pouvez aussi accéder aux 131 | autres versions disponibles de Debian.

132 |

Si vous souhaitez commencer à utiliser Debian, vous pouvez facilement 133 | en obtenir une copie, et ensuite suivre les 134 | instructions d'installation 135 | pour l'installer.

136 |

Si vous mettez à niveau votre système depuis une ancienne version vers 137 | la dernière version stable publiée, veuillez lire les 138 | notes de publication 139 | avant de commencer.

140 |

Pour obtenir de l'aide concernant l'utilisation ou la configuration 141 | de Debian, consultez nos pages sur la documentation 142 | et l'assistance.

143 |

Les utilisateurs qui parlent une langue autre que l'anglais peuvent 144 | consulter la section sur l'international.

145 |

Les personnes ayant un autre système qu'Intel x86 peuvent 146 | consulter la section sur les portages.

147 |
148 |

Actualités

149 |

[19 octobre 2010] Debian sur le point d'accueillir officiellement les contributeurs non empaqueteurs
150 | [7 octobre 2010] Debian à la rencontre de la Society for Neuroscience
151 | [8 septembre 2010] Paris Mini-DebConf 2010
152 | [5 septembre 2010] Le service de rétroportages (« backports ») devient officiel
153 | [4 septembre 2010] Publication de la mise à jour de Debian GNU/Linux 5.0.6
154 | [3 septembre 2010] Conférence 2010 de la communauté Debian italienne - du 17 au 19 septembre à Pérouse, Italie
155 |

156 |

Pour les communiqués plus anciens, consultez la suite de la page actualités. 157 | Si vous voulez recevoir un courrier (en anglais) à chaque fois qu'un communiqué paraît, abonnez-vous 158 | à la liste de diffusion debian-announce.

159 |
160 |

Annonces de sécurité

161 |

[22 octobre 2010] DSA-2122 glibc - missing input sanitization
162 | [19 octobre 2010] DSA-2121 typo3-src - several vulnerabilities
163 | [12 octobre 2010] DSA-2120 postgresql-8.3 - privilege escalation
164 | [12 octobre 2010] DSA-2119 poppler - several vulnerabilities
165 | [8 octobre 2010] DSA-2118 subversion - logic flaw
166 | [4 octobre 2010] DSA-2117 apr-util - denial of service
167 | [4 octobre 2010] DSA-2116 freetype - integer overflow
168 | [29 septembre 2010] DSA-2115 moodle - several vulnerabilities
169 | [26 septembre 2010] DSA-2114 git-core - buffer overflow
170 | [20 septembre 2010] DSA-2113 drupal6 - several vulnerabilities
171 | [20 septembre 2010] DSA-2112 bzip2 - integer overflow
172 |

173 |

Pour les annonces de sécurité, consultez la 174 | page sécurité. 175 | Si vous voulez recevoir les annonces de sécurité (en anglais) dès leur parution, abonnez-vous 176 | à la liste de diffusion debian-security-announce.

177 |
178 |
179 |
180 | 237 |
238 | 239 | 240 | -------------------------------------------------------------------------------- /History.md: -------------------------------------------------------------------------------- 1 | 2 | 1.12.0 / 2022-11-15 3 | =================== 4 | 5 | * Regen manpages 6 | * Bump year 7 | * Fix lint Metrics/AbcSize on check_page 8 | * Fix test/static_test.rb with relative path 9 | * Add relative link test fail 10 | * Fix test/static_test.rb with relative path 11 | * Nokogumbo is merged into nokogiri 1.12~ 12 | 13 | 1.11.1 / 2021-01-10 14 | =================== 15 | 16 | * Add nokogumbo doc 17 | * nokogumbo support for ruby engine 18 | * Add support for nokogumbo 19 | * Add jruby to github actions 20 | * Merge pull request #24 from @marocchino / patch-1 21 | * Some minor improvements 22 | * Remove travis ci 23 | * Init github actions ci 24 | 25 | 1.11.0 / 2021-01-08 26 | =================== 27 | 28 | * Merge pull request #23 from @marocchino / ruby-3-support 29 | * Use webrick's escape instead of encode_www_form_component 30 | * Support ruby 3 31 | * Fix doc for ValidateWebsite::Core initialize 32 | * Switch to gitlab ci and remove 2.{3,4} support 33 | * Update rubocop to 0.76.0 34 | 35 | 1.10.0 / 2020-07-03 36 | ================== 37 | 38 | * Fix build for Ruby 2.3 and 2.4 39 | * Remove rbx-3 from build 40 | * Remove minitest-focus and fix minitest 6 warnings 41 | * Fix html5_validator option and change html5_validator_service_url 42 | * Add Ruby 2.7 to CI and update jruby 43 | * Update rubocop and fix offences 44 | * Remove Ruby 2.2 support and update rubocop 45 | 46 | 1.9.3 / 2019-04-11 47 | ================== 48 | 49 | * Update tidy_ffi to 1.0 50 | * Avoid testing tidy bug with js 51 | 52 | 1.9.2 / 2019-03-09 53 | ================== 54 | 55 | * Load schema when needed instead of boot 56 | 57 | 1.9.1 / 2019-03-05 58 | ================== 59 | 60 | * Improve start message for static validator 61 | * Update travis config 62 | * Fix bundler to <2 on travis (dropped
support Ruby < 2.3) 63 | 64 | 1.9.0 / 2018-12-25 65 | ================== 66 | 67 | * Update deps paint; slop; webmock 68 | * Remove Ruby 2.1 support 69 | * Force nonet and disable substitute entities on xhtml parse 70 | * Use coveralls for code coverage 71 | 72 | v1.8.1 / 2018-03-25 73 | =================== 74 | 75 | * Change document to local dtd only when needed 76 | * Switch to asciidoctor to generate manpage 77 | 78 | v1.8.0 / 2017-08-24 79 | =================== 80 | 81 | * Update manpages 82 | * Update rubies and jruby on travis 83 | * Remove encoding and other Style/SymbolArray fixes 84 | * Add tidy validator for html5 85 | 86 | v1.7.0 / 2017-04-08 87 | =================== 88 | 89 | * Use w3c_validators and remove multipart_body gem 90 | * Drop Ruby 2.0.0 91 | * Update webmock to 2.3 92 | * Load xsd on init to avoid conflicting chdir 93 | 94 | v1.6.0 / 2016-09-24 95 | =================== 96 | 97 | * Update spidr and rubocop 98 | * Use more upto date validator service and https 99 | 100 | v1.5.8 / 2016-06-27 101 | =================== 102 | 103 | * Update manpages 104 | * Exclude urls on static crawl 105 | * Fix warning: instance variable not initialized 106 | 107 | v1.5.7 / 2016-06-13 108 | =================== 109 | 110 | * Fix JRuby CI build 111 | * Update JRuby and other rubies on Travis 112 | * Update rubocop to v0.40 113 | * Add env variable VALIDATOR_NU_URL support 114 | 115 | v1.5.6 / 2016-03-02 116 | =================== 117 | 118 | * Fix error with redirect and extract images (see #16) 119 | * Display version number 120 | * Rubocop fixes 121 | * Updates deps 122 | 123 | v1.5.4 / 2015-12-08 124 | =================== 125 | 126 | * Fix URI must be ascii only error 127 | 128 | v1.5.3 / 2015-11-08 129 | =================== 130 | 131 | * Fix test_files on gemspec 132 | * Use File.expand_path for jruby 133 | * Update minitest and webmock 134 | * Capture output on spec 135 | * Move jruby-9.0.0.0 to allow failures 136 | * Added jruby-9.0.0.0 to travis 137 | * 
Options hash is mandatory on ValidateWebsite::Core 138 | * Added rubocop on default rake task 139 | * Remove unnecessary spacing 140 | * Rakefile: add --display-style-guide option to rubocop 141 | 142 | v1.5.0 / 2015-07-27 143 | =================== 144 | 145 | * Bump to 1.5.0 146 | * Added license badge 147 | * Document --css-syntax option 148 | * Fix --pattern option only string 149 | * Extract CSS methods to Utils class 150 | * Added css_syntax option checking css errors 151 | * Call method only on :not_found enabled 152 | * Rename spec to test 153 | * Add inch documentation badge 154 | * Use Crass gem to extract urls 155 | * Update README 156 | * Only display cop on task 157 | * Fix rubocop build and add custom task 158 | 159 | v1.1.0 / 2015-07-07 160 | =================== 161 | 162 | * Bump to 1.1.0 163 | * Enable rubocop on travis build 164 | * Fix default_args method has too many lines 165 | * Fix crawl#spidr_crawler ABC size 166 | * Fix Core#validate ABC size 167 | * Fix Static#check_static_file ABC size 168 | * Fix Static#crawl ABC size 169 | * Fix check_static_not_found css urls 170 | * Refactor check_static_not_found method 171 | * Syntax fix: use next in Enumerator 172 | * README: typo 173 | * Refactor: create ValidateWebsite::{Static,Crawl} classes 174 | * Refactor Validator 175 | * Syntax fixes 176 | * Syntax fixes 177 | * Update travis 178 | * Fix markup option 179 | * Cleanup default options 180 | * Better args options manage between crawl and static 181 | * Fix jruby ignore tests 182 | * Move on stop support Ruby 1.9 183 | * Oops, forgot spec data 184 | * Fix ignore option for static crawl and non html5 185 | * Use slop 4.2 186 | 187 | v1.0.5 / 2015-05-25 188 | =================== 189 | 190 | * Bump to 1.0.5 191 | * Added option html5-validator-service-url 192 | * Update paint to 1.0 193 | * Add ruby-head to travis 194 | * Remove docker stuff [ci skip] 195 | * Allow customize html5 validator service url 196 | 197 | v1.0.4 / 2015-03-10 198 | 
=================== 199 | 200 | * Bump to 1.0.4 201 | * Fix issue #12 with excessive requests to validator.nu 202 | * Added failing test for issue #12 203 | 204 | v1.0.3 / 2015-02-27 205 | =================== 206 | 207 | * Bump to 1.0.3 208 | * Fix static not found with anchor link (see #14) 209 | * Added fig config 210 | * bundle update 211 | * travis: added 2.2.0 version 212 | 213 | v1.0.2 / 2015-02-18 214 | =================== 215 | 216 | * Bump to 1.0.2 217 | * Fix issue #13 218 | * Added failing test for issue #13 219 | * Bump year on LICENSE file 220 | 221 | v1.0.1 / 2015-02-15 222 | =================== 223 | 224 | * Bump to 1.0.1 225 | * Revert "Remove shebang its already handle by RubyGems" 226 | * Fix html5 validator service url (see #11) 227 | * Update year and manpages 228 | * Remove shebang its already handle by RubyGems 229 | * spec/core_spec.rb: codestyle 230 | * Use each_with_object instead of inject 231 | * Ignore asciidoc generated files 232 | * Extract spidr_crawler for less complexity in crawl 233 | * Improve jekyll sample code 234 | * Merge pull request #10 from marocchino/improve-readme 235 | * Improve jekyll sample code 236 | 237 | v1.0.0 / 2014-10-18 238 | =================== 239 | 240 | * Bump to 1.0.0 :exclamation: 241 | * Can set cookies from command line 242 | * Can set cookies 243 | * Documentation update 244 | * Options notfound => not_found 245 | * Can change user-agent 246 | * Move internet connection check to private 247 | * use next instead of return for check static links 248 | * update screenshot 249 | * rubocop fixes (complexity, line too long) 250 | * remove matcher rspec (obsolete) 251 | * fix not found on static webpage 252 | * update linuxfr webpage and add static for tests 253 | * Fix URI::InvalidURIError 254 | * Fix Errno::ENOENT error 255 | * Make tests fail for static not found 256 | * Use slop for ARGV parsing and remove some options 257 | * Fix not_found_error and print not founds status 258 | * Make tests fail for 
check_static_not_found 259 | * Add status line 260 | 261 | v0.9.5 / 2014-09-23 262 | =================== 263 | 264 | * Bump to 0.9.5 265 | * Change internal verbose option 266 | * Print green dot when quiet 267 | * Fix options parser strings 268 | * Line is too long fix 269 | * Coding style 270 | * Replace class var with a class instance var 271 | * Use next to skip iteration 272 | * Use a guard clause instead of wrapping the code 273 | * spec wrong validation_type 274 | * Prefer `$ERROR_INFO` from the English library over `$!` 275 | * Use fail instead of raise to signal exceptions 276 | * Coding style fix 277 | 278 | v0.9.0 / 2014-09-20 279 | =================== 280 | 281 | * Bump to 0.9.0 282 | * documentation update 283 | * README: add Jekyll static site validation task 284 | * move crawler from anemone to spidr gem 285 | 286 | v0.8.1 / 2014-09-18 287 | =================== 288 | 289 | * bump to 0.8.1 290 | * fix require set 291 | 292 | v0.8.0 / 2014-09-18 293 | =================== 294 | 295 | * gemspec: fix pessimistic dependency 296 | * gemspec: fix open-ended deps and bump to 0.8.0 297 | * travis: remove jruby-head 298 | * README cleanup 299 | * README added badges and screenshot 300 | * spec/validator_spec.rb: cleanup 301 | * fix jruby build use Nokogiri::HTML intead of Nokogiri::XML 302 | * travis: cache bundler 303 | * move http testing to webmock 304 | * travis: added config 305 | * use set instead of array for links 306 | * fix: use HTML5_VALIDATOR_SERVICE 307 | * validate_website/core: code quality crawl 308 | * validate_website/core: code quality extract_urls_from_img_script_iframe_link 309 | * validate_website/core: code quality internet connection 310 | * gemspec: added pry for development 311 | * explanatory comments for classes 312 | * validate_website/validator: code quality 313 | * Change color gem from rainbow to paint 314 | * Fix html5 validator spec 315 | * README: rubygems package dont exist anymore 316 | * Added some comment 317 | 318 | 
v0.7.9 / 2013-03-18 319 | =================== 320 | 321 | * Bump to v0.7.9 322 | * html5: change host because having some timeout 323 | * README: more readeable 324 | * README: use markdown 325 | * Added info about internet_connection. 326 | * Indent fakeweb_helper. 327 | 328 | v0.7.7 / 2012-07-23 329 | =================== 330 | 331 | * Bump to v0.7.7 332 | * Update doc: Use dependency package default Ruby version 333 | * Add ignore_errors option on validate-website-static 334 | * Add contributors and incr year. 335 | 336 | v0.7.6 / 2012-04-18 337 | =================== 338 | 339 | * Bump version to 0.7.6 340 | * Documentation for --ignore-errors 341 | * Merge default opts on crawl and static validator. 342 | * Add spec for :ignore_errors option 343 | * Add -i option for ignoring certain validation err 344 | 345 | v0.7.5 / 2012-02-07 346 | =================== 347 | 348 | * Bump version to 0.7.5 349 | * Ignore *.gem files. 350 | * Add rspec matcher be_w3c_valid 351 | * Get errors from http://validator.nu for HTML5 352 | * Add encodings. 353 | * README fixes. 354 | 355 | v0.7.1 / 2011-12-25 356 | =================== 357 | 358 | * Bump version to 0.7.1 359 | * Make test fail for issue #4 360 | * Merge pull request #4 from nono/patch-1 361 | * Merge pull request #5 from GunioRobot/clean 362 | * Remove whitespace [Gun.io WhitespaceBot] 363 | * Update lib/validate_website/core.rb 364 | * Move to minitest 365 | * Requirement fixes for tests 366 | * Quiet in tests 367 | * [Documentation] Validator for use on other application. 368 | * Remove rubygems hooks, use bundler. 369 | 370 | v0.7.0 / 2011-06-06 371 | =================== 372 | 373 | * Bump version to 0.7.0 374 | * Check CSS files urls for static files 375 | * Cleanup, useless body variable and not_found check 376 | * Same options parse for static and crawl 377 | * Document --site option for validate-website-static. 
378 | * Move to private validate extract_urls check_static_not_found 379 | * Move crawl static logic to Core class and extract urls from img script iframe 380 | * Opps exit status 64 already used for failure markup. 381 | * Add --color, --no-color options. 382 | * Rescue on missing arg or invalid options parse. 383 | 384 | v0.6.5 / 2011-06-05 385 | =================== 386 | 387 | * Bump version to 0.6.5 388 | * Add some todos. 389 | * Update dependencies. 390 | * Use gemspec for build validate-website gem. 391 | * README updates. 392 | * HTML5 support using Validator.nu Web Service. 393 | * Merge branch 'master' of github.com:spk/validate-website 394 | * add alias for task spec 395 | * README fix space 396 | 397 | v0.6.1 / 2011-04-11 398 | =================== 399 | 400 | * Bump version to 0.6.1 401 | * update doc and README 402 | * Add :markup_validation and :not_found to validate-website-static 403 | * add contributors, it is never too late 404 | * follow recommendation from rubygems-test 405 | * share to data directory 406 | * Add Gemfile (bundler) 407 | 408 | v0.6.0 / 2010-12-26 409 | =================== 410 | 411 | * Bump version to 0.6.0 412 | * Add Runner class for executables 413 | * Add option parser and document validate-website-static 414 | * Can pass Hash options to ValidateWebsite::Core 415 | * Add ValidateWebsite module to avoid conflicts 416 | * Update README requirements 417 | 418 | v0.5.7 / 2010-12-10 419 | =================== 420 | 421 | * Add validate-website-static executable 422 | * Cleanup: remove spk-html5 and use upstream anemone 423 | * ValidateWebsite code improvement for options 424 | * Change Validator initialize argument 425 | * Add linuxfr html5 page (should be valid) 426 | 427 | v0.5.3 / 2010-12-05 428 | =================== 429 | 430 | * Bump version to 0.5.3 431 | * Add -q, --quiet option (Only report errors) 432 | * Improve installation documentation for Debian users 433 | * print note on validating website 434 | * rename internal 
option :error_verbose to :validate_verbose 435 | 436 | v0.5.2 / 2010-11-05 437 | =================== 438 | 439 | * Bump version to 0.5.2 440 | * Using my fork of html5 Rubygem 441 | * Show line for html5 parser errors 442 | 443 | v0.5.1 / 2010-11-04 444 | =================== 445 | 446 | * Bump version to 0.5.1 447 | * Fix issue with 1.9.2 and CSS url (use first instead of to_s) 448 | * Move get_url to private access 449 | * Better requirement and remove require 'rubygems' from spec/spec_helper.rb 450 | 451 | v0.5.0 / 2010-11-01 452 | =================== 453 | 454 | * Bump version to 0.5.0 455 | * Change exit status 456 | * Fix html4 validation by falling back to dtd validation 457 | * Add failing test on html4 strict 458 | * Update documentation 459 | * Sync options with anemone 460 | * Improve documentation and add manpage 461 | * Add experimental html5 support 462 | * Show properly errors with verbose option 463 | * Update RSpec to version 2.0 and add spec task 464 | 465 | v0.4.1 / 2010-10-24 466 | =================== 467 | 468 | * Bump version to 0.4.1 469 | * Move to_file to private access 470 | * Pass missing options to crawl (see on github #2) 471 | * Add Validator spec file, rename and add html test on validate_website_spec 472 | 473 | v0.4.0 / 2010-09-14 474 | =================== 475 | 476 | * Bump version to 0.4.0 477 | * add lib/xhtml/xhtml-basic11.dtd file 478 | * lib/validator.rb: cleanup and rescue on Nokogiri::XML::SyntaxError 479 | * Add --[no-]markup-validation option 480 | * typo capitalize help 481 | * added debug options for anemone, and verbose option for validator errors 482 | * include ColorfulMessages on ValidateWebsite class 483 | 484 | v0.3.5 / 2010-08-25 485 | =================== 486 | 487 | * Bump version to 0.3.5 and add spec directory to pkg files 488 | * Add default for ValidateWebsite initialize and crawl opts 489 | * added test on css 490 | * added development dependency: rspec and fakeweb 491 | * Refactor validate website and crawl 
url in css 492 | * updated README.rdoc 493 | * added option -c for adding cookies 494 | * added verbose option 495 | * lib/validate_website.rb: bug fix on bad uri case bin/validate-website: minor change, use «unless» instead of «if not» 496 | * search 404 in img, link, script and iframe tags 497 | * Rename README to README.rdoc 498 | * Update readme and gem spec 499 | * Add not_found option (thanks to François de Metz) 500 | * exit code depends on validation result 501 | * only try to validate html file 502 | * fix some ruby 1.9 issue 503 | * fix some validation issue with no dtd or xsd 504 | * update readme 505 | * move to anemone web-spider, and use XML Schema for validation of XHTML 506 | * add optparse options 507 | * create a gem 508 | * initial commit 509 | --------------------------------------------------------------------------------