├── .github └── workflows │ └── test.yml ├── MANIFEST ├── MANIFEST.SKIP ├── META.yml ├── Makefile.PL ├── NEWS ├── README.md ├── SIGNATURE ├── bin ├── checklink └── checklink.pod ├── docs ├── checklink.html ├── linkchecker.css ├── linkchecker.js └── tests │ ├── links_borked.html │ ├── links_ok.html │ └── links_redirects.html ├── etc ├── checklink.conf └── perltidyrc ├── images ├── double.png ├── grad.png ├── head-bl.png ├── head-br.png ├── info_icons │ ├── README │ ├── error.png │ ├── info.png │ └── warning.png ├── no_w3c.png ├── round-br.png ├── round-tr.png ├── textbg.png └── w3c.png ├── t └── 00compile.t └── w3c.json /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Run test suite 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: {} 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | repository: w3c/linkchecker-testsuite 16 | path: test-suite 17 | - uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.7' 20 | - name: Install test suite dependencies 21 | working-directory: test-suite/harness 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install -r requirements.txt 25 | - uses: actions/checkout@v4 26 | with: 27 | path: link-checker 28 | # using https://github.com/shogo82148/actions-setup-perl created 29 | # issue with installing libcss-dom-perl due to 30 | # old syntax in package 31 | # see https://rt.cpan.org/Public/Bug/Display.html?id=146661 32 | - name: Install perl 33 | run: sudo apt-get install -y perl cpanminus libcss-dom-perl libwww-perl 34 | 35 | - name: Install checker dependencies 36 | working-directory: link-checker 37 | run: cpanm --installdeps . 38 | - name: run test server 39 | working-directory: test-suite 40 | run: python server.py & 41 | - name: install bubblewrap 42 | run: sudo apt-get install -y bubblewrap 43 | - name: run test suite 44 | working-directory: test-suite/harness 45 | run: bwrap --dev-bind / / --ro-bind ../hosts /etc/hosts sh -c "python linktest.py --checker_path ../../link-checker/bin/checklink run" -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | Makefile.PL 2 | MANIFEST 3 | META.yml 4 | NEWS Overview of changes between releases 5 | README Start by reading this 6 | SIGNATURE 7 | bin/checklink The link checker 8 | bin/checklink.pod Manual page for the link checker 9 | etc/checklink.conf Optional configuration file 10 | etc/perltidyrc perltidy(1) profile 11 | docs/checklink.html Additional documentation 12 | docs/linkchecker.css Cascading style sheet used in docs and generated HTML 13 | docs/linkchecker.js JavaScript used in the generated HTML 14 | images/double.png 15 | images/grad.png 16 | images/head-bl.png 17 | images/head-br.png 18 | images/no_w3c.png 19 | images/round-br.png 20 | images/round-tr.png 21 | images/textbg.png 22 | images/info_icons/README 23 | images/info_icons/error.png 24 | images/info_icons/info.png 25 | images/info_icons/warning.png 26 | lib/W3C/LinkChecker.pm Dummy *.pm for CPAN indexing purposes 27 | t/00compile.t 28 | -------------------------------------------------------------------------------- /MANIFEST.SKIP: -------------------------------------------------------------------------------- 1 | \.hgignore$ 2 | \.hgtags$ 3 | \bblib\b 4 | \bpm_to_blib\b 5 | \.hg\b 6 | ~$ 7 | \bMANIFEST\.SKIP$ 8 | \bMakefile$ 9 | \bMakefile\.old$ 10 | \.tar\. 11 | \bREADME\.cvs$ 12 | \bdocs/tests\b 13 | \bimages/w3c\.png$ 14 | -------------------------------------------------------------------------------- /META.yml: -------------------------------------------------------------------------------- 1 | --- #YAML:1.0 2 | name: W3C-LinkChecker 3 | version: 4.81 4 | abstract: W3C Link Checker 5 | author: 6 | - W3C QA-dev Team 7 | license: open_source 8 | distribution_type: module 9 | configure_requires: 10 | ExtUtils::MakeMaker: 0 11 | build_requires: 12 | ExtUtils::MakeMaker: 0 13 | requires: 14 | CGI: 0 15 | CGI::Carp: 0 16 | CGI::Cookie: 0 17 | Config::General: 2.06 18 | CSS::DOM: 0.09 19 | CSS::DOM::Constants: 0 20 | CSS::DOM::Style: 0 21 | CSS::DOM::Util: 0 22 | Encode: 0 23 | Encode::Locale: 0 24 | File::Spec: 0 25 | Getopt::Long: 2.17 26 | HTML::Entities: 0 27 | HTML::Parser: 3.40 28 | HTTP::Cookies: 0 29 | HTTP::Headers::Util: 0 30 | HTTP::Message: 5.827 31 | HTTP::Request: 0 32 | HTTP::Response: 1.50 33 | Locale::Country: 0 34 | Locale::Language: 0 35 | LWP::RobotUA: 1.19 36 | LWP::UserAgent: 0 37 | Net::hostent: 0 38 | Net::HTTP::Methods: 5.833 39 | Net::IP: 0 40 | Socket: 0 41 | Term::ReadKey: 2 42 | Test::More: 0 43 | Text::Wrap: 0 44 | Time::HiRes: 0 45 | URI: 1.53 46 | URI::Escape: 0 47 | URI::file: 0 48 | resources: 49 | bugtracker: https://github.com/w3c/link-checker/issues/ 50 | homepage: https://validator.w3.org/checklink 51 | MailingList: https://lists.w3.org/Archives/Public/www-validator/ 52 | repository: https://github.com/w3c/link-checker/ 53 | no_index: 54 | directory: 55 | - t 56 | - inc 57 | generated_by: ExtUtils::MakeMaker version 6.56 58 | meta-spec: 59 | url: http://module-build.sourceforge.net/META-spec-v1.4.html 60 | version: 1.4 61 | -------------------------------------------------------------------------------- /Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.008; 2 | use ExtUtils::MakeMaker; 3 | 4 | WriteMakefile( 5 | NAME => 'W3C::LinkChecker', 6 | ABSTRACT => 'W3C Link Checker', 7 | AUTHOR => 'W3C QA-dev Team ', 8 | LICENSE => 'open_source', 9 | VERSION_FROM => 'bin/checklink', 10 | PREREQ_PM => { 11 | 12 | # Hard dependencies: 13 | CSS::DOM => 0.09, 14 | CSS::DOM::Constants => 0, 15 | CSS::DOM::Style => 0, 16 | CSS::DOM::Util => 0, 17 | Encode => 0, 18 | HTML::Entities => 0, 19 | HTML::Parser => "3.40", 20 | HTTP::Headers::Util => 0, 21 | HTTP::Message => 5.827, 22 | HTTP::Request => 0, 23 | HTTP::Response => "1.50", 24 | LWP::RobotUA => 1.19, 25 | LWP::UserAgent => 0, 26 | Net::HTTP::Methods => 5.833, 27 | Time::HiRes => 0, 28 | URI => 1.53, 29 | URI::Escape => 0, 30 | 31 | # Optional, but required if using a config file: 32 | Config::General => 2.06, 33 | 34 | # Optional, but required if private IPs are disallowed: 35 | Net::hostent => 0, 36 | Net::IP => 0, 37 | Socket => 0, 38 | 39 | # Optional, but required in command line mode: 40 | File::Spec => 0, 41 | Getopt::Long => 2.17, 42 | Text::Wrap => 0, 43 | URI::file => 0, 44 | 45 | # Optional, used for password input in command line mode: 46 | Term::ReadKey => 2.00, 47 | 48 | # Optional, used for guessing language in command line mode: 49 | Locale::Country => 0, 50 | Locale::Language => 0, 51 | 52 | # Optional, used when decoding arguments in command line mode: 53 | Encode::Locale => 0, 54 | 55 | # Optional, but required in CGI mode: 56 | CGI => 0, 57 | CGI::Carp => 0, 58 | CGI::Cookie => 0, 59 | 60 | # Optional, required if using cookies: 61 | HTTP::Cookies => 0, 62 | 63 | # Required for the test suite: 64 | File::Spec => 0, 65 | Test::More => 0, 66 | }, 67 | PM => {'lib/W3C/LinkChecker.pm' => '$(INST_LIB)/W3C/LinkChecker.pm'}, 68 | EXE_FILES => ['bin/checklink'], 69 | MAN1PODS => 70 | {'bin/checklink.pod' => '$(INST_MAN1DIR)/checklink.$(MAN1EXT)',}, 71 | META_MERGE => { 72 | resources => { 73 | homepage => 'https://validator.w3.org/checklink', 74 | bugtracker => 'https://github.com/w3c/link-checker/issues/', 75 | repository => 'https://github.com/w3/link-checker/', 76 | MailingList => 77 | 'https://lists.w3.org/Archives/Public/www-validator/', 78 | }, 79 | }, 80 | depend => {distdir => 'lib/W3C/LinkChecker.pm'}, 81 | dist => {TARFLAGS => '--owner=0 --group=0 -cvf'}, 82 | clean => {FILES => 'Makefile.PL.bak bin/checklink.bak'}, 83 | ); 84 | 85 | sub MY::postamble 86 | { 87 | return <<'MAKE_FRAG'; 88 | lib/W3C/LinkChecker.pm: Makefile.PL bin/checklink 89 | $(MKPATH) lib/W3C 90 | $(ECHO) "# Dummy module for CPAN indexing purposes." > $@ 91 | $(ECHO) "package $(NAME);" >> $@ 92 | $(ECHO) "use strict;" >> $@ 93 | $(ECHO) "use vars qw(\$$VERSION);" >> $@ 94 | $(ECHO) "\$$VERSION = \"$(VERSION)\";" >> $@ 95 | $(ECHO) "1;" >> $@ 96 | 97 | PERLTIDY = perltidy --profile=etc/perltidyrc --backup-and-modify-in-place \ 98 | --backup-file-extension=/ 99 | 100 | perltidy: 101 | @for file in Makefile.PL bin/checklink ; do \ 102 | echo "$(PERLTIDY) $$file" ; \ 103 | $(PERLTIDY) $$file ; \ 104 | done 105 | MAKE_FRAG 106 | } 107 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | This document contains information about high level changes between 2 | Link Checker releases. 3 | 4 | Version 4.81 - 2011-10-16 5 | - Work around some related problems (#12720, rt.cpan.org#54361). 6 | - Eliminate some warnings (emitted by code, not from results). 7 | 8 | Version 4.8 - 2011-04-02 9 | - Avoid some robot delays by improving the order in which links are checked. 10 | - Avoid some unnecessary HEAD requests in recursive mode. 11 | - Clarify output wrt. links that have already been checked. 12 | - Make connection cache size configurable, and increase the default to 2. 13 | - Move JavaScript to an external file. 14 | - Check applet and object archive links. 15 | 16 | Version 4.7 - 2011-03-17 17 | - Support for IRI. 18 | - Support for more HTML5 links. 19 | - Decode query string parameters as UTF-8. 20 | - Decode command line arguments according to locale. 21 | - New dependencies: Encode-Locale (command line mode only). 22 | - Updated dependencies: libwww-perl >= 5.833, URI >= 1.53. 23 | 24 | Version 4.6 - 2010-05-01 25 | - Support for checking links in CSS. 26 | - Results UI improvements, added "progress bar". 27 | - Support for larger variety of character and content encodings. 28 | - Support for HTTP responses with > 4kB header lines. 29 | - Additional output suppress options in command line mode. 30 | - Improved heuristics when passed non-absolute URLs. 31 | - Support for cookies (command line only for now). 32 | - More "false positive" failure avoidance efforts for "make test". 33 | - The set of forbidden protocols is now configurable. 34 | - New dependencies: CSS-DOM >= 0.09. 35 | - Updated dependencies: Perl >= 5.8. 36 | 37 | Version 4.5 - 2009-03-30 38 | - Removed W3C trademarked icons from distribution tarball. 39 | - Avoid "false positive" failures from "make test" in certain setups. 40 | - Make quiet command line mode quieter. 41 | - Lowered default timeout to 30 seconds. 42 | 43 | Version 4.4 - 2009-02-12 44 | - checking more elements and attributes, such as BLOCKQUOTE cite="", BODY 45 | background="", EMBED, etc 46 | - Changes in the UI to make it match other validators more closely 47 | - in HTML/cgi output, using javascript to show checklink status as it happens 48 | - added support for HTML5 links 49 | - softer wording for broken link results 50 | - Add non-robot developer mode 51 | - many bug fixes and code cleanup 52 | 53 | Version 4.3 - 2006-10-22 54 | - Various minor improvements to result output, both in text and HTML modes. 55 | - Fix --quiet and checking multiple documents to match documentation. 56 | - Eliminate various warnings (emitted by code, not from results). 57 | - Documentation improvements. 58 | 59 | Version 4.2.1 - 2005-05-15 60 | - Include documentation of the reorganized access keys. 61 | 62 | Version 4.2 - 2005-04-27 63 | - Access key reorganization, making them less likely to conflict with 64 | browsers' "native" key bindings. 65 | - Redirects are now checked for private IP addresses too. 66 | 67 | Version 4.1 - 2004-11-24 68 | - Added workarounds against browser timeouts in "summary only" mode. 69 | - Improved caching and reuse of fetched /robots.txt information. 70 | - Fixed a bug where a complete protocol response (including headers) 71 | was passed to the HTML parser, which caused unexpected behaviour. 72 | - Minor user interface and installation related improvements. 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # W3C-LinkChecker 2 | 3 | This distribution contains the W3C Link Checker. 4 | 5 | The link checker can be run as a CGI script in a web server as well as 6 | on the command line. The CGI version provides a HTML interface as 7 | seen at [http://validator.w3.org/checklink](http://validator.w3.org/checklink). 8 | 9 | 10 | ## Install 11 | 12 | To install the distribution for command line use: 13 | 14 | ```sh 15 | git clone https://github.com/w3c/link-checker.git 16 | cd link-checker 17 | #if you have cpanminus installed 18 | cpanm --installdeps . 19 | perl Makefile.PL 20 | make 21 | make test 22 | make install # as root unless you are using local::lib 23 | ``` 24 | 25 | To install the CGI version, in addition to the above, copy the 26 | bin/checklink script into a location in your web server from where 27 | execution of CGI scripts is allowed, and make sure that the web server 28 | user has execute permissions to the script. The CGI directory is 29 | typically named "cgi-bin" somewhere under your web server root 30 | directory. 31 | 32 | For more information, please consult the POD documentation in the 33 | checklink.pod file, typically (in the directory where you unpacked the 34 | source): 35 | 36 | ```sh 37 | perldoc ./bin/checklink.pod 38 | ``` 39 | 40 | ...as well as the HTML documentation in docs/checklink.html. 41 | 42 | ## As a Docker container 43 | 44 | You may want to use [@stupchiy](https://github.com/stupchiy)'s [Dockerfile](https://github.com/stupchiy/docker-w3c-checklink/blob/master/Dockerfile), which is 45 | based on Ubuntu Linux, and follow his instructions: 46 | 47 | ```sh 48 | $ docker build -t link-checker . # Build an image 49 | $ docker run -it --rm link-checker # Run a container 50 | $ docker run -it --rm link-checker checklink https://foo.bar # Run script directly 51 | $ docker run -it --rm -v "$PWD":/home/checklink link-checker checklink -H https://foo.bar > report.html # Write to HTML file 52 | ``` 53 | 54 | ## Useful links 55 | 56 | - [Archived bug list](https://www.w3.org/Bugs/Public/buglist.cgi?product=LinkChecker&component=checklink) (Decommissioned Bugzilla) 57 | - [www-validator MailingList](https://lists.w3.org/Archives/Public/www-validator/) used to discuss the W3C Markup Validation Service, Link Checker and Log Validator. 58 | 59 | ## Copyright and License 60 | 61 | Written by the following people for the W3C: 62 | 63 | - Hugo Haas 64 | - Ville Skyttä 65 | - W3C QA-dev Team 66 | 67 | Copyright (C) 1994-2023 World Wide Web Consortium Inc. 68 | All Rights Reserved. This work is 69 | distributed under the [W3C Software License](http://www.w3.org/Consortium/Legal/2015/copyright-software-and-document) in the hope that it 70 | will be useful, but WITHOUT ANY WARRANTY; without even the implied 71 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 72 | -------------------------------------------------------------------------------- /SIGNATURE: -------------------------------------------------------------------------------- 1 | This file contains message digests of all files listed in MANIFEST, 2 | signed via the Module::Signature module, version 0.68. 3 | 4 | To verify the content in this distribution, first make sure you have 5 | Module::Signature installed, then type: 6 | 7 | % cpansign -v 8 | 9 | It will check each file's integrity, as well as the signature's 10 | validity. If "==> Signature verified OK! <==" is not displayed, 11 | the distribution may already have been compromised, and you should 12 | not run its Makefile.PL or Build.PL. 13 | 14 | -----BEGIN PGP SIGNED MESSAGE----- 15 | Hash: SHA1 16 | 17 | SHA1 b075772a968f5694bfbb4ce33eadf26566a25f47 MANIFEST 18 | SHA1 e8175087619cebc9d0e0ead8ca06b9d8ee73b678 META.yml 19 | SHA1 ab9150095a45776c2020e5781d19054c7018da8b Makefile.PL 20 | SHA1 05b3e35c8352063f2c99efa9cd3881c208fa1bb0 NEWS 21 | SHA1 f1f868ea73db7d39ab491ebb50c84de76cce4b44 README 22 | SHA1 619d90efc63090552be8926418d69a0364989501 bin/checklink 23 | SHA1 4406433ae670dd4f7be3f2c76d55aefb239e9bc9 bin/checklink.pod 24 | SHA1 b188063249c820f0aa5a34b5f735e8f334a536e1 docs/checklink.html 25 | SHA1 fa101fed018fc8e41beca63a0a667fb94c10a557 docs/linkchecker.css 26 | SHA1 8fa71b54357c9ed6ac8e01ab600120032d35b080 docs/linkchecker.js 27 | SHA1 92d01a8a6e7edcd200d70492f4e551984b97b7a0 etc/checklink.conf 28 | SHA1 87c74944dbc80b5d6ab8aac1d09419607b15efff etc/perltidyrc 29 | SHA1 bcb7896bee3764f85a03ab14495efc233f70e215 images/double.png 30 | SHA1 ff9a7be7fee245dd81a7dc4124544d692a140119 images/grad.png 31 | SHA1 61aeb3ea5616833678f66c7baa6db373eedcd86b images/head-bl.png 32 | SHA1 bcb7bf006b79106309350bfa578e94af80aed82d images/head-br.png 33 | SHA1 11243aa6b3463dd8d6a9b2e69027e42a1d9480ab images/info_icons/README 34 | SHA1 a54abf3d12f207b81e19ea8ce783d37c6200cf40 images/info_icons/error.png 35 | SHA1 3fd2638079cd0698655614a5a5afc97a976a4af4 images/info_icons/info.png 36 | SHA1 552c52188188f560dc02a03200164de3045ac3f4 images/info_icons/warning.png 37 | SHA1 1631ed7d5b20c2c036e61225854134f0674cb10a images/no_w3c.png 38 | SHA1 401b5fba02d0d8484775a4a77503fa0d136b96ce images/round-br.png 39 | SHA1 9eb1ee6188391715284a3db080e6e92d163864d9 images/round-tr.png 40 | SHA1 cc01bd358bc1d6d42ca350ad0a4a42778ca4440e images/textbg.png 41 | SHA1 7587466f1487eb446fe5da1a70d445e7b33efd36 lib/W3C/LinkChecker.pm 42 | SHA1 962ba9fff082c4087239b55618ada2a8f1564464 t/00compile.t 43 | -----BEGIN PGP SIGNATURE----- 44 | Version: GnuPG v1.4.11 (GNU/Linux) 45 | 46 | iQEVAwUBTpqXHId580Rxl2NsAQJqPAf/XrqTWrlZa9DFkWrnOSxIYsyDGPl14fCl 47 | ohGFL7jBYxdKndEHo2aA7bA95EOypZVxakUIFcizpC5ujbrqasGPEnxinhJQYLqA 48 | S+4G+yzen3DqbbLndd5eIWVLPS5992gXwuLaeZrNFlGv/kG892NSLGfu3JQiePlc 49 | jNNUZ4dvwe+MHSSvs3DEkAPJqeIR7bx55tp+O7n5HX3ab/sqYIaqI2V3tXP/EHFy 50 | PA/Ig9QFQmfB7SY3TFN7iUFuIDRqIQOzC/Ij/WqY1Uj9885zZJvq0GWT/huFvyVG 51 | IGFM+sZp8gr6fr/bkB7de5xCoVUpkCz+mFkIJFQCu1cwJcP9pa81+g== 52 | =W5f6 53 | -----END PGP SIGNATURE----- 54 | -------------------------------------------------------------------------------- /bin/checklink: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -wT 2 | # 3 | # W3C Link Checker 4 | # by Hugo Haas 5 | # (c) 1999-2023 World Wide Web Consortium 6 | # based on Renaud Bruyeron's checklink.pl 7 | # 8 | # This program is licensed under the W3C(r) Software License: 9 | # https://www.w3.org/Consortium/Legal/copyright-software 10 | # 11 | # The documentation is at: 12 | # https://validator.w3.org/checklink/docs/checklink.html 13 | # 14 | # See the Github repository at: 15 | # https://github.com/w3c/link-checker/ 16 | # 17 | # An online version is available at: 18 | # https://validator.w3.org/checklink 19 | # 20 | # Comments and suggestions should be filed on github 21 | 22 | use strict; 23 | use 5.008; 24 | 25 | # Get rid of potentially unsafe and unneeded environment variables. 26 | delete(@ENV{qw(IFS CDPATH ENV BASH_ENV)}); 27 | $ENV{PATH} = undef; 28 | 29 | # ...but we want PERL5?LIB honored even in taint mode, see perlsec, perl5lib, 30 | # http://www.mail-archive.com/cpan-testers-discuss%40perl.org/msg01064.html 31 | use Config qw(%Config); 32 | use lib map { /(.*)/ } 33 | defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB}) : 34 | defined($ENV{PERLLIB}) ? split(/$Config{path_sep}/, $ENV{PERLLIB}) : 35 | (); 36 | 37 | # ----------------------------------------------------------------------------- 38 | 39 | package W3C::UserAgent; 40 | 41 | use LWP::RobotUA 1.19 qw(); 42 | use LWP::UserAgent qw(); 43 | use Net::HTTP::Methods 5.833 qw(); # >= 5.833 for 4kB cookies (#6678) 44 | 45 | # if 0, ignore robots exclusion (useful for testing) 46 | use constant USE_ROBOT_UA => 1; 47 | 48 | if (USE_ROBOT_UA) { 49 | @W3C::UserAgent::ISA = qw(LWP::RobotUA); 50 | } 51 | else { 52 | @W3C::UserAgent::ISA = qw(LWP::UserAgent); 53 | } 54 | 55 | sub new 56 | { 57 | my $proto = shift; 58 | my $class = ref($proto) || $proto; 59 | my ($name, $from, $rules) = @_; 60 | 61 | # For security/privacy reasons, if $from was not given, do not send it. 62 | # Cheat by defining something for the constructor, and resetting it later. 63 | my $from_ok = $from; 64 | $from ||= 'www-validator@w3.org'; 65 | 66 | my $self; 67 | if (USE_ROBOT_UA) { 68 | $self = $class->SUPER::new($name, $from, $rules); 69 | } 70 | else { 71 | my %cnf; 72 | @cnf{qw(agent from)} = ($name, $from); 73 | $self = LWP::UserAgent->new(%cnf); 74 | $self = bless $self, $class; 75 | } 76 | 77 | $self->from(undef) unless $from_ok; 78 | 79 | $self->env_proxy(); 80 | 81 | $self->allow_private_ips(1); 82 | 83 | $self->protocols_allowed([qw(http https ftp)]); 84 | 85 | return $self; 86 | } 87 | 88 | sub allow_private_ips 89 | { 90 | my $self = shift; 91 | if (@_) { 92 | $self->{Checklink_allow_private_ips} = shift; 93 | if (!$self->{Checklink_allow_private_ips}) { 94 | 95 | # Pull in dependencies 96 | require Net::IP; 97 | require Socket; 98 | require Net::hostent; 99 | } 100 | } 101 | return $self->{Checklink_allow_private_ips}; 102 | } 103 | 104 | sub redirect_progress_callback 105 | { 106 | my $self = shift; 107 | $self->{Checklink_redirect_callback} = shift if @_; 108 | return $self->{Checklink_redirect_callback}; 109 | } 110 | 111 | sub simple_request 112 | { 113 | my $self = shift; 114 | 115 | my $response = $self->ip_disallowed($_[0]->uri()); 116 | 117 | # RFC 2616, section 15.1.3 118 | $_[0]->remove_header("Referer") 119 | if ($_[0]->referer() && 120 | (!$_[0]->uri()->secure() && URI->new($_[0]->referer())->secure())); 121 | 122 | $response ||= do { 123 | local $SIG{__WARN__} = 124 | sub { # Suppress some warnings, rt.cpan.org #18902 125 | warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/); 126 | }; 127 | 128 | # @@@ Why not just $self->SUPER::simple_request? 129 | $self->W3C::UserAgent::SUPER::simple_request(@_); 130 | }; 131 | 132 | if (!defined($self->{FirstResponse})) { 133 | $self->{FirstResponse} = $response->code(); 134 | $self->{FirstMessage} = $response->message() || '(no message)'; 135 | } 136 | 137 | return $response; 138 | } 139 | 140 | sub redirect_ok 141 | { 142 | my ($self, $request, $response) = @_; 143 | 144 | if (my $callback = $self->redirect_progress_callback()) { 145 | 146 | # @@@ TODO: when an LWP internal robots.txt request gets redirected, 147 | # this will a bit confusingly fire for it too. Would need a robust 148 | # way to determine whether the request is such a LWP "internal 149 | # robots.txt" one. 150 | &$callback($request->method(), $request->uri()); 151 | } 152 | 153 | return 0 unless $self->SUPER::redirect_ok($request, $response); 154 | 155 | if (my $res = $self->ip_disallowed($request->uri())) { 156 | $response->previous($response->clone()); 157 | $response->request($request); 158 | $response->code($res->code()); 159 | $response->message($res->message()); 160 | return 0; 161 | } 162 | 163 | return 1; 164 | } 165 | 166 | # 167 | # Checks whether we're allowed to retrieve the document based on its IP 168 | # address. Takes an URI object and returns a HTTP::Response containing the 169 | # appropriate status and error message if the IP was disallowed, 0 170 | # otherwise. URIs without hostname or IP address are always allowed, 171 | # including schemes where those make no sense (eg. data:, often javascript:). 172 | # 173 | sub ip_disallowed 174 | { 175 | my ($self, $uri) = @_; 176 | return 0 if $self->allow_private_ips(); # Short-circuit 177 | 178 | my $hostname = undef; 179 | eval { $hostname = $uri->host() }; # Not all URIs implement host()... 180 | return 0 unless $hostname; 181 | 182 | my $addr = my $iptype = my $resp = undef; 183 | if (my $host = Net::hostent::gethostbyname($hostname)) { 184 | $addr = Socket::inet_ntoa($host->addr()) if $host->addr(); 185 | if ($addr && (my $ip = Net::IP->new($addr))) { 186 | $iptype = $ip->iptype(); 187 | } 188 | } 189 | if ($iptype && $iptype ne 'PUBLIC') { 190 | $resp = HTTP::Response->new(403, 191 | 'Checking non-public IP address disallowed by link checker configuration' 192 | ); 193 | $resp->header('Client-Warning', 'Internal response'); 194 | } 195 | 196 | # #defaults to 80 197 | my $port = undef; 198 | $port = $uri->port(); 199 | #whitelist regex if short enough 200 | if ($port !~ m/^(8[0-9]|21|443)$/ && $port < 1024) { 201 | $resp = HTTP::Response->new(403, 202 | 'Checking certain ports disallowed by link checker configuration' 203 | ); 204 | $resp->header('Client-Warning', 'Internal response'); 205 | } 206 | 207 | return $resp; 208 | } 209 | 210 | # ----------------------------------------------------------------------------- 211 | 212 | package W3C::LinkChecker; 213 | 214 | use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION 215 | $DocType $Head $Accept $ContentTypes %Cfg $CssUrl); 216 | 217 | use CSS::DOM 0.09 qw(); # >= 0.09 for many bugfixes 218 | use CSS::DOM::Constants qw(:rule); 219 | use CSS::DOM::Style qw(); 220 | use CSS::DOM::Util qw(); 221 | use Encode qw(); 222 | use HTML::Entities qw(); 223 | use HTML::Parser 3.40 qw(); # >= 3.40 for utf8_mode() 224 | use HTTP::Headers::Util qw(); 225 | use HTTP::Message 5.827 qw(); # >= 5.827 for content_charset() 226 | use HTTP::Request 5.814 qw(); # >= 5.814 for accept_decodable() 227 | use HTTP::Response 1.50 qw(); # >= 1.50 for decoded_content() 228 | use Time::HiRes qw(); 229 | use URI 1.53 qw(); # >= 1.53 for secure() 230 | use URI::Escape qw(); 231 | use URI::Heuristic qw(); 232 | 233 | # @@@ Needs also W3C::UserAgent but can't use() it here. 234 | 235 | use constant RC_ROBOTS_TXT => -1; 236 | use constant RC_DNS_ERROR => -2; 237 | use constant RC_IP_DISALLOWED => -3; 238 | use constant RC_PROTOCOL_DISALLOWED => -4; 239 | use constant RC_MIXEDCONTENT_BLOCK => -5; 240 | 241 | use constant LINE_UNKNOWN => -1; 242 | 243 | use constant MP2 => 244 | (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); 245 | 246 | # Mixed content status based on https://w3c.github.io/webappsec-mixed-content/ 247 | use constant MC_NONE => 0; 248 | use constant MC_OPTIONAL => 1; 249 | use constant MC_CONDITIONAL => 2; 250 | use constant MC_BLOCK => 3; 251 | 252 | # Tag=>[[mixedContentStatus, attribute]] mapping of things we treat as links. 253 | # Note: meta/@http-equiv gets special treatment, see start() for details. 254 | use constant LINK_ATTRS => { 255 | a => [[MC_NONE, 'href']], 256 | 257 | # base/@href intentionally not checked 258 | # https://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi 259 | area => [[MC_NONE, 'href']], 260 | audio => [[MC_OPTIONAL, 'src']], 261 | blockquote => [[MC_NONE, 'cite']], 262 | body => [[MC_OPTIONAL, 'background']], 263 | command => [[MC_OPTIONAL, 'icon']], # no spec says how to handle this 264 | # wrt to mixed content 265 | 266 | # button/@formaction not checked (side effects) 267 | del => [[MC_NONE, 'cite']], 268 | 269 | # @pluginspage, @pluginurl, @href: pre-HTML5 proprietary 270 | embed => [[MC_BLOCK, 'href'], [MC_NONE, 'pluginspage'], [MC_NONE, 'pluginurl'], [MC_BLOCK, 'src']], 271 | 272 | # form/@action not checked (side effects) 273 | frame => [[MC_BLOCK, 'longdesc'], [MC_BLOCK, 'src']], 274 | html => [[MC_BLOCK, 'manifest']], 275 | iframe => [[MC_BLOCK, 'longdesc'], [MC_BLOCK, 'src']], 276 | img => [[MC_BLOCK, 'longdesc'], [MC_CONDITIONAL, 'src']], 277 | 278 | # input/@action, input/@formaction not checked (side effects) 279 | input => [[MC_BLOCK,'src']], 280 | ins => [[MC_NONE, 'cite']], 281 | link => [[MC_CONDITIONAL, 'href']], 282 | object => [[MC_BLOCK, 'data']], 283 | q => [[MC_NONE, 'cite']], 284 | script => [[MC_BLOCK, 'src']], 285 | source => [[MC_OPTIONAL, 'src']], 286 | track => [[MC_BLOCK, 'src']], 287 | video => [[MC_OPTIONAL, 'src'], [MC_BLOCK, 'poster']], 288 | }; 289 | 290 | # Tag=>[separator, subseparator, attributes] mapping of things we treat as lists of links. 291 | use constant LINK_LIST_ATTRS => { 292 | a => [qr/\s+/, qr/$/, [MC_BLOCK, 'ping']], 293 | applet => [qr/[\s,]+/, qr/$/, [MC_BLOCK, 'archive']], 294 | area => [qr/\s+/, qr/$/, [MC_BLOCK, 'ping']], 295 | img => [qr/\s*,\s*/, qr/\s.*$/, [MC_BLOCK, 'srcset']], 296 | head => [qr/\s+/, qr/$/, [MC_NONE, 'profile']], 297 | object => [qr/\s+/, qr/$/, [MC_BLOCK, 'archive']], 298 | source => [qr/\s*,\s*/, qr/\s.*$/, [MC_BLOCK, 'srcset']] 299 | }; 300 | 301 | # TBD/TODO: 302 | # - applet/@code? 303 | # - bgsound/@src? 304 | # - object/@classid? 305 | # - isindex/@action? 306 | # - layer/@background,@src? 307 | # - ilayer/@background? 308 | # - table,tr,td,th/@background? 309 | # - xmp/@href? 310 | 311 | @W3C::LinkChecker::ISA = qw(HTML::Parser); 312 | 313 | BEGIN { 314 | 315 | # Version info 316 | $PACKAGE = 'W3C Link Checker'; 317 | $PROGRAM = 'W3C-checklink'; 318 | $VERSION = '5.0.0'; 319 | $REVISION = sprintf('version %s (c) 1999-2019 W3C', $VERSION); 320 | $AGENT = sprintf( 321 | '%s/%s', 322 | $PROGRAM, $VERSION 323 | ); 324 | 325 | # Pull in mod_perl modules if applicable. 326 | eval { 327 | local $SIG{__DIE__} = undef; 328 | require Apache2::RequestUtil; 329 | } if MP2(); 330 | 331 | my @content_types = qw( 332 | text/html 333 | application/xhtml+xml;q=0.9 334 | application/vnd.wap.xhtml+xml;q=0.6 335 | ); 336 | $Accept = join(', ', @content_types, '*/*;q=0.5'); 337 | push(@content_types, 'text/css', 'text/html-sandboxed'); 338 | my $re = join('|', map { s/;.*//; quotemeta } @content_types); 339 | $ContentTypes = qr{\b(?:$re)\b}io; 340 | 341 | # Regexp for matching URL values in CSS. 342 | $CssUrl = qr/url\(\s*(['"]?)(.*?)\1\s*\)/; 343 | 344 | # 345 | # Read configuration. If the W3C_CHECKLINK_CFG environment variable has 346 | # been set or the default contains a non-empty file, read it. Otherwise, 347 | # skip silently. 348 | # 349 | my $defaultconfig = '/etc/w3c/checklink.conf'; 350 | if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) { 351 | 352 | require Config::General; 353 | Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy 354 | 355 | my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig; 356 | eval { 357 | my %config_opts = ( 358 | -ConfigFile => $conffile, 359 | -SplitPolicy => 'equalsign', 360 | -AllowMultiOptions => 'no', 361 | ); 362 | %Cfg = Config::General->new(%config_opts)->getall(); 363 | }; 364 | if ($@) { 365 | die <<"EOF"; 366 | Failed to read configuration from '$conffile': 367 | $@ 368 | EOF 369 | } 370 | } 371 | $Cfg{Markup_Validator_URI} ||= 'https://validator.w3.org/check?uri=%s'; 372 | $Cfg{CSS_Validator_URI} ||= 373 | 'https://jigsaw.w3.org/css-validator/validator?uri=%s'; 374 | $Cfg{Doc_URI} ||= 'https://validator.w3.org/checklink/docs/checklink.html'; 375 | $Cfg{Doc_Base_URI} ||= '/checklink/docs/'; 376 | $Cfg{Doc_Images_URI} ||= '/checklink/images/'; 377 | 378 | # Untaint config params that are used as the format argument to (s)printf(), 379 | # Perl 5.10 does not want to see that in taint mode. 380 | ($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/); 381 | ($Cfg{CSS_Validator_URI}) = ($Cfg{CSS_Validator_URI} =~ /^(.*)$/); 382 | 383 | $DocType = 384 | ''; 385 | my $css_url = $Cfg{Doc_Base_URI} . 'linkchecker.css'; 386 | my $js_url = $Cfg{Doc_Base_URI} . 'linkchecker.js'; 387 | $Head = 388 | sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url, $js_url); 389 | 390 | 391 | 392 | 393 | EOF 394 | 395 | # Trusted environment variables that need laundering in taint mode. 396 | for (qw(NNTPSERVER NEWSHOST)) { 397 | ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_}; 398 | } 399 | 400 | # Use passive FTP by default, see Net::FTP(3). 401 | $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); 402 | } 403 | 404 | # Autoflush 405 | $| = 1; 406 | 407 | # Different options specified by the user 408 | my $cmdline = !($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/); 409 | my %Opts = ( 410 | Command_Line => $cmdline, 411 | Quiet => 0, 412 | Summary_Only => 0, 413 | Verbose => 0, 414 | Progress => 0, 415 | HTML => 0, 416 | Timeout => 30, 417 | Redirects => 1, 418 | Dir_Redirects => 1, 419 | Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE}, 420 | Cookies => undef, 421 | No_Referer => 0, 422 | Hide_Same_Realm => 0, 423 | Follow_File_Links => 0, 424 | Depth => 0, # < 0 means unlimited recursion. 425 | Sleep_Time => 1, 426 | Connection_Cache_Size => 2, 427 | Max_Documents => 150, # For the online version. 428 | User => undef, 429 | Password => undef, 430 | Base_Locations => [], 431 | Exclude => undef, 432 | Exclude_Docs => undef, 433 | Suppress_Redirect => [], 434 | Suppress_Redirect_Prefix => [], 435 | Suppress_Redirect_Regexp => [], 436 | Suppress_Temp_Redirects => 1, 437 | Suppress_Broken => [], 438 | Suppress_Fragment => [], 439 | Masquerade => 0, 440 | Masquerade_From => '', 441 | Masquerade_To => '', 442 | Trusted => $Cfg{Trusted}, 443 | Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ? 444 | $Cfg{Allow_Private_IPs} : 445 | $cmdline, 446 | ); 447 | undef $cmdline; 448 | 449 | # Global variables 450 | # What URI's did we process? (used for recursive mode) 451 | my %processed; 452 | 453 | # Result of the HTTP query 454 | my %results; 455 | 456 | # List of redirects 457 | my %redirects; 458 | 459 | # Count of the number of documents checked 460 | my $doc_count = 0; 461 | 462 | # Time stamp 463 | my $timestamp = &get_timestamp(); 464 | 465 | # Per-document header; undefined if already printed. See print_doc_header(). 466 | my $doc_header; 467 | 468 | &parse_arguments() if $Opts{Command_Line}; 469 | 470 | my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address 471 | 472 | $ua->conn_cache({total_capacity => $Opts{Connection_Cache_Size}}); 473 | if ($ua->can('delay')) { 474 | $ua->delay($Opts{Sleep_Time} / 60); 475 | } 476 | $ua->timeout($Opts{Timeout}); 477 | 478 | # Set up cookie stash if requested 479 | if (defined($Opts{Cookies})) { 480 | require HTTP::Cookies; 481 | my $cookie_file = $Opts{Cookies}; 482 | if ($cookie_file eq 'tmp') { 483 | $cookie_file = undef; 484 | } 485 | elsif ($cookie_file =~ /^(.*)$/) { 486 | $cookie_file = $1; # untaint 487 | } 488 | $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1)); 489 | } 490 | eval { $ua->allow_private_ips($Opts{Allow_Private_IPs}); }; 491 | if ($@) { 492 | die <<"EOF"; 493 | Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and 494 | Net::hostent modules: 495 | $@ 496 | EOF 497 | } 498 | 499 | # Add configured allowed protocols 500 | if ($Cfg{Allowed_Protocols}) { 501 | my $allowed = $ua->protocols_allowed(); 502 | push(@$allowed, split(/[,\s]+/, lc($Cfg{Allowed_Protocols}))); 503 | $ua->protocols_allowed($allowed); 504 | } 505 | if ($Opts{Follow_File_Links}) { 506 | my $allowed = $ua->protocols_allowed(); 507 | push(@$allowed, "file"); 508 | $ua->protocols_allowed($allowed); 509 | } 510 | 511 | if ($Opts{Command_Line}) { 512 | 513 | require Text::Wrap; 514 | Text::Wrap->import('wrap'); 515 | 516 | require URI::file; 517 | 518 | &usage(1) unless scalar(@ARGV); 519 | 520 | $Opts{_Self_URI} = 'https://validator.w3.org/checklink'; # For HTML output 521 | 522 | &ask_password() if ($Opts{User} && !$Opts{Password}); 523 | 524 | if (!$Opts{Summary_Only}) { 525 | printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML}; 526 | } 527 | else { 528 | $Opts{Verbose} = 0; 529 | $Opts{Progress} = 0; 530 | } 531 | 532 | # Populate data for print_form() 533 | my %params = ( 534 | summary => $Opts{Summary_Only}, 535 | hide_redirects => !$Opts{Redirects}, 536 | hide_type => $Opts{Dir_Redirects} ? 'dir' : 'all', 537 | no_accept_language => !( 538 | defined($Opts{Accept_Language}) && $Opts{Accept_Language} eq 'auto' 539 | ), 540 | no_referer => $Opts{No_Referer}, 541 | recursive => ($Opts{Depth} != 0), 542 | depth => $Opts{Depth}, 543 | ); 544 | 545 | my $check_num = 1; 546 | my $has_error = 0; 547 | my @bases = @{$Opts{Base_Locations}}; 548 | for my $uri (@ARGV) { 549 | 550 | # Reset base locations so that previous URI's given on the command line 551 | # won't affect the recursion scope for this URI (see check_uri()) 552 | @{$Opts{Base_Locations}} = @bases; 553 | 554 | # Transform the parameter into a URI 555 | $uri = &urize($uri); 556 | $params{uri} = $uri; 557 | $has_error += &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1); 558 | $check_num++; 559 | } 560 | undef $check_num; 561 | 562 | if ($Opts{HTML}) { 563 | &html_footer(); 564 | } 565 | elsif ($doc_count > 0 && !$Opts{Summary_Only}) { 566 | printf("\n%s\n", &global_stats()); 567 | } 568 | exit ($has_error ? 64 : 0); 569 | } 570 | else { 571 | 572 | require CGI; 573 | require CGI::Carp; 574 | CGI::Carp->import(qw(fatalsToBrowser)); 575 | require CGI::Cookie; 576 | 577 | my $query = CGI->new(); 578 | 579 | for my $param ($query->param()) { 580 | my @values = map { Encode::decode_utf8($_) } $query->param($param); 581 | $query->param($param, @values); 582 | } 583 | 584 | # Set a few parameters in CGI mode 585 | $Opts{Verbose} = 0; 586 | $Opts{Progress} = 0; 587 | $Opts{HTML} = 1; 588 | $Opts{_Self_URI} = $query->url(-relative => 1); 589 | 590 | # Backwards compatibility 591 | my $uri = undef; 592 | if ($uri = $query->param('url')) { 593 | $query->param('uri', $uri) unless $query->param('uri'); 594 | $query->delete('url'); 595 | } 596 | $uri = $query->param('uri'); 597 | 598 | if (!$uri) { 599 | &html_header('', undef); # Set cookie only from results page. 600 | my %cookies = CGI::Cookie->fetch(); 601 | &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1); 602 | &html_footer(); 603 | exit; 604 | } 605 | 606 | # Backwards compatibility 607 | if ($query->param('hide_dir_redirects')) { 608 | $query->param('hide_redirects', 'on'); 609 | $query->param('hide_type', 'dir'); 610 | $query->delete('hide_dir_redirects'); 611 | } 612 | 613 | $Opts{Summary_Only} = 1 if $query->param('summary'); 614 | 615 | if ($query->param('hide_redirects')) { 616 | $Opts{Dir_Redirects} = 0; 617 | if (my $type = $query->param('hide_type')) { 618 | $Opts{Redirects} = 0 if ($type ne 'dir'); 619 | } 620 | else { 621 | $Opts{Redirects} = 0; 622 | } 623 | } 624 | 625 | $Opts{Accept_Language} = undef if $query->param('no_accept_language'); 626 | $Opts{No_Referer} = $query->param('no_referer'); 627 | 628 | $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0); 629 | if (my $depth = $query->param('depth')) { 630 | 631 | # @@@ Ignore invalid depth silently for now. 632 | $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/); 633 | } 634 | 635 | # Save, clear or leave cookie as is. 636 | my $cookie = undef; 637 | if (my $action = $query->param('cookie')) { 638 | if ($action eq 'clear') { 639 | 640 | # Clear the cookie. 641 | $cookie = CGI::Cookie->new(-name => $PROGRAM,-value => $PROGRAM); 642 | $cookie->value({clear => 1}); 643 | $cookie->expires('-1M'); 644 | } 645 | elsif ($action eq 'set') { 646 | 647 | # Set the options. 648 | $cookie = CGI::Cookie->new(-name => $PROGRAM,-value => $PROGRAM); 649 | my %options = $query->Vars(); 650 | delete($options{$_}) 651 | for qw(url uri check cookie); # Non-persistent. 652 | $cookie->value(\%options); 653 | } 654 | } 655 | if (!$cookie) { 656 | my %cookies = CGI::Cookie->fetch(); 657 | $cookie = $cookies{$PROGRAM}; 658 | } 659 | 660 | # Always refresh cookie expiration time. 661 | $cookie->expires('+1M') if ($cookie && !$cookie->expires()); 662 | 663 | # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. 664 | # If we're under mod_perl, there is a way around it... 665 | eval { 666 | local $SIG{__DIE__} = undef; 667 | my $auth = 668 | Apache2::RequestUtil->request()->headers_in()->{Authorization}; 669 | $ENV{HTTP_AUTHORIZATION} = $auth if $auth; 670 | } if (MP2() && !$ENV{HTTP_AUTHORIZATION}); 671 | 672 | $uri =~ s/^\s+//g; 673 | if ($uri =~ /:/) { 674 | $uri = URI->new($uri); 675 | } 676 | else { 677 | if ($uri =~ m|^//|) { 678 | $uri = URI->new("http:$uri"); 679 | } 680 | else { 681 | local $ENV{URL_GUESS_PATTERN} = ''; 682 | my $guess = URI::Heuristic::uf_uri($uri); 683 | if ($guess->scheme() && $ua->is_protocol_supported($guess)) { 684 | $uri = $guess; 685 | } 686 | else { 687 | $uri = URI->new("http://$uri"); 688 | } 689 | } 690 | } 691 | $uri = $uri->canonical(); 692 | $query->param("uri", $uri); 693 | 694 | &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie); 695 | undef $query; # Not needed any more. 696 | &html_footer(); 697 | } 698 | 699 | ############################################################################### 700 | 701 | ################################ 702 | # Command line and usage stuff # 703 | ################################ 704 | 705 | sub parse_arguments () 706 | { 707 | require Encode::Locale; 708 | Encode::Locale::decode_argv(); 709 | 710 | require Getopt::Long; 711 | Getopt::Long->require_version(2.17); 712 | Getopt::Long->import('GetOptions'); 713 | Getopt::Long::Configure('bundling', 'no_ignore_case'); 714 | my $masq = ''; 715 | my @locs = (); 716 | 717 | GetOptions( 718 | 'help|h|?' => sub { usage(0) }, 719 | 'q|quiet' => sub { 720 | $Opts{Quiet} = 1; 721 | $Opts{Summary_Only} = 1; 722 | }, 723 | 's|summary' => \$Opts{Summary_Only}, 724 | 'b|broken' => sub { 725 | $Opts{Redirects} = 0; 726 | $Opts{Dir_Redirects} = 0; 727 | }, 728 | 'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; }, 729 | 'v|verbose' => \$Opts{Verbose}, 730 | 'i|indicator' => \$Opts{Progress}, 731 | 'H|html' => \$Opts{HTML}, 732 | 'r|recursive' => sub { 733 | $Opts{Depth} = -1 734 | if $Opts{Depth} == 0; 735 | }, 736 | 'l|location=s' => \@locs, 737 | 'X|exclude=s@' => \@{$Opts{Exclude}}, 738 | 'exclude-docs=s@' => \@{$Opts{Exclude_Docs}}, 739 | 'follow-file-links' => \$Opts{Follow_File_Links}, 740 | 'https' => \$Opts{Https}, 741 | 'suppress-redirect=s@' => \@{$Opts{Suppress_Redirect}}, 742 | 'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}}, 743 | 'suppress-temp-redirects' => \$Opts{Suppress_Temp_Redirects}, 744 | 'suppress-broken=s@' => \@{$Opts{Suppress_Broken}}, 745 | 'suppress-fragment=s@' => \@{$Opts{Suppress_Fragment}}, 746 | 'u|user=s' => \$Opts{User}, 747 | 'p|password=s' => \$Opts{Password}, 748 | 't|timeout=i' => \$Opts{Timeout}, 749 | 'C|connection-cache=i' => \$Opts{Connection_Cache_Size}, 750 | 'S|sleep=i' => \$Opts{Sleep_Time}, 751 | 'L|languages=s' => \$Opts{Accept_Language}, 752 | 'c|cookies=s' => \$Opts{Cookies}, 753 | 'R|no-referer' => \$Opts{No_Referer}, 754 | 'D|depth=i' => sub { 755 | $Opts{Depth} = $_[1] 756 | unless $_[1] == 0; 757 | }, 758 | 'd|domain=s' => \$Opts{Trusted}, 759 | 'masquerade=s' => \$masq, 760 | 'hide-same-realm' => \$Opts{Hide_Same_Realm}, 761 | 'V|version' => \&version, 762 | ) || 763 | usage(1); 764 | 765 | if ($masq) { 766 | $Opts{Masquerade} = 1; 767 | my @masq = split(/\s+/, $masq); 768 | if (scalar(@masq) != 2 || 769 | !defined($masq[0]) || 770 | $masq[0] !~ /\S/ || 771 | !defined($masq[1]) || 772 | $masq[1] !~ /\S/) 773 | { 774 | usage(1, 775 | "Error: --masquerade takes two whitespace separated URIs."); 776 | } 777 | else { 778 | require URI::file; 779 | $Opts{Masquerade_From} = $masq[0]; 780 | my $u = URI->new($masq[1]); 781 | $Opts{Masquerade_To} = 782 | $u->scheme() ? $u : URI::file->new_abs($masq[1]); 783 | } 784 | } 785 | 786 | if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') { 787 | $Opts{Accept_Language} = &guess_language(); 788 | } 789 | 790 | if (($Opts{Sleep_Time} || 0) < 1) { 791 | warn( 792 | "*** Warning: minimum allowed sleep time is 1 second, resetting.\n" 793 | ); 794 | $Opts{Sleep_Time} = 1; 795 | } 796 | 797 | push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs); 798 | 799 | $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs); 800 | 801 | # Precompile/error-check regular expressions. 802 | for my $i (0 .. $#{$Opts{Exclude}}) { 803 | eval { $Opts{Exclude}->[$i] = qr/$Opts{Exclude}->[$i]/; }; 804 | &usage(1, "Error in exclude regexp: $@") if $@; 805 | } 806 | for my $i (0 .. $#{$Opts{Exclude_Docs}}) { 807 | eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; }; 808 | &usage(1, "Error in exclude-docs regexp: $@") if $@; 809 | } 810 | if (defined($Opts{Trusted})) { 811 | eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; }; 812 | &usage(1, "Error in trusted domains regexp: $@") if $@; 813 | } 814 | 815 | # Sanity-check error-suppression arguments 816 | for my $i (0 .. $#{$Opts{Suppress_Redirect}}) { 817 | ${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/; 818 | my $sr_arg = ${$Opts{Suppress_Redirect}}[$i]; 819 | if ($sr_arg !~ /.->./) { 820 | &usage(1, 821 | "Bad suppress-redirect argument, should contain \"->\": $sr_arg" 822 | ); 823 | } 824 | } 825 | for my $i (0 .. $#{$Opts{Suppress_Redirect_Prefix}}) { 826 | my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i]; 827 | $srp_arg =~ s/ /->/; 828 | if ($srp_arg !~ /^(.*)->(.*)$/) { 829 | &usage(1, 830 | "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg" 831 | ); 832 | } 833 | 834 | # Turn prefixes into a regexp. 835 | ${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism; 836 | } 837 | for my $i (0 .. $#{$Opts{Suppress_Broken}}) { 838 | ${$Opts{Suppress_Broken}}[$i] =~ s/ /:/; 839 | my $sb_arg = ${$Opts{Suppress_Broken}}[$i]; 840 | if ($sb_arg !~ /^(-1|[0-9]+)(:.)?/) { 841 | &usage(1, 842 | "Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg" 843 | ); 844 | } 845 | } 846 | for my $sf_arg (@{$Opts{Suppress_Fragment}}) { 847 | if ($sf_arg !~ /.#./) { 848 | &usage(1, 849 | "Bad suppress-fragment argument, should contain \"#\": $sf_arg" 850 | ); 851 | } 852 | } 853 | 854 | return; 855 | } 856 | 857 | sub version () 858 | { 859 | print "$PACKAGE $REVISION\n"; 860 | exit 0; 861 | } 862 | 863 | sub usage () 864 | { 865 | my ($exitval, $msg) = @_; 866 | $exitval = 0 unless defined($exitval); 867 | $msg ||= ''; 868 | $msg =~ s/[\r\n]*$/\n\n/ if $msg; 869 | 870 | die($msg) unless $Opts{Command_Line}; 871 | 872 | my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only'; 873 | 874 | select(STDERR) if $exitval; 875 | print "$msg$PACKAGE $REVISION 876 | 877 | Usage: checklink 878 | Options: 879 | -s, --summary Result summary only. 880 | -b, --broken Show only the broken links, not the redirects. 881 | -e, --dir-redirects Hide directory redirects, for example 882 | https://www.w3.org/TR -> https://www.w3.org/TR/ 883 | -r, --recursive Check the documents linked from the first one. 884 | -D, --depth N Check the documents linked from the first one to 885 | depth N (implies --recursive). 886 | -l, --location URI Scope of the documents checked in recursive mode 887 | (implies --recursive). Can be specified multiple 888 | times. If not specified, the default eg. for 889 | https://www.w3.org/TR/html4/Overview.html 890 | would be https://www.w3.org/TR/html4/ 891 | -X, --exclude REGEXP Do not check links whose full, canonical URIs 892 | match REGEXP; also limits recursion the same way 893 | as --exclude-docs with the same regexp would. 894 | This option may be specified multiple times. 895 | --exclude-docs REGEXP In recursive mode, do not check links in documents 896 | whose full, canonical URIs match REGEXP. This 897 | option may be specified multiple times. 898 | --suppress-redirect URI->URI Do not report a redirect from the first to the 899 | second URI. This option may be specified multiple 900 | times. 901 | --suppress-redirect-prefix URI->URI Do not report a redirect from a child of 902 | the first URI to the same child of the second URI. 903 | This option may be specified multiple times. 904 | --suppress-temp-redirects Suppress warnings about temporary redirects. 905 | --suppress-broken CODE(:URI) Do not report a broken link with the given CODE. 906 | CODE is HTTP response, or -1 for robots exclusion. 907 | This option may be specified multiple times, and 908 | can optionally be limited to a single pair of CODE / 909 | URI. 910 | --suppress-fragment URI Do not report the given broken fragment URI. 911 | A fragment URI contains \"#\". This option may be 912 | specified multiple times. 913 | --follow-file-links When encountering a file: URI, try to follow it 914 | (disabled by default for security reasons) 915 | -L, --languages LANGS Accept-Language header to send. The special value 916 | 'auto' causes autodetection from the environment. 917 | -c, --cookies FILE Use cookies, load/save them in FILE. The special 918 | value 'tmp' causes non-persistent use of cookies. 919 | -R, --no-referer Do not send the Referer HTTP header. 920 | -q, --quiet No output if no errors are found (implies -s). 921 | -v, --verbose Verbose mode. 922 | -i, --indicator Show percentage of lines processed while parsing. 923 | -u, --user USERNAME Specify a username for authentication. 924 | -p, --password PASSWORD Specify a password. 925 | --hide-same-realm Hide 401's that are in the same realm as the 926 | document checked. 927 | -S, --sleep SECS Sleep SECS seconds between requests to each server 928 | (default and minimum: 1 second). 929 | -t, --timeout SECS Timeout for requests in seconds (default: 30). 930 | -d, --domain DOMAIN Regular expression describing the domain to which 931 | authentication information will be sent 932 | (default: $trust). 933 | --masquerade \"BASE1 BASE2\" Masquerade base URI BASE1 as BASE2. See the 934 | manual page for more information. 935 | --https Check the file as if it was served on https:// 936 | -H, --html HTML output. 937 | -?, -h, --help Show this message and exit. 938 | -V, --version Output version information and exit. 939 | 940 | See \"perldoc LWP\" for information about proxy server support, 941 | \"perldoc Net::FTP\" for information about various environment variables 942 | affecting FTP connections and \"perldoc Net::NNTP\" for setting a default 943 | NNTP server for news: URIs. 944 | 945 | The W3C_CHECKLINK_CFG environment variable can be used to set the 946 | configuration file to use. See details in the full manual page, it can 947 | be displayed with: perldoc checklink 948 | 949 | More documentation at: $Cfg{Doc_URI} 950 | Please file bug reports and comments at https://github.com/w3c/link-checker/issues 951 | "; 952 | exit $exitval; 953 | } 954 | 955 | sub ask_password () 956 | { 957 | eval { 958 | local $SIG{__DIE__} = undef; 959 | require Term::ReadKey; 960 | Term::ReadKey->require_version(2.00); 961 | Term::ReadKey->import(qw(ReadMode)); 962 | }; 963 | if ($@) { 964 | warn('Warning: Term::ReadKey 2.00 or newer not available, ' . 965 | "password input disabled.\n"); 966 | return; 967 | } 968 | printf(STDERR 'Enter the password for user %s: ', $Opts{User}); 969 | ReadMode('noecho', *STDIN); 970 | chomp($Opts{Password} = ); 971 | ReadMode('restore', *STDIN); 972 | print(STDERR "ok.\n"); 973 | return; 974 | } 975 | 976 | ############################################################################### 977 | 978 | ########################################################################### 979 | # Guess an Accept-Language header based on the $LANG environment variable # 980 | ########################################################################### 981 | 982 | sub guess_language () 983 | { 984 | my $lang = $ENV{LANG} or return; 985 | 986 | $lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro... 987 | 988 | return 'en' if ($lang eq 'C' || $lang eq 'POSIX'); 989 | 990 | my $res = undef; 991 | eval { 992 | require Locale::Language; 993 | if (my $tmp = Locale::Language::language2code($lang)) { 994 | $lang = $tmp; 995 | } 996 | if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) { 997 | if (Locale::Language::code2language($l)) { 998 | $res = $l; 999 | if ($c) { 1000 | require Locale::Country; 1001 | $res .= "-$c" if Locale::Country::code2country($c); 1002 | } 1003 | } 1004 | } 1005 | }; 1006 | return $res; 1007 | } 1008 | 1009 | ############################ 1010 | # Transform foo into a URI # 1011 | ############################ 1012 | 1013 | sub urize ($) 1014 | { 1015 | my $arg = shift; 1016 | my $uarg = URI::Escape::uri_unescape($arg); 1017 | my $uri; 1018 | if (-d $uarg) { 1019 | 1020 | # look for an "index" file in dir, return it if found 1021 | require File::Spec; 1022 | for my $index (map { File::Spec->catfile($uarg, $_) } 1023 | qw(index.html index.xhtml index.htm index.xhtm)) 1024 | { 1025 | if (-e $index) { 1026 | $uri = URI::file->new_abs($index); 1027 | last; 1028 | } 1029 | } 1030 | 1031 | # return dir itself if an index file was not found 1032 | $uri ||= URI::file->new_abs($uarg); 1033 | } 1034 | elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) { 1035 | $uri = URI::file->new_abs($uarg); 1036 | } 1037 | else { 1038 | my $newuri = URI->new($arg); 1039 | if ($newuri->scheme()) { 1040 | $uri = $newuri; 1041 | } 1042 | else { 1043 | local $ENV{URL_GUESS_PATTERN} = ''; 1044 | $uri = URI::Heuristic::uf_uri($arg); 1045 | $uri = URI::file->new_abs($uri) unless $uri->scheme(); 1046 | } 1047 | } 1048 | return $uri->canonical(); 1049 | } 1050 | 1051 | sub should_exclude ($) 1052 | { 1053 | my ($canon_uri) = @_; 1054 | for my $ex (@{$Opts{Exclude}}) { 1055 | return 1 if ($canon_uri =~ $ex); 1056 | } 1057 | return 0; 1058 | } 1059 | 1060 | ######################################## 1061 | # Check for broken links in a resource # 1062 | ######################################## 1063 | 1064 | sub check_uri (\%\$$$$;\$$) 1065 | { 1066 | my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_; 1067 | $is_start ||= ($check_num == 1); 1068 | 1069 | my $start = $Opts{Summary_Only} ? 0 : &get_timestamp(); 1070 | my $error_found = 0; 1071 | 1072 | # Get and parse the document 1073 | my $response = &get_document( 1074 | 'GET', $uri, $doc_count, \%redirects, $referer, 1075 | $cookie, $params, $check_num, $is_start 1076 | ); 1077 | 1078 | # Can we check the resource? If not, we exit here... 1079 | return 0 if defined($response->{Stop}); 1080 | 1081 | if ($Opts{HTML}) { 1082 | &html_header($uri, $cookie) if ($check_num == 1); 1083 | &print_form($params, $cookie, $check_num) if $is_start; 1084 | } 1085 | 1086 | if ($is_start) { # Starting point of a new check, eg. from the command line 1087 | # Use the first URI as the recursion base unless specified otherwise. 1088 | push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical()) 1089 | unless @{$Opts{Base_Locations}}; 1090 | } 1091 | else { 1092 | 1093 | # Before fetching the document, we don't know if we'll be within the 1094 | # recursion scope or not (think redirects). 1095 | if (!&in_recursion_scope($response->{absolute_uri})) { 1096 | hprintf("Not in recursion scope: %s\n") 1097 | if ($Opts{Verbose}); 1098 | $response->content(""); 1099 | return $error_found; 1100 | } 1101 | } 1102 | 1103 | # Define the document header, and perhaps print it. 1104 | # (It might still be defined if the previous document had no errors; 1105 | # just redefine it in that case.) 1106 | 1107 | if ($check_num != 1) { 1108 | if ($Opts{HTML}) { 1109 | $doc_header = "\n\n"; 1110 | } 1111 | else { 1112 | $doc_header = "\n" . ('-' x 40) . "\n"; 1113 | } 1114 | } 1115 | 1116 | if ($Opts{HTML}) { 1117 | $doc_header .= 1118 | ("
{absolute_uri}) . "\">\n\t

\nProcessing\t" . &show_url($response->{absolute_uri}) . 1119 | "\n

\n\n"); 1120 | } 1121 | else { 1122 | $doc_header .= "\nProcessing\t$response->{absolute_uri}\n\n"; 1123 | } 1124 | 1125 | if (!$Opts{Quiet}) { 1126 | print_doc_header(); 1127 | } 1128 | 1129 | # We are checking a new document 1130 | $doc_count++; 1131 | 1132 | my $result_anchor = 'results' . $doc_count; 1133 | 1134 | if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) { 1135 | my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; 1136 | my $acclang = $Opts{Accept_Language} || '(not sent)'; 1137 | my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; 1138 | my $cookies = 'not used'; 1139 | if (defined($Opts{Cookies})) { 1140 | $cookies = 'used, '; 1141 | if ($Opts{Cookies} eq 'tmp') { 1142 | $cookies .= 'non-persistent'; 1143 | } 1144 | else { 1145 | $cookies .= "file $Opts{Cookies}"; 1146 | } 1147 | } 1148 | printf( 1149 | <<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s); 1150 | 1151 | Settings used: 1152 | - Accept: %s 1153 | - Accept-Language: %s 1154 | - Referer: %s 1155 | - Cookies: %s 1156 | - Upgrade-Insecure-Requets: 1 1157 | - Sleeping %d second%s between requests to each server 1158 | EOF 1159 | printf("- Excluding links matching %s\n", 1160 | join(', ', @{$Opts{Exclude}})) 1161 | if @{$Opts{Exclude}}; 1162 | printf("- Excluding links in documents whose URIs match %s\n", 1163 | join(', ', @{$Opts{Exclude_Docs}})) 1164 | if @{$Opts{Exclude_Docs}}; 1165 | } 1166 | 1167 | if ($Opts{HTML}) { 1168 | if (!$Opts{Summary_Only}) { 1169 | my $accept = &encode($Accept); 1170 | my $acclang = &encode($Opts{Accept_Language} || '(not sent)'); 1171 | my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; 1172 | my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; 1173 | printf( 1174 | <<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s); 1175 |
1176 | Settings used: 1177 | 1183 |
1184 | EOF 1185 | printf("

Go to the results.

\n", 1186 | $result_anchor); 1187 | my $esc_uri = URI::Escape::uri_escape($response->{absolute_uri}, 1188 | "^A-Za-z0-9."); 1189 | print "

For reliable link checking results, check "; 1190 | 1191 | if (!$response->{IsCss}) { 1192 | printf("HTML validity and ", 1193 | &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri))); 1194 | } 1195 | printf( 1196 | "CSS validity first.

1197 |

Back to the link checker.

\n", 1198 | &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)), 1199 | &encode($Opts{_Self_URI}) 1200 | ); 1201 | 1202 | printf(<<'EOF', $result_anchor); 1203 |
1204 |

Status:

1205 | 1209 |
1210 | EOF
1211 |         }
1212 |     }
1213 | 
1214 |     if ($Opts{Summary_Only} && !$Opts{Quiet}) {
1215 |         print '

' if $Opts{HTML}; 1216 | print 'This may take some time'; 1217 | print "... (why?)

" 1218 | if $Opts{HTML}; 1219 | print " if the document has many links to check.\n" unless $Opts{HTML}; 1220 | } 1221 | 1222 | # Record that we have processed this resource 1223 | $processed{$response->{absolute_uri}} = 1; 1224 | 1225 | # Parse the document 1226 | my $p = 1227 | &parse_document($uri, $response->base(), $response, 1, ($depth != 0)); 1228 | my $base = URI->new($p->{base}); 1229 | if (!defined($base->scheme)) { 1230 | $base = URI->new_abs($p->{base}, $uri); 1231 | } 1232 | my $check_mixed_content = ($base->scheme eq 'https' || $Opts{Https}); 1233 | 1234 | # Check anchors 1235 | ############### 1236 | 1237 | print "Checking anchors...\n" unless $Opts{Summary_Only}; 1238 | 1239 | my %errors; 1240 | while (my ($anchor, $lines) = each(%{$p->{Anchors}})) { 1241 | if (!length($anchor)) { 1242 | 1243 | # Empty IDREF's are not allowed 1244 | $errors{$anchor} = 1; 1245 | } 1246 | else { 1247 | my $times = 0; 1248 | $times += $_ for values(%$lines); 1249 | 1250 | # They should appear only once 1251 | $errors{$anchor} = 1 if ($times > 1); 1252 | } 1253 | } 1254 | print " done.\n" unless $Opts{Summary_Only}; 1255 | 1256 | # Check links 1257 | ############# 1258 | 1259 | &hprintf("Recording all the links found: %d\n", 1260 | scalar(keys %{$p->{Links}})) 1261 | if ($Opts{Verbose}); 1262 | my %links; 1263 | my %hostlinks; 1264 | 1265 | # Record all the links found 1266 | while (my ($link, $lines) = each(%{$p->{Links}})) { 1267 | #my @hash = values(%$lines); 1268 | #print join(',', map { keys(%$_) } @hash); 1269 | #print $link, $mixedcontent; 1270 | my $link_uri = URI->new($link); 1271 | my $abs_link_uri = URI->new_abs($link_uri, $base); 1272 | 1273 | if ($Opts{Masquerade}) { 1274 | if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) { 1275 | print_doc_header(); 1276 | printf("processing %s in base %s\n", 1277 | $abs_link_uri, $Opts{Masquerade_To}); 1278 | my $nlink = $abs_link_uri; 1279 | $nlink =~ s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|; 1280 | $abs_link_uri = URI->new($nlink); 1281 | } 1282 | } 1283 | 1284 | my $canon_uri = URI->new($abs_link_uri->canonical()); 1285 | my $fragment = $canon_uri->fragment(undef); 1286 | if (!should_exclude($canon_uri)) { 1287 | if (!exists($links{$canon_uri})) { 1288 | my $hostport; 1289 | $hostport = $canon_uri->host_port() 1290 | if $canon_uri->can('host_port'); 1291 | $hostport = '' unless defined $hostport; 1292 | push(@{$hostlinks{$hostport}}, $canon_uri); 1293 | } 1294 | for my $line_num (keys(%{$lines})) { 1295 | my @mixedcontent = keys(%{%$lines{$line_num}}); 1296 | my $mc_block = MC_BLOCK; 1297 | if (grep(/^$mc_block$/, @mixedcontent)) { 1298 | 1299 | push(@{$links{$canon_uri}{mixedcontent}}, $line_num); 1300 | } 1301 | # take optional into account if CSP: block-all-mixed-content is set 1302 | my $mc_optional = MC_OPTIONAL; 1303 | if ($p->{csp} && grep(qr/^block-all-mixed-content$/, $p->{csp}) 1304 | && grep(/^$mc_optional$/, @mixedcontent)) { 1305 | push(@{$links{$canon_uri}{mixedcontent}}, $line_num); 1306 | } 1307 | if (!defined($fragment) || !length($fragment)) { 1308 | 1309 | # Document without fragment 1310 | $links{$canon_uri}{location}{$line_num} = @mixedcontent; 1311 | } 1312 | else { 1313 | 1314 | # Resource with a fragment 1315 | if (! grep { $_ eq "$canon_uri#$fragment" } @{$Opts{Suppress_Fragment}}) { 1316 | $links{$canon_uri}{fragments}{$fragment}{$line_num} = @mixedcontent; 1317 | } 1318 | } 1319 | } 1320 | } 1321 | } 1322 | 1323 | my @order = &distribute_links(\%hostlinks); 1324 | undef %hostlinks; 1325 | 1326 | # Build the list of broken URI's 1327 | 1328 | my $nlinks = scalar(@order); 1329 | 1330 | &hprintf("Checking %d links to build list of broken URI's\n", $nlinks) 1331 | if ($Opts{Verbose}); 1332 | 1333 | my %broken; 1334 | my %blocked; 1335 | my $link_num = 0; 1336 | for my $u (@order) { 1337 | my $ulinks = $links{$u}; 1338 | 1339 | if ($Opts{Summary_Only}) { 1340 | 1341 | # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896 1342 | print ' ' if ($Opts{HTML} && !$Opts{Command_Line}); 1343 | } 1344 | else { 1345 | &hprintf("\nChecking link %s\n", $u); 1346 | my $progress = ($link_num / $nlinks) * 100; 1347 | printf( 1348 | '', 1349 | $result_anchor, &encode($u), $progress) 1350 | if (!$Opts{Command_Line} && 1351 | $Opts{HTML} && 1352 | !$Opts{Summary_Only}); 1353 | } 1354 | $link_num++; 1355 | 1356 | # Check that a link is valid 1357 | my $mixedcontent_lines = $check_mixed_content && %$ulinks{"mixedcontent"} ? %$ulinks{"mixedcontent"} : (); 1358 | &check_validity($uri, $u, ($depth != 0 && &in_recursion_scope($u)), 1359 | \%links, \%redirects, $mixedcontent_lines, $p->{csp}); 1360 | &hprintf("\tReturn code: %s\n", $results{$u}{location}{code}) 1361 | if ($Opts{Verbose}); 1362 | if ($results{$u}{location}{success}) { 1363 | 1364 | # Even though it was not broken, we might want to display it 1365 | # on the results page (e.g. because it required authentication) 1366 | $broken{$u}{location} = 1 1367 | if ($results{$u}{location}{display} >= 400); 1368 | 1369 | # List the broken fragments 1370 | while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) { 1371 | my $fragment_broken = $results{$u}{fragments}{$fragment} == 0 1372 | # 100 matches what we get for ignored error codes 1373 | || $results{$u}{location}{code} == 100; 1374 | if ($Opts{Verbose}) { 1375 | my @line_nums = sort { $a <=> $b } keys(%$lines); 1376 | &hprintf( 1377 | "\t\t%s %s - Line%s: %s\n", 1378 | $fragment, 1379 | $fragment_broken ? 'Not found' : ( $results{$u}{fragments}{$fragment} > 0 ? 'OK' : 'Not checked'), 1380 | (scalar(@line_nums) > 1) ? 's' : '', 1381 | join(', ', @line_nums) 1382 | ); 1383 | } 1384 | 1385 | # A broken fragment? 1386 | $broken{$u}{fragments}{$fragment} += 2 if $fragment_broken; 1387 | } 1388 | } 1389 | elsif (!($Opts{Quiet} && defined($results{$u}{location}{code}) && &informational($results{$u}{location}{code}))) 1390 | { 1391 | 1392 | # Couldn't find the document 1393 | $broken{$u}{location} = 1; 1394 | 1395 | # All the fragments associated are hence broken 1396 | for my $fragment (keys %{$ulinks->{fragments}}) { 1397 | $broken{$u}{fragments}{$fragment}++; 1398 | } 1399 | } 1400 | if ($results{$u}{mixedcontent_blocked}) { 1401 | $blocked{$u} = $results{$u}{mixedcontent_blocked}; 1402 | } 1403 | } 1404 | &hprintf( 1405 | "\nProcessed in %s seconds.\n", 1406 | &time_diff($start, &get_timestamp()) 1407 | ) unless $Opts{Summary_Only}; 1408 | printf( 1409 | '', 1410 | $result_anchor, &time_diff($start, &get_timestamp())) 1411 | if ($Opts{HTML} && !$Opts{Summary_Only}); 1412 | 1413 | # Display results 1414 | if ($Opts{HTML} && !$Opts{Summary_Only}) { 1415 | print("
\n
\n"); 1416 | printf("

Results

\n", $result_anchor); 1417 | } 1418 | print "\n" unless $Opts{Quiet}; 1419 | 1420 | &links_summary(\%links, \%results, \%broken, \%redirects, \%blocked); 1421 | &anchors_summary($p->{Anchors}, \%errors); 1422 | 1423 | $error_found = scalar(keys(%errors)) + scalar(keys(%broken)) + scalar(keys(%blocked)); 1424 | 1425 | # Do we want to process other documents? 1426 | if ($depth != 0) { 1427 | 1428 | for my $u (map { URI->new($_) } keys %links) { 1429 | 1430 | next unless $results{$u}{location}{success}; # Broken link? 1431 | 1432 | next unless &in_recursion_scope($u); 1433 | 1434 | # Do we understand its content type? 1435 | next unless (defined $results{$u}{location}{type} && $results{$u}{location}{type} =~ $ContentTypes); 1436 | 1437 | # Have we already processed this URI? 1438 | next if &already_processed($u, $uri); 1439 | 1440 | # Do the job 1441 | print "\n" unless $Opts{Quiet}; 1442 | if ($Opts{HTML}) { 1443 | if (!$Opts{Command_Line}) { 1444 | if ($doc_count == $Opts{Max_Documents}) { 1445 | print( 1446 | "
\n

Maximum number of documents ($Opts{Max_Documents}) reached!

\n" 1447 | ); 1448 | } 1449 | if ($doc_count >= $Opts{Max_Documents}) { 1450 | $doc_count++; 1451 | print("

Not checking $u

\n"); 1452 | $processed{$u} = 1; 1453 | next; 1454 | } 1455 | } 1456 | } 1457 | 1458 | # This is an inherently recursive algorithm, so Perl's warning is not 1459 | # helpful. You may wish to comment this out when debugging, though. 1460 | no warnings 'recursion'; 1461 | 1462 | if ($depth < 0) { 1463 | $error_found += &check_uri($params, $u, 0, -1, $cookie, $uri); 1464 | } 1465 | else { 1466 | $error_found += &check_uri($params, $u, 0, $depth - 1, $cookie, $uri); 1467 | } 1468 | } 1469 | } 1470 | return $error_found; 1471 | } 1472 | 1473 | ############################################################### 1474 | # Distribute links based on host:port to avoid RobotUA delays # 1475 | ############################################################### 1476 | 1477 | sub distribute_links(\%) 1478 | { 1479 | my $hostlinks = shift; 1480 | 1481 | # Hosts ordered by weight (number of links), descending 1482 | my @order = 1483 | sort { scalar(@{$hostlinks->{$b}}) <=> scalar(@{$hostlinks->{$a}}) } 1484 | keys %$hostlinks; 1485 | 1486 | # All link list flattened into one, in host weight order 1487 | my @all; 1488 | push(@all, @{$hostlinks->{$_}}) for @order; 1489 | 1490 | return @all if (scalar(@order) < 2); 1491 | 1492 | # Indexes and chunk size for "zipping" the end result list 1493 | my $num = scalar(@{$hostlinks->{$order[0]}}); 1494 | my @indexes = map { $_ * $num } (0 .. $num - 1); 1495 | 1496 | # Distribute them 1497 | my @result; 1498 | while (my @chunk = splice(@all, 0, $num)) { 1499 | @result[@indexes] = @chunk; 1500 | @indexes = map { $_ + 1 } @indexes; 1501 | } 1502 | 1503 | # Weed out undefs 1504 | @result = grep(defined, @result); 1505 | 1506 | return @result; 1507 | } 1508 | 1509 | ########################################## 1510 | # Decode Content-Encodings in a response # 1511 | ########################################## 1512 | 1513 | sub decode_content ($) 1514 | { 1515 | my $response = shift; 1516 | my $error = undef; 1517 | 1518 | my $docref = $response->decoded_content(ref => 1); 1519 | if (defined($docref)) { 1520 | utf8::encode($$docref); 1521 | $response->content_ref($docref); 1522 | 1523 | # Remove Content-Encoding so it won't be decoded again later. 1524 | $response->remove_header('Content-Encoding'); 1525 | } 1526 | else { 1527 | my $ce = $response->header('Content-Encoding'); 1528 | $ce = defined($ce) ? "'$ce'" : 'undefined'; 1529 | my $ct = $response->header('Content-Type'); 1530 | $ct = defined($ct) ? "'$ct'" : 'undefined'; 1531 | my $request_uri = $response->request->url; 1532 | 1533 | my $cs = $response->content_charset(); 1534 | $cs = defined($cs) ? "'$cs'" : 'unknown'; 1535 | $error = 1536 | "Error decoding document at <$request_uri>, Content-Type $ct, " . 1537 | "Content-Encoding $ce, content charset $cs: '$@'"; 1538 | } 1539 | return $error; 1540 | } 1541 | 1542 | ####################################### 1543 | # Get and parse a resource to process # 1544 | ####################################### 1545 | 1546 | sub get_document ($\$$;\%\$$$$$) 1547 | { 1548 | my ($method, $uri, $in_recursion, $redirects, $referer, 1549 | $cookie, $params, $check_num, $is_start 1550 | ) = @_; 1551 | 1552 | # $method contains the HTTP method the use (GET or HEAD) 1553 | # $uri object contains the identifier of the resource 1554 | # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least 1555 | # the second resource checked) 1556 | # $redirects is a pointer to the hash containing the map of the redirects 1557 | # $referer is the URI object of the referring document 1558 | # $cookie, $params, $check_num, and $is_start are for printing HTTP headers 1559 | # and the form if $in_recursion == 0 and not authenticating 1560 | 1561 | # Get the resource 1562 | my $response; 1563 | if (defined($results{$uri}{response}) && 1564 | !($method eq 'GET' && $results{$uri}{method} eq 'HEAD')) 1565 | { 1566 | $response = $results{$uri}{response}; 1567 | } 1568 | else { 1569 | $response = &get_uri($method, $uri, $referer); 1570 | &record_results($uri, $method, $response, $referer); 1571 | &record_redirects($redirects, $response); 1572 | } 1573 | if (!$response->is_success()) { 1574 | if (!$in_recursion) { 1575 | 1576 | # Is it too late to request authentication? 1577 | if ($response->code() == 401) { 1578 | &authentication($response, $cookie, $params, $check_num, 1579 | $is_start); 1580 | } 1581 | else { 1582 | if ($Opts{HTML}) { 1583 | &html_header($uri, $cookie) if ($check_num == 1); 1584 | &print_form($params, $cookie, $check_num) if $is_start; 1585 | print "

", &status_icon($response->code()); 1586 | } 1587 | &hprintf("\nError: %d %s\n", 1588 | $response->code(), $response->message() || '(no message)'); 1589 | print "

\n" if $Opts{HTML}; 1590 | } 1591 | } 1592 | $response->{Stop} = 1; 1593 | $response->content(""); 1594 | return ($response); 1595 | } 1596 | 1597 | # What is the URI of the resource that we are processing by the way? 1598 | my $base_uri = $response->base(); 1599 | my $request_uri = URI->new($response->request->url); 1600 | $response->{absolute_uri} = $request_uri->abs($base_uri); 1601 | 1602 | # Can we parse the document? 1603 | my $failed_reason; 1604 | my $ct = $response->header('Content-Type'); 1605 | if (!$ct || $ct !~ $ContentTypes) { 1606 | $failed_reason = "Content-Type for <$request_uri> is " . 1607 | (defined($ct) ? "'$ct'" : 'undefined'); 1608 | } 1609 | else { 1610 | $failed_reason = decode_content($response); 1611 | } 1612 | if ($failed_reason) { 1613 | 1614 | # No, there is a problem... 1615 | if (!$in_recursion) { 1616 | if ($Opts{HTML}) { 1617 | &html_header($uri, $cookie) if ($check_num == 1); 1618 | &print_form($params, $cookie, $check_num) if $is_start; 1619 | print "

", &status_icon(406); 1620 | 1621 | } 1622 | &hprintf("Can't check links: %s.\n", $failed_reason); 1623 | print "

\n" if $Opts{HTML}; 1624 | } 1625 | $response->{Stop} = 1; 1626 | $response->content(""); 1627 | } 1628 | 1629 | # Ok, return the information 1630 | return ($response); 1631 | } 1632 | 1633 | ######################################################### 1634 | # Check whether a URI is within the scope of recursion. # 1635 | ######################################################### 1636 | 1637 | sub in_recursion_scope (\$) 1638 | { 1639 | my ($uri) = @_; 1640 | return 0 unless $uri; 1641 | 1642 | my $candidate = $uri->canonical(); 1643 | 1644 | return 0 if should_exclude($candidate); 1645 | 1646 | for my $excluded_doc (@{$Opts{Exclude_Docs}}) { 1647 | return 0 if ($candidate =~ $excluded_doc); 1648 | } 1649 | 1650 | for my $base (@{$Opts{Base_Locations}}) { 1651 | my $rel = $candidate->rel($base); 1652 | next if ($candidate eq $rel); # Relative path not possible? 1653 | next if ($rel =~ m|^(\.\.)?/|); # Relative path upwards? 1654 | return 1; 1655 | } 1656 | 1657 | return 0; # We always have at least one base location, but none matched. 1658 | } 1659 | 1660 | ################################# 1661 | # Check for content type match. # 1662 | ################################# 1663 | 1664 | sub is_content_type ($$) 1665 | { 1666 | my ($candidate, $type) = @_; 1667 | return 0 unless ($candidate && $type); 1668 | my @v = HTTP::Headers::Util::split_header_words($candidate); 1669 | return scalar(@v) ? $type eq lc($v[0]->[0]) : 0; 1670 | } 1671 | 1672 | ################################################## 1673 | # Check whether a URI has already been processed # 1674 | ################################################## 1675 | 1676 | sub already_processed (\$\$) 1677 | { 1678 | my ($uri, $referer) = @_; 1679 | 1680 | # Don't be verbose for that part... 1681 | my $summary_value = $Opts{Summary_Only}; 1682 | $Opts{Summary_Only} = 1; 1683 | 1684 | # Do a GET: if it fails, we stop, if not, the results are cached 1685 | my $response = &get_document('GET', $uri, 1, undef, $referer); 1686 | 1687 | # ... but just for that part 1688 | $Opts{Summary_Only} = $summary_value; 1689 | 1690 | # Can we process the resource? 1691 | return -1 if defined($response->{Stop}); 1692 | 1693 | # Have we already processed it? 1694 | return 1 if defined($processed{$response->{absolute_uri}->as_string()}); 1695 | 1696 | # It's not processed yet and it is processable: return 0 1697 | return 0; 1698 | } 1699 | 1700 | ############################ 1701 | # Get the content of a URI # 1702 | ############################ 1703 | 1704 | sub get_uri ($\$;\$$\%$$$$) 1705 | { 1706 | 1707 | # Here we have a lot of extra parameters in order not to lose information 1708 | # if the function is called several times (401's) 1709 | my ($method, $uri, $referer, $start, $redirects, 1710 | $code, $realm, $message, $auth 1711 | ) = @_; 1712 | 1713 | # $method contains the method used 1714 | # $uri object contains the target of the request 1715 | # $referer is the URI object of the referring document 1716 | # $start is a timestamp (not defined the first time the function is called) 1717 | # $redirects is a map of redirects 1718 | # $code is the first HTTP return code 1719 | # $realm is the realm of the request 1720 | # $message is the HTTP message received 1721 | # $auth equals 1 if we want to send out authentication information 1722 | 1723 | # For timing purposes 1724 | $start = &get_timestamp() unless defined($start); 1725 | 1726 | # Prepare the query 1727 | 1728 | # Do we want printouts of progress? 1729 | my $verbose_progress = 1730 | !($Opts{Summary_Only} || (!$doc_count && $Opts{HTML})); 1731 | 1732 | &hprintf("%s %s ", $method, $uri) if $verbose_progress; 1733 | 1734 | my $request = HTTP::Request->new($method, $uri); 1735 | 1736 | $request->header('Accept-Language' => $Opts{Accept_Language}) 1737 | if $Opts{Accept_Language}; 1738 | $request->header('Accept', $Accept); 1739 | $request->header('Upgrade-Insecure-Requests', 1); 1740 | $request->accept_decodable(); 1741 | 1742 | # Are we providing authentication info? 1743 | if ($auth && $request->url()->host() =~ $Opts{Trusted}) { 1744 | if (defined($ENV{HTTP_AUTHORIZATION})) { 1745 | $request->header(Authorization => $ENV{HTTP_AUTHORIZATION}); 1746 | } 1747 | elsif (defined($Opts{User}) && defined($Opts{Password})) { 1748 | $request->authorization_basic($Opts{User}, $Opts{Password}); 1749 | } 1750 | } 1751 | 1752 | # Tell the user agent if we want progress reports for redirects or not. 1753 | $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); }) 1754 | if $verbose_progress; 1755 | 1756 | # Set referer 1757 | $request->referer($referer) if (!$Opts{No_Referer} && $referer); 1758 | 1759 | # Telling caches in the middle we want a fresh copy (Bug 4998) 1760 | $request->header(Cache_Control => "max-age=0"); 1761 | 1762 | # Do the query 1763 | my $response = $ua->request($request); 1764 | 1765 | # Get the results 1766 | # Record the very first response 1767 | if (!defined($code)) { 1768 | ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)}); 1769 | } 1770 | 1771 | # Authentication requested? 1772 | if ($response->code() == 401 && 1773 | !defined($auth) && 1774 | (defined($ENV{HTTP_AUTHORIZATION}) || 1775 | (defined($Opts{User}) && defined($Opts{Password}))) 1776 | ) 1777 | { 1778 | 1779 | # Set host as trusted domain unless we already have one. 1780 | if (!$Opts{Trusted}) { 1781 | my $re = sprintf('^%s$', quotemeta($response->base()->host())); 1782 | $Opts{Trusted} = qr/$re/io; 1783 | } 1784 | 1785 | # Deal with authentication and avoid loops 1786 | if (!defined($realm) && 1787 | $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) 1788 | { 1789 | $realm = $1; 1790 | } 1791 | 1792 | print "\n" if $verbose_progress; 1793 | return &get_uri($method, $response->request()->url(), 1794 | $referer, $start, $redirects, $code, $realm, $message, 1); 1795 | } 1796 | 1797 | # @@@ subtract robot delay from the "fetched in" time? 1798 | &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp())) 1799 | if $verbose_progress; 1800 | 1801 | $response->{IsCss} = 1802 | is_content_type($response->content_type(), "text/css"); 1803 | $response->{Realm} = $realm if defined($realm); 1804 | 1805 | return $response; 1806 | } 1807 | 1808 | ######################################### 1809 | # Record the results of an HTTP request # 1810 | ######################################### 1811 | 1812 | sub record_results (\$$$$\@) 1813 | { 1814 | my ($uri, $method, $response, $referer, $mixedcontent_lines, @csp) = @_; 1815 | $results{$uri}{referer} = $referer; 1816 | $results{$uri}{response} = $response; 1817 | $results{$uri}{method} = $method; 1818 | $results{$uri}{location}{code} = $response->code(); 1819 | $results{$uri}{location}{code} = RC_ROBOTS_TXT() 1820 | if ($results{$uri}{location}{code} == 403 && 1821 | $response->message() =~ /Forbidden by robots\.txt/); 1822 | $results{$uri}{location}{code} = RC_IP_DISALLOWED() 1823 | if ($results{$uri}{location}{code} == 403 && 1824 | $response->message() =~ /non-public IP/); 1825 | $results{$uri}{location}{code} = RC_DNS_ERROR() 1826 | if ($results{$uri}{location}{code} == 500 && 1827 | $response->message() =~ /Bad hostname '[^\']*'/); 1828 | $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED() 1829 | if ($results{$uri}{location}{code} == 500 && 1830 | $response->message() =~ /Access to '[^\']*' URIs has been disabled/); 1831 | 1832 | $results{$uri}{location}{type} = $response->header('Content-type'); 1833 | $results{$uri}{location}{display} = $results{$uri}{location}{code}; 1834 | 1835 | # Rewind, check for the original code and message. 1836 | for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) { 1837 | $results{$uri}{location}{orig} = $tmp->code(); 1838 | $results{$uri}{location}{orig_message} = $tmp->message() || 1839 | '(no message)'; 1840 | } 1841 | # dealing with mixed content 1842 | if ($mixedcontent_lines && $uri->scheme ne 'https') { 1843 | # Check headers of CSP: upgrade-insecure-request 1844 | if (! ($uri->scheme eq 'http' && $#csp != 0 && grep(/^upgrade-insecure-requests$/, @csp))) { 1845 | # Set this the lines where the URL is used in mixedcontent_block context 1846 | $results{$uri}{mixedcontent_blocked} = $mixedcontent_lines; 1847 | } 1848 | } 1849 | $results{$uri}{location}{success} = $response->is_success(); 1850 | 1851 | # If a suppressed broken link, fill the data structure like a typical success. 1852 | # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n"; 1853 | if (!$results{$uri}{location}{success}) { 1854 | my $code = $results{$uri}{location}{code}; 1855 | my $matchCodeUri = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}}; 1856 | my $matchCode = grep { $_ eq "$code" } @{$Opts{Suppress_Broken}}; 1857 | if ($matchCodeUri || $matchCode) { 1858 | $results{$uri}{location}{success} = 1; 1859 | $results{$uri}{location}{code} = 100; 1860 | $results{$uri}{location}{display} = 100; 1861 | } 1862 | } 1863 | 1864 | # Stores the authentication information 1865 | if (defined($response->{Realm})) { 1866 | $results{$uri}{location}{realm} = $response->{Realm}; 1867 | $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm}; 1868 | } 1869 | 1870 | # What type of broken link is it? (stored in {record} - the {display} 1871 | # information is just for visual use only) 1872 | if ($results{$uri}{location}{display} == 401 && 1873 | $results{$uri}{location}{code} == 404) 1874 | { 1875 | $results{$uri}{location}{record} = 404; 1876 | } 1877 | else { 1878 | $results{$uri}{location}{record} = $results{$uri}{location}{display}; 1879 | } 1880 | 1881 | # Did it fail? 1882 | $results{$uri}{location}{message} = $response->message() || '(no message)'; 1883 | if (!$results{$uri}{location}{success}) { 1884 | &hprintf( 1885 | "Error: %d %s\n", 1886 | $results{$uri}{location}{code}, 1887 | $results{$uri}{location}{message} 1888 | ) if ($Opts{Verbose}); 1889 | } 1890 | return; 1891 | } 1892 | 1893 | #################### 1894 | # Parse a document # 1895 | #################### 1896 | 1897 | sub parse_document (\$\$$$$) 1898 | { 1899 | my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_; 1900 | 1901 | print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n") 1902 | if $Opts{Verbose}; 1903 | 1904 | my $p; 1905 | 1906 | if (defined($results{$uri}{parsing})) { 1907 | 1908 | # We have already done the job. Woohoo! 1909 | $p->{base} = $results{$uri}{parsing}{base}; 1910 | $p->{Anchors} = $results{$uri}{parsing}{Anchors}; 1911 | $p->{Links} = $results{$uri}{parsing}{Links}; 1912 | return $p; 1913 | } 1914 | 1915 | $p = W3C::LinkChecker->new(); 1916 | $p->{base} = $base_uri; 1917 | 1918 | $p->{csp} = $response->header('Content-Security-Policy'); 1919 | 1920 | my $stype = $response->header("Content-Style-Type"); 1921 | $p->{style_is_css} = !$stype || is_content_type($stype, "text/css"); 1922 | 1923 | my $start; 1924 | if (!$Opts{Summary_Only}) { 1925 | $start = &get_timestamp(); 1926 | print("Parsing...\n"); 1927 | } 1928 | 1929 | # Content-Encoding etc already decoded in get_document(). 1930 | my $docref = $response->content_ref(); 1931 | 1932 | # Count lines beforehand if needed (for progress indicator, or CSS while 1933 | # we don't get any line context out of the parser). In case of HTML, the 1934 | # actual final number of lines processed shown is populated by our 1935 | # end_document handler. 1936 | $p->{Total} = ($$docref =~ tr/\n//) 1937 | if ($response->{IsCss} || $Opts{Progress}); 1938 | 1939 | # We only look for anchors if we are not interested in the links 1940 | # obviously, or if we are running a recursive checking because we 1941 | # might need this information later 1942 | $p->{only_anchors} = !($links || $rec_needs_links); 1943 | 1944 | if ($response->{IsCss}) { 1945 | 1946 | # Parse as CSS 1947 | 1948 | $p->parse_css($$docref, LINE_UNKNOWN()); 1949 | } 1950 | else { 1951 | 1952 | # Parse as HTML 1953 | 1954 | # Transform into for parsing 1955 | # Processing instructions are not parsed by process, but in this case 1956 | # it should be. It's expensive, it's horrible, but it's the easiest way 1957 | # for right now. 1958 | $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ 1959 | unless $p->{only_anchors}; 1960 | 1961 | $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/); 1962 | 1963 | $p->parse($$docref)->eof(); 1964 | } 1965 | 1966 | $response->content(""); 1967 | 1968 | if (!$Opts{Summary_Only}) { 1969 | my $stop = &get_timestamp(); 1970 | print "\r" if $Opts{Progress}; 1971 | &hprintf(" done (%d lines in %s seconds).\n", 1972 | $p->{Total}, &time_diff($start, $stop)); 1973 | } 1974 | 1975 | # Save the results before exiting 1976 | $results{$uri}{parsing}{base} = $p->{base}; 1977 | $results{$uri}{parsing}{Anchors} = $p->{Anchors}; 1978 | $results{$uri}{parsing}{Links} = $p->{Links}; 1979 | 1980 | return $p; 1981 | } 1982 | 1983 | #################################### 1984 | # Constructor for W3C::LinkChecker # 1985 | #################################### 1986 | 1987 | sub new 1988 | { 1989 | my $p = HTML::Parser::new(@_, api_version => 3); 1990 | $p->utf8_mode(1); 1991 | 1992 | # Set up handlers 1993 | 1994 | $p->handler(start => 'start', 'self, tagname, attr, line'); 1995 | $p->handler(end => 'end', 'self, tagname, line'); 1996 | $p->handler(text => 'text', 'self, dtext, line'); 1997 | $p->handler( 1998 | declaration => sub { 1999 | my $self = shift; 2000 | $self->declaration(substr($_[0], 2, -1)); 2001 | }, 2002 | 'self, text, line' 2003 | ); 2004 | $p->handler(end_document => 'end_document', 'self, line'); 2005 | if ($Opts{Progress}) { 2006 | $p->handler(default => 'parse_progress', 'self, line'); 2007 | $p->{last_percentage} = 0; 2008 | } 2009 | 2010 | # Check ? 2011 | $p->{check_name} = 1; 2012 | 2013 | # Check <[..] id="..">? 2014 | $p->{check_id} = 1; 2015 | 2016 | # Don't interpret comment loosely 2017 | $p->strict_comment(1); 2018 | 2019 | return $p; 2020 | } 2021 | 2022 | ################################################# 2023 | # Record or return the doctype of the document # 2024 | ################################################# 2025 | 2026 | sub doctype 2027 | { 2028 | my ($self, $dc) = @_; 2029 | return $self->{doctype} unless $dc; 2030 | $_ = $self->{doctype} = $dc; 2031 | 2032 | # What to look for depending on the doctype 2033 | 2034 | # Check for ? 2035 | $self->{check_name} = 0 2036 | if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %; 2037 | 2038 | # Check for <* id="...">? 2039 | $self->{check_id} = 0 2040 | if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%); 2041 | 2042 | # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...) 2043 | $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%); 2044 | 2045 | return; 2046 | } 2047 | 2048 | ################################### 2049 | # Print parse progress indication # 2050 | ################################### 2051 | 2052 | sub parse_progress 2053 | { 2054 | my ($self, $line) = @_; 2055 | return unless defined($line) && $line > 0 && $self->{Total} > 0; 2056 | 2057 | my $percentage = int($line / $self->{Total} * 100); 2058 | if ($percentage != $self->{last_percentage}) { 2059 | printf("\r%4d%%", $percentage); 2060 | $self->{last_percentage} = $percentage; 2061 | } 2062 | 2063 | return; 2064 | } 2065 | 2066 | ############################# 2067 | # Extraction of the anchors # 2068 | ############################# 2069 | 2070 | sub get_anchor 2071 | { 2072 | my ($self, $tag, $attr) = @_; 2073 | 2074 | my $anchor = $self->{check_id} ? $attr->{id} : undef; 2075 | if ($self->{check_name} && ($tag eq 'a')) { 2076 | 2077 | # @@@@ In XHTML, is mandatory 2078 | # Force an error if it's not the case (or if id's and name's values 2079 | # are different) 2080 | # If id is defined, name if defined must have the same value 2081 | $anchor ||= $attr->{name}; 2082 | } 2083 | 2084 | return $anchor; 2085 | } 2086 | 2087 | ############################# 2088 | # W3C::LinkChecker handlers # 2089 | ############################# 2090 | 2091 | sub add_link 2092 | { 2093 | my ($self, $uri, $base, $line, $mixedcontent) = @_; 2094 | if (defined($uri)) { 2095 | 2096 | # Remove repeated slashes after the . or .. in relative links, to avoid 2097 | # duplicated checking or infinite recursion. 2098 | $uri =~ s|^(\.\.?/)/+|$1|o; 2099 | $uri = Encode::decode_utf8($uri); 2100 | $uri = URI->new_abs($uri, $base) if defined($base); 2101 | $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}{$mixedcontent}++; 2102 | } 2103 | return; 2104 | } 2105 | 2106 | sub start 2107 | { 2108 | my ($self, $tag, $attr, $line) = @_; 2109 | $line = LINE_UNKNOWN() unless defined($line); 2110 | 2111 | # Anchors 2112 | my $anchor = $self->get_anchor($tag, $attr); 2113 | $self->{Anchors}{$anchor}{$line}++ if defined($anchor); 2114 | 2115 | # Links 2116 | if (!$self->{only_anchors}) { 2117 | 2118 | my $tag_local_base = undef; 2119 | 2120 | # Special case: base/@href 2121 | # @@@TODO: The reason for handling ourselves is that LWP's 2122 | # head parsing magic fails at least for responses that have 2123 | # Content-Encodings: https://rt.cpan.org/Ticket/Display.html?id=54361 2124 | if ($tag eq 'base') { 2125 | 2126 | # Ignore with missing/empty href. 2127 | $self->{base} = $attr->{href} 2128 | if (defined($attr->{href}) && length($attr->{href})); 2129 | } 2130 | 2131 | # Special case: meta[@http-equiv=Refresh]/@content 2132 | elsif ($tag eq 'meta') { 2133 | if ($attr->{'http-equiv'} && 2134 | lc($attr->{'http-equiv'}) eq 'refresh') 2135 | { 2136 | my $content = $attr->{content}; 2137 | if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) { 2138 | $self->add_link($1, undef, $line, MC_BLOCK); 2139 | } 2140 | } 2141 | } 2142 | 2143 | # Special case: tags that have "local base" 2144 | elsif ($tag eq 'applet' || $tag eq 'object') { 2145 | if (my $codebase = $attr->{codebase}) { 2146 | 2147 | # Applet codebases are directories, append trailing slash 2148 | # if it's not there so that new_abs does the right thing. 2149 | $codebase .= "/" if ($tag eq 'applet' && $codebase !~ m|/$|); 2150 | 2151 | # TODO: HTML 4 spec says applet/@codebase may only point to 2152 | # subdirs of the directory containing the current document. 2153 | # Should we do something about that? 2154 | $tag_local_base = URI->new_abs($codebase, $self->{base}); 2155 | } 2156 | } 2157 | 2158 | # Link attributes: 2159 | if (my $link_attrs = LINK_ATTRS()->{$tag}) { 2160 | for my $attrdesc (@$link_attrs) { 2161 | my ($mixedcontent, $la) = @$attrdesc; 2162 | my $attributes = []; 2163 | my $skip = 0; 2164 | if ($mixedcontent == MC_CONDITIONAL) { 2165 | if ($tag eq 'img') { 2166 | if ($attr->{crossorigin}) { 2167 | $mixedcontent = MC_BLOCK; 2168 | } else { 2169 | $mixedcontent = MC_OPTIONAL; 2170 | } 2171 | } elsif ($tag eq 'link') { 2172 | my @blocked_link_types = ('stylesheet', 'prefetch', 'preload', 'preconnect', 'prerender', 'modulepreload', 'icon', 'dns-prefetch'); 2173 | my @link_types = map{ lc($_)} split(/\s+/, $attr->{rel}); 2174 | for my $type (@link_types) { 2175 | if ( grep (/^$type$/, @blocked_link_types)) { 2176 | $mixedcontent = MC_BLOCK; 2177 | last; 2178 | } 2179 | } 2180 | if ($mixedcontent == MC_CONDITIONAL) { 2181 | $mixedcontent = MC_NONE; 2182 | } 2183 | } 2184 | } 2185 | if ($tag eq 'link') { 2186 | my @skipped_link_types = ('preconnect', 'dns-prefetch'); 2187 | my @link_types = map{ lc($_)} split(/\s+/, $attr->{rel}); 2188 | for my $type (@link_types) { 2189 | if ( grep (/^$type$/, @skipped_link_types)) { 2190 | $skip = 1; 2191 | last; 2192 | } 2193 | } 2194 | } 2195 | if (!$skip) { 2196 | $self->add_link($attr->{$la}, $tag_local_base, $line, $mixedcontent); 2197 | } 2198 | } 2199 | } 2200 | 2201 | # List of links attributes: 2202 | if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) { 2203 | my ($sep, $subsep, $attrdesc) = @$link_attrs; 2204 | my ($mixedcontent, $la) = @$attrdesc; 2205 | if (defined(my $value = $attr->{$la})) { 2206 | for my $comp (split($sep, $value)) { 2207 | for my $link (split($subsep, $comp)) { 2208 | $self->add_link($link, $tag_local_base, $line, $mixedcontent); 2209 | } 2210 | } 2211 | } 2212 | } 2213 | 2214 | # Inline CSS: 2215 | delete $self->{csstext}; 2216 | if ($tag eq 'style') { 2217 | $self->{csstext} = '' 2218 | if ((!$attr->{type} && $self->{style_is_css}) || 2219 | is_content_type($attr->{type}, "text/css")); 2220 | } 2221 | elsif ($self->{style_is_css} && (my $style = $attr->{style})) { 2222 | $style = CSS::DOM::Style::parse($style); 2223 | $self->parse_style($style, $line); 2224 | } 2225 | } 2226 | 2227 | $self->parse_progress($line) if $Opts{Progress}; 2228 | return; 2229 | } 2230 | 2231 | sub end 2232 | { 2233 | my ($self, $tagname, $line) = @_; 2234 | 2235 | $self->parse_css($self->{csstext}, $line) if ($tagname eq 'style'); 2236 | delete $self->{csstext}; 2237 | 2238 | $self->parse_progress($line) if $Opts{Progress}; 2239 | return; 2240 | } 2241 | 2242 | sub parse_css 2243 | { 2244 | my ($self, $css, $line) = @_; 2245 | return unless $css; 2246 | 2247 | my $sheet = CSS::DOM::parse($css); 2248 | for my $rule (@{$sheet->cssRules()}) { 2249 | if ($rule->type() == IMPORT_RULE()) { 2250 | $self->add_link($rule->href(), $self->{base}, $line, MC_BLOCK); 2251 | } 2252 | elsif ($rule->type == STYLE_RULE()) { 2253 | $self->parse_style($rule->style(), $line); 2254 | } 2255 | } 2256 | return; 2257 | } 2258 | 2259 | sub parse_style 2260 | { 2261 | my ($self, $style, $line) = @_; 2262 | return unless $style; 2263 | 2264 | for (my $i = 0, my $len = $style->length(); $i < $len; $i++) { 2265 | my $prop = $style->item($i); 2266 | my $val = $style->getPropertyValue($prop); 2267 | 2268 | while ($val =~ /$CssUrl/go) { 2269 | my $url = CSS::DOM::Util::unescape($2); 2270 | # FIXME: are there CSS rules with URLs that aren't mixed content blockable? 2271 | $self->add_link($url, $self->{base}, $line, MC_OPTIONAL); 2272 | } 2273 | } 2274 | 2275 | return; 2276 | } 2277 | 2278 | sub declaration 2279 | { 2280 | my ($self, $text, $line) = @_; 2281 | 2282 | # Extract the doctype 2283 | my @declaration = split(/\s+/, $text, 4); 2284 | if ($#declaration >= 3 && 2285 | $declaration[0] eq 'DOCTYPE' && 2286 | lc($declaration[1]) eq 'html') 2287 | { 2288 | 2289 | # Parse the doctype declaration 2290 | if ($text =~ 2291 | m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i 2292 | ) 2293 | { 2294 | 2295 | # Store the doctype 2296 | $self->doctype($1) if $1; 2297 | 2298 | # If there is a link to the DTD, record it 2299 | $self->add_link($3, undef, $line, MC_NONE) 2300 | if (!$self->{only_anchors} && $3); 2301 | } 2302 | } 2303 | 2304 | $self->text($text) unless $self->{only_anchors}; 2305 | 2306 | return; 2307 | } 2308 | 2309 | sub text 2310 | { 2311 | my ($self, $text, $line) = @_; 2312 | $self->{csstext} .= $text if defined($self->{csstext}); 2313 | $self->parse_progress($line) if $Opts{Progress}; 2314 | return; 2315 | } 2316 | 2317 | sub end_document 2318 | { 2319 | my ($self, $line) = @_; 2320 | $self->{Total} = $line; 2321 | delete $self->{csstext}; 2322 | return; 2323 | } 2324 | 2325 | sub validate_pdf_fragment 2326 | { 2327 | my ($uri, $fragments) = @_; 2328 | mark_fragments_as_not_checked($uri, $fragments); 2329 | # TODO: Check the rules defined in 2330 | # https://pdfobject.com/pdf/pdf_open_parameters_acro8.pdf 2331 | return; 2332 | } 2333 | 2334 | sub mark_fragments_as_not_checked 2335 | { 2336 | my ($uri, $fragments) = @_; 2337 | for my $fragment (keys %$fragments) { 2338 | $results{$uri}{fragments}{$fragment} = -1; 2339 | } 2340 | return; 2341 | } 2342 | 2343 | ################################ 2344 | # Check the validity of a link # 2345 | ################################ 2346 | 2347 | sub check_validity (\$\$$\%\%\$\$) 2348 | { 2349 | my ($referer, $uri, $want_links, $links, $redirects, $mixedcontent_lines, $csp) = @_; 2350 | 2351 | # $referer is the URI object of the document checked 2352 | # $uri is the URI object of the target that we are verifying 2353 | # $want_links is true if we're interested in links in the target doc 2354 | # $links is a hash of the links in the documents checked 2355 | # $redirects is a map of the redirects encountered 2356 | # $mixedcontent_lines is an array of lines where the link appears 2357 | # in a mixedcontent context (if any) 2358 | # $csp is an array of values from the content-security-policy header 2359 | # of the page 2360 | 2361 | # Get the document with the appropriate method: GET if there are 2362 | # fragments to check or links are wanted, HEAD is enough otherwise. 2363 | my $fragments = $links->{$uri}{fragments} || {}; 2364 | my $method = ($want_links || %$fragments) ? 'GET' : 'HEAD'; 2365 | 2366 | my $response; 2367 | my $being_processed = 0; 2368 | if (!defined($results{$uri}) || 2369 | ($method eq 'GET' && $results{$uri}{method} eq 'HEAD')) 2370 | { 2371 | $being_processed = 1; 2372 | $response = &get_uri($method, $uri, $referer); 2373 | 2374 | # Get the information back from get_uri() 2375 | &record_results($uri, $method, $response, $referer, $mixedcontent_lines, $csp); 2376 | 2377 | # Record the redirects 2378 | &record_redirects($redirects, $response); 2379 | } 2380 | elsif (!($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}))) { 2381 | my $ref = $results{$uri}{referer}; 2382 | &hprintf("Already checked%s\n", $ref ? ", referrer $ref" : "."); 2383 | } 2384 | 2385 | # We got the response of the HTTP request. Stop here if it was a HEAD. 2386 | return if ($method eq 'HEAD'); 2387 | 2388 | # There are fragments. Parse the document. 2389 | my $p; 2390 | if ($being_processed) { 2391 | if (!defined($results{$uri}{location}{type})) { 2392 | &hprintf("Can't check content: Content-Type for '%s' is undefined.\n", 2393 | $uri) 2394 | if ($Opts{Verbose}); 2395 | $response->content(""); 2396 | return; 2397 | } elsif ($results{$uri}{location}{type} =~ "application/pdf") { 2398 | validate_pdf_fragment($uri, $fragments); 2399 | $response->content(""); 2400 | return; 2401 | } elsif ($results{$uri}{location}{type} !~ $ContentTypes) { 2402 | &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n", 2403 | $uri, $results{$uri}{location}{type}) 2404 | if ($Opts{Verbose}); 2405 | # TODO: find content types for which fragment are defined 2406 | # Maybe starting from https://en.wikipedia.org/wiki/URI_fragment#Examples 2407 | mark_fragments_as_not_checked($uri, $fragments); 2408 | $response->content(""); 2409 | return; 2410 | } 2411 | 2412 | # Do it then 2413 | if (my $error = decode_content($response)) { 2414 | &hprintf("%s\n.", $error); 2415 | } 2416 | # @@@TODO: this isn't the best thing to do if a decode error occurred 2417 | $p = 2418 | &parse_document($uri, $response->base(), $response, 0, 2419 | $want_links); 2420 | } 2421 | else { 2422 | 2423 | # We already had the information 2424 | $p->{Anchors} = $results{$uri}{parsing}{Anchors}; 2425 | } 2426 | 2427 | # Check that the fragments exist 2428 | for my $fragment (keys %$fragments) { 2429 | if (defined($p->{Anchors}{$fragment}) || 2430 | &escape_match($fragment, $p->{Anchors}) || 2431 | # per https://html.spec.whatwg.org/multipage/browsing-the-web.html#scroll-to-fragid:ascii-case-insensitive 2432 | # 'top' is always a valid fragment in HTML 2433 | (!$response->{IsCss} && $fragment eq 'top') || 2434 | # TODO #top is OK in HTML documents 2435 | grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) 2436 | { 2437 | $results{$uri}{fragments}{$fragment} = 1; 2438 | } 2439 | else { 2440 | $results{$uri}{fragments}{$fragment} = 0; 2441 | } 2442 | } 2443 | return; 2444 | } 2445 | 2446 | sub escape_match ($\%) 2447 | { 2448 | my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]); 2449 | for my $b (keys %$hash) { 2450 | return 1 if ($a eq URI::Escape::uri_unescape($b)); 2451 | } 2452 | return 0; 2453 | } 2454 | 2455 | ########################## 2456 | # Ask for authentication # 2457 | ########################## 2458 | 2459 | sub authentication ($;$$$$) 2460 | { 2461 | my ($response, $cookie, $params, $check_num, $is_start) = @_; 2462 | 2463 | my $realm = ''; 2464 | if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) { 2465 | $realm = $1; 2466 | } 2467 | 2468 | if ($Opts{Command_Line}) { 2469 | printf STDERR <<'EOF', $response->request()->url(), $realm; 2470 | 2471 | Authentication is required for %s. 2472 | The realm is "%s". 2473 | Use the -u and -p options to specify a username and password and the -d option 2474 | to specify trusted domains. 2475 | EOF 2476 | } 2477 | else { 2478 | 2479 | printf( 2480 | "Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n", 2481 | $response->www_authenticate(), 2482 | $cookie ? "Set-Cookie: $cookie\n" : "", 2483 | ); 2484 | 2485 | printf( 2486 | "%s 2487 | 2488 | 2489 | W3C Link Checker: 401 Authorization Required 2490 | %s 2491 | ", $DocType, $Head 2492 | ); 2493 | &banner(': 401 Authorization Required'); 2494 | &print_form($params, $cookie, $check_num) if $is_start; 2495 | printf( 2496 | '

2497 | %s 2498 | You need "%s" access to %s to perform link checking.
2499 | ', 2500 | &status_icon(401), 2501 | &encode($realm), (&encode($response->request()->url())) x 2 2502 | ); 2503 | 2504 | my $host = $response->request()->url()->host(); 2505 | if ($Opts{Trusted} && $host !~ $Opts{Trusted}) { 2506 | printf <<'EOF', &encode($Opts{Trusted}), &encode($host); 2507 | This service has been configured to send authentication only to hostnames 2508 | matching the regular expression %s, but the hostname 2509 | %s does not match it. 2510 | EOF 2511 | } 2512 | 2513 | print "

\n"; 2514 | } 2515 | return; 2516 | } 2517 | 2518 | ################## 2519 | # Get statistics # 2520 | ################## 2521 | 2522 | sub get_timestamp () 2523 | { 2524 | return pack('LL', Time::HiRes::gettimeofday()); 2525 | } 2526 | 2527 | sub time_diff ($$) 2528 | { 2529 | my @start = unpack('LL', $_[0]); 2530 | my @stop = unpack('LL', $_[1]); 2531 | for ($start[1], $stop[1]) { 2532 | $_ /= 1_000_000; 2533 | } 2534 | return (sprintf("%.2f", ($stop[0] + $stop[1]) - ($start[0] + $start[1]))); 2535 | } 2536 | 2537 | ######################## 2538 | # Handle the redirects # 2539 | ######################## 2540 | 2541 | # Record the redirects in a hash 2542 | sub record_redirects (\%$) 2543 | { 2544 | my ($redirects, $response) = @_; 2545 | for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) { 2546 | 2547 | # Check for redirect match. 2548 | my $from = $prev->request()->url(); 2549 | my $to = $response->request()->url(); # same on every loop iteration 2550 | my $from_to = $from . '->' . $to; 2551 | my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}}; 2552 | 2553 | # print STDERR "Result $match of redirect checking $from_to\n"; 2554 | if ($match) { next; } 2555 | 2556 | $match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}}; 2557 | 2558 | # print STDERR "Result $match of regexp checking $from_to\n"; 2559 | if ($match) { next; } 2560 | 2561 | my $c = $prev->code(); 2562 | if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) { 2563 | next; 2564 | } 2565 | 2566 | $redirects->{$prev->request()->url()} = $response->request()->url(); 2567 | } 2568 | return; 2569 | } 2570 | 2571 | # Determine if a request is redirected 2572 | sub is_redirected ($%) 2573 | { 2574 | my ($uri, %redirects) = @_; 2575 | return (defined($redirects{$uri})); 2576 | } 2577 | 2578 | # Get a list of redirects for a URI 2579 | sub get_redirects ($%) 2580 | { 2581 | my ($uri, %redirects) = @_; 2582 | my @history = ($uri); 2583 | my %seen = ($uri => 1); # for tracking redirect loops 2584 | my $loop = 0; 2585 | while ($redirects{$uri}) { 2586 | $uri = $redirects{$uri}; 2587 | push(@history, $uri); 2588 | if ($seen{$uri}) { 2589 | $loop = 1; 2590 | last; 2591 | } 2592 | else { 2593 | $seen{$uri}++; 2594 | } 2595 | } 2596 | return ($loop, @history); 2597 | } 2598 | 2599 | #################################################### 2600 | # Tool for sorting the unique elements of an array # 2601 | #################################################### 2602 | 2603 | sub sort_unique (@) 2604 | { 2605 | my %saw; 2606 | @saw{@_} = (); 2607 | # Treat "(N/A)" as 0 2608 | return sort { (($a =~ /(\d+)/)[0] || 0) <=> (($b =~ /(\d+)/)[0] || 0) } keys %saw; 2609 | } 2610 | 2611 | ##################### 2612 | # Print the results # 2613 | ##################### 2614 | 2615 | sub line_number ($) 2616 | { 2617 | my $line = shift; 2618 | return $line if ($line >= 0); 2619 | return "(N/A)"; 2620 | } 2621 | 2622 | sub http_rc ($) 2623 | { 2624 | my $rc = shift; 2625 | return $rc if ($rc >= 0); 2626 | return "(N/A)"; 2627 | } 2628 | 2629 | # returns true if the given code is informational 2630 | sub informational ($) 2631 | { 2632 | my $rc = shift; 2633 | return $rc == RC_ROBOTS_TXT() || 2634 | $rc == RC_IP_DISALLOWED() || 2635 | $rc == RC_PROTOCOL_DISALLOWED(); 2636 | } 2637 | 2638 | sub anchors_summary (\%\%) 2639 | { 2640 | my ($anchors, $errors) = @_; 2641 | 2642 | # Number of anchors found. 2643 | my $n = scalar(keys(%$anchors)); 2644 | if (!$Opts{Quiet}) { 2645 | if ($Opts{HTML}) { 2646 | print("

Anchors

\n

"); 2647 | } 2648 | else { 2649 | print("Anchors\n\n"); 2650 | } 2651 | &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's'); 2652 | print("

\n") if $Opts{HTML}; 2653 | } 2654 | 2655 | # List of the duplicates, if any. 2656 | my @errors = keys %{$errors}; 2657 | if (!scalar(@errors)) { 2658 | print("

Valid anchors!

\n") 2659 | if (!$Opts{Quiet} && $Opts{HTML} && $n); 2660 | return; 2661 | } 2662 | undef $n; 2663 | 2664 | print_doc_header(); 2665 | print('

') if $Opts{HTML}; 2666 | print('List of duplicate and empty anchors'); 2667 | print <<'EOF' if $Opts{HTML}; 2668 |

2669 | 2670 | 2671 | 2672 | 2673 | 2674 | 2675 | 2676 | 2677 | EOF 2678 | print("\n"); 2679 | 2680 | for my $anchor (@errors) { 2681 | my $format; 2682 | my @unique = &sort_unique( 2683 | map { line_number($_) } 2684 | keys %{$anchors->{$anchor}} 2685 | ); 2686 | if ($Opts{HTML}) { 2687 | $format = "\n"; 2688 | } 2689 | else { 2690 | my $s = (scalar(@unique) > 1) ? 's' : ''; 2691 | $format = "\t%s\tLine$s: %s\n"; 2692 | } 2693 | printf($format, 2694 | &encode(length($anchor) ? $anchor : 'Empty anchor'), 2695 | join(', ', @unique)); 2696 | } 2697 | 2698 | print("\n
AnchorLines
%s%s
\n") if $Opts{HTML}; 2699 | 2700 | return; 2701 | } 2702 | 2703 | sub show_link_report (\%\%\%\%\@;$\%) 2704 | { 2705 | my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_; 2706 | 2707 | print("\n
") if $Opts{HTML}; 2708 | print("\n") if (!$Opts{Quiet}); 2709 | 2710 | # Process each URL 2711 | my ($c, $previous_c); 2712 | for my $u (@$urls) { 2713 | my @fragments = keys %{$broken->{$u}{fragments}}; 2714 | 2715 | # Did we get a redirect? 2716 | my $redirected = &is_redirected($u, %$redirects); 2717 | 2718 | # List of lines 2719 | my @total_lines; 2720 | push(@total_lines, keys(%{$links->{$u}{location}})); 2721 | for my $f (@fragments) { 2722 | push(@total_lines, keys(%{$links->{$u}{fragments}{$f}})) 2723 | unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()})); 2724 | } 2725 | 2726 | my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects); 2727 | my $currloc = $results->{$u}{location}; 2728 | 2729 | # Error type 2730 | $c = &code_shown($u, $results); 2731 | 2732 | # What to do 2733 | my $whattodo = ''; 2734 | my $redirect_too; 2735 | if ($todo) { 2736 | if ($u =~ m/^javascript:/) { 2737 | if ($Opts{HTML}) { 2738 | $whattodo = 2739 | 'You should change this link: people using a browser without JavaScript support 2740 | will not be able to follow this link.'; 2741 | } 2742 | else { 2743 | $whattodo = 2744 | 'Change this link: people using a browser without JavaScript support will not be able to follow this link.'; 2745 | } 2746 | } 2747 | elsif ($c == RC_ROBOTS_TXT()) { 2748 | $whattodo = 2749 | 'The link was not checked due to robots exclusion ' . 2750 | 'rules. Check the link manually.'; 2751 | } 2752 | elsif ($redirect_loop) { 2753 | $whattodo = 2754 | 'Retrieving the URI results in a redirect loop, that should be ' 2755 | . 'fixed. Examine the redirect sequence to see where the loop ' 2756 | . 'occurs.'; 2757 | } 2758 | else { 2759 | $whattodo = $todo->{$c}; 2760 | } 2761 | } 2762 | elsif (defined($redirects{$u})) { 2763 | 2764 | # Redirects 2765 | if (($u . '/') eq $redirects{$u}) { 2766 | $whattodo = 2767 | 'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.'; 2768 | } 2769 | elsif ($c == 307 || $c == 302) { 2770 | $whattodo = 2771 | 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.'; 2772 | } 2773 | elsif ($c == 301 || $c == 308) { 2774 | $whattodo = 2775 | 'This is a permanent redirect. The link should be updated.'; 2776 | } 2777 | } 2778 | 2779 | my @unique = &sort_unique(map { line_number($_) } @total_lines); 2780 | my $lines_list = join(', ', @unique); 2781 | my $s = (scalar(@unique) > 1) ? 's' : ''; 2782 | undef @unique; 2783 | 2784 | my @http_codes = ($currloc->{code}); 2785 | unshift(@http_codes, $currloc->{orig}) if $currloc->{orig}; 2786 | @http_codes = map { http_rc($_) } @http_codes; 2787 | 2788 | if ($Opts{HTML}) { 2789 | 2790 | # Style stuff 2791 | my $idref = ''; 2792 | if ($codes && (!defined($previous_c) || ($c != $previous_c))) { 2793 | $idref = ' id="d' . $doc_count . 'code_' . $c . '"'; 2794 | $previous_c = $c; 2795 | } 2796 | 2797 | # Main info 2798 | for (@redirects_urls) { 2799 | $_ = &show_url($_); 2800 | } 2801 | 2802 | # HTTP message 2803 | my $http_message; 2804 | if ($currloc->{message}) { 2805 | $http_message = &encode($currloc->{message}); 2806 | if ($c == 404 || $c == 500) { 2807 | $http_message = 2808 | '' . $http_message . ''; 2809 | } 2810 | } 2811 | my $redirmsg = 2812 | $redirect_loop ? ' redirect loop detected' : ''; 2813 | printf(" 2814 | %s Line%s: %s %s 2815 |
Status: %s %s %s
2816 |

%s %s

\n", 2817 | 2818 | # Anchor for return codes 2819 | $idref, 2820 | 2821 | # Color 2822 | &status_icon($c), 2823 | $s, 2824 | 2825 | # List of lines 2826 | $lines_list, 2827 | 2828 | # List of redirects 2829 | $redirected ? 2830 | join(' redirected to ', @redirects_urls) . $redirmsg : 2831 | &show_url($u), 2832 | 2833 | # Realm 2834 | defined($currloc->{realm}) ? 2835 | sprintf('Realm: %s
', &encode($currloc->{realm})) : 2836 | '', 2837 | 2838 | # HTTP original message 2839 | # defined($currloc->{orig_message}) 2840 | # ? &encode($currloc->{orig_message}). 2841 | # ' -> ' 2842 | # : '', 2843 | 2844 | # Response code chain 2845 | join( 2846 | ' -> ', 2847 | map { &encode($_) } @http_codes), 2848 | 2849 | # HTTP final message 2850 | $http_message, 2851 | 2852 | # What to do 2853 | $whattodo || "?", 2854 | 2855 | # Redirect too? 2856 | $redirect_too ? 2857 | sprintf(' %s', 2858 | &bgcolor(301), $redirect_too) : 2859 | '', 2860 | ); 2861 | if ($#fragments >= 0) { 2862 | printf("
Broken fragments:
    \n"); 2863 | } 2864 | } 2865 | else { 2866 | my $redirmsg = $redirect_loop ? ' redirect loop detected' : ''; 2867 | printf( 2868 | "\n%s\t%s\n Code: %s %s\n%s\n", 2869 | 2870 | # List of redirects 2871 | $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u, 2872 | 2873 | # List of lines 2874 | $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '', 2875 | 2876 | # Response code chain 2877 | join(' -> ', @http_codes), 2878 | 2879 | # HTTP message 2880 | $currloc->{message} || '', 2881 | 2882 | # What to do 2883 | wrap(' To do: ', ' ', $whattodo) 2884 | ); 2885 | if ($#fragments >= 0) { 2886 | if ($currloc->{code} == 200) { 2887 | print("The following fragments need to be fixed:\n"); 2888 | } 2889 | else { 2890 | print("Fragments:\n"); 2891 | } 2892 | } 2893 | } 2894 | 2895 | # Fragments 2896 | for my $f (@fragments) { 2897 | my @unique_lines = 2898 | &sort_unique(keys %{$links->{$u}{fragments}{$f}}); 2899 | my $plural = (scalar(@unique_lines) > 1) ? 's' : ''; 2900 | my $unique_lines = join(', ', @unique_lines); 2901 | if ($Opts{HTML}) { 2902 | printf("
  • %s#%s (line%s %s)
  • \n", 2903 | &encode($u), &encode($f), $plural, $unique_lines); 2904 | } 2905 | else { 2906 | printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines); 2907 | } 2908 | } 2909 | 2910 | print("
\n") if ($Opts{HTML} && scalar(@fragments)); 2911 | } 2912 | 2913 | # End of the table 2914 | print("
\n") if $Opts{HTML}; 2915 | 2916 | return; 2917 | } 2918 | 2919 | sub show_mixedcontent_report (\@\%) 2920 | { 2921 | my ($mc_blocked_urls, $blocked) = @_; 2922 | 2923 | print("\n
") if $Opts{HTML}; 2924 | print("\n") if (!$Opts{Quiet}); 2925 | print("\n
") if $Opts{HTML}; 2926 | for my $url (@$mc_blocked_urls) { 2927 | if ($Opts{HTML}) { 2928 | printf("
%s
\n", &show_url($url)) ; 2929 | printf("
Blocked in mixed content context on lines: %s
\n", join(", ", @{%$blocked{$url}})); 2930 | } else { 2931 | printf("\n%s\n", $url) ; 2932 | print "Blocked in mixed content context on lines: "; 2933 | print join(", ", @{%$blocked{$url}}); 2934 | print "\n\n"; 2935 | 2936 | } 2937 | } 2938 | } 2939 | 2940 | sub code_shown ($$) 2941 | { 2942 | my ($u, $results) = @_; 2943 | 2944 | if ($results->{$u}{location}{record} == 200) { 2945 | return $results->{$u}{location}{orig} || 2946 | $results->{$u}{location}{record}; 2947 | } 2948 | else { 2949 | return $results->{$u}{location}{record}; 2950 | } 2951 | } 2952 | 2953 | sub links_summary (\%\%\%\%\%) 2954 | { 2955 | 2956 | # Advices to fix the problems 2957 | 2958 | my %todo = ( 2959 | 200 => 2960 | 'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).', 2961 | 300 => 2962 | 'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.', 2963 | 301 => 2964 | 'This is a permanent redirect. The link should be updated to point to the more recent URI.', 2965 | 302 => 2966 | 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.', 2967 | 303 => 2968 | 'This rare status code points to a "See Other" resource. There is generally nothing to be done.', 2969 | 307 => 2970 | 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.', 2971 | 308 => 2972 | 'This is a permanent redirect. The link should be updated to point to the more recent URI.', 2973 | 400 => 2974 | 'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.', 2975 | 401 => 2976 | "The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.", 2977 | 403 => 2978 | 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.', 2979 | 404 => 2980 | 'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.', 2981 | 405 => 2982 | 'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically. Check the link manually.', 2983 | 406 => 2984 | "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.", 2985 | 407 => 'The link is a proxy, but requires Authentication.', 2986 | 408 => 'The request timed out.', 2987 | 410 => 'The resource is gone. You should remove this link.', 2988 | 415 => 'The media type is not supported.', 2989 | 500 => 'This is a server side problem. Check the URI.', 2990 | 501 => 2991 | 'Could not check this link: method not implemented or scheme not supported.', 2992 | 503 => 2993 | 'The server cannot service the request, for some unknown reason.', 2994 | 2995 | # Non-HTTP codes: 2996 | RC_ROBOTS_TXT() => sprintf( 2997 | 'The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.', 2998 | $Opts{HTML} ? ( 2999 | '', '', 3000 | "", '' 3001 | ) : ('') x 4 3002 | ), 3003 | RC_DNS_ERROR() => 3004 | 'The hostname could not be resolved. Check the link for typos.', 3005 | RC_IP_DISALLOWED() => 3006 | sprintf( 3007 | 'The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.', 3008 | $Opts{HTML} ? 3009 | ('', '') : 3010 | ('') x 2), 3011 | RC_PROTOCOL_DISALLOWED() => 3012 | 'Accessing links with this URI scheme has been disabled in link checker.', 3013 | RC_MIXEDCONTENT_BLOCK() => 3014 | 'The link is blocked as mixed-content. The resource needs to be fetched over https.', 3015 | ); 3016 | my %priority = ( 3017 | 410 => 1, 3018 | 404 => 2, 3019 | 403 => 5, 3020 | 200 => 10, 3021 | 300 => 15, 3022 | 401 => 20 3023 | ); 3024 | 3025 | my ($links, $results, $broken, $redirects, $blocked) = @_; 3026 | 3027 | # List of the broken links 3028 | my @urls = keys %{$broken}; 3029 | my @mc_blocked_urls = keys %{$blocked}; 3030 | my @dir_redirect_urls = (); 3031 | if ($Opts{Redirects}) { 3032 | 3033 | # Add the redirected URI's to the report 3034 | for my $l (keys %$redirects) { 3035 | next 3036 | unless (defined($results->{$l}) && 3037 | defined($links->{$l}) && 3038 | !defined($broken->{$l})); 3039 | 3040 | # Check whether we have a "directory redirect" 3041 | # e.g. https://www.w3.org/TR -> https://www.w3.org/TR/ 3042 | my ($redirect_loop, @redirects) = get_redirects($l, %$redirects); 3043 | if ($#redirects == 1) { 3044 | push(@dir_redirect_urls, $l); 3045 | next; 3046 | } 3047 | push(@urls, $l); 3048 | } 3049 | } 3050 | 3051 | @urls = grep { defined($results->{$_}{location}{record}) } @urls; 3052 | 3053 | # Broken links and redirects 3054 | if ($#urls < 0) { 3055 | if (!$Opts{Quiet}) { 3056 | print_doc_header(); 3057 | if ($Opts{HTML}) { 3058 | print "

Links

\n

Valid links!

\n"; 3059 | } 3060 | else { 3061 | print "\nValid links.\n"; 3062 | } 3063 | } 3064 | } 3065 | else { 3066 | print_doc_header(); 3067 | print('

') if $Opts{HTML}; 3068 | print("\nList of broken links and other issues"); 3069 | 3070 | #print(' and redirects') if $Opts{Redirects}; 3071 | 3072 | # Sort the URI's by HTTP Code 3073 | my %code_summary; 3074 | my @idx; 3075 | for my $u (@urls) { 3076 | if (defined($results->{$u}{location}{record})) { 3077 | my $c = &code_shown($u, $results); 3078 | $code_summary{$c}++; 3079 | push(@idx, $c); 3080 | } 3081 | } 3082 | my @sorted = @urls[ 3083 | sort { 3084 | defined($priority{$idx[$a]}) ? 3085 | defined($priority{$idx[$b]}) ? 3086 | $priority{$idx[$a]} <=> $priority{$idx[$b]} : 3087 | -1 : 3088 | defined($priority{$idx[$b]}) ? 1 : 3089 | $idx[$a] <=> $idx[$b] 3090 | } 0 .. $#idx 3091 | ]; 3092 | @urls = @sorted; 3093 | undef(@sorted); 3094 | undef(@idx); 3095 | 3096 | if ($Opts{HTML}) { 3097 | 3098 | # Print a summary 3099 | print <<'EOF'; 3100 |

3101 |

There are issues with the URLs listed below. The table summarizes the 3102 | issues and suggested actions by HTTP response status code.

3103 | 3104 | 3105 | 3106 | 3107 | 3108 | 3109 | 3110 | 3111 | 3112 | EOF 3113 | for my $code (sort(keys(%code_summary))) { 3114 | printf('', &bgcolor($code)); 3115 | printf('', 3116 | $doc_count, $code, http_rc($code)); 3117 | printf('', $code_summary{$code}); 3118 | printf('', $todo{$code} || "?"); 3119 | print "\n"; 3120 | } 3121 | print "\n
CodeOccurrencesWhat to do
%s%s%s
\n"; 3122 | } 3123 | else { 3124 | print(':'); 3125 | } 3126 | &show_link_report($links, $results, $broken, $redirects, \@urls, 1, 3127 | \%todo); 3128 | } 3129 | 3130 | # Show directory redirects 3131 | if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) { 3132 | print_doc_header(); 3133 | print('

') if $Opts{HTML}; 3134 | print("\nList of redirects"); 3135 | print( 3136 | "

\n

The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.

" 3137 | ) if $Opts{HTML}; 3138 | &show_link_report($links, $results, $broken, $redirects, 3139 | \@dir_redirect_urls); 3140 | } 3141 | 3142 | # Show mixedcontent blocked URLs 3143 | if ($#mc_blocked_urls > -1) { 3144 | print_doc_header(); 3145 | print('

') if $Opts{HTML}; 3146 | print("\nList of URLs blocked because they're mixed content"); 3147 | print( 3148 | "

\n

The URLs below are not necessarily broken, but they will be not loaded correctly since they’re http: URLs loaded in an https: page and will be blocked as mixed content.

" 3149 | ) if $Opts{HTML}; 3150 | &show_mixedcontent_report(\@mc_blocked_urls, \%$blocked); 3151 | } 3152 | return; 3153 | } 3154 | 3155 | ############################################################################### 3156 | 3157 | ################ 3158 | # Global stats # 3159 | ################ 3160 | 3161 | sub global_stats () 3162 | { 3163 | my $stop = &get_timestamp(); 3164 | my $n_docs = 3165 | ($doc_count <= $Opts{Max_Documents}) ? $doc_count : 3166 | $Opts{Max_Documents}; 3167 | return sprintf( 3168 | 'Checked %d document%s in %s seconds.', 3169 | $n_docs, 3170 | ($n_docs == 1) ? '' : 's', 3171 | &time_diff($timestamp, $stop) 3172 | ); 3173 | } 3174 | 3175 | ################## 3176 | # HTML interface # 3177 | ################## 3178 | 3179 | sub html_header ($$) 3180 | { 3181 | my ($uri, $cookie) = @_; 3182 | 3183 | my $title = defined($uri) ? $uri : ''; 3184 | $title = ': ' . $title if ($title =~ /\S/); 3185 | 3186 | my $headers = ''; 3187 | if (!$Opts{Command_Line}) { 3188 | $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $uri; 3189 | $headers .= "Content-Type: text/html; charset=utf-8\n"; 3190 | $headers .= "Set-Cookie: $cookie\n" if $cookie; 3191 | 3192 | # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same 3193 | # print() statement as the last header 3194 | $headers .= "Content-Language: en\n\n"; 3195 | } 3196 | 3197 | my $onload = $uri ? '' : 3198 | ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"'; 3199 | 3200 | print $headers, $DocType, " 3201 | 3202 | 3203 | W3C Link Checker", &encode($title), " 3204 | ", $Head, " 3205 | '; 3206 | &banner($title); 3207 | return; 3208 | } 3209 | 3210 | sub banner ($) 3211 | { 3212 | my $tagline = "Check links and anchors in Web pages or full Web sites"; 3213 | 3214 | printf( 3215 | <<'EOF', $Cfg{Doc_Images_URI} . 'w3c.png', $tagline); 3216 | 3220 |
3221 | EOF 3222 | return; 3223 | } 3224 | 3225 | sub status_icon($) 3226 | { 3227 | my ($code) = @_; 3228 | my $icon_type; 3229 | my $r = HTTP::Response->new($code); 3230 | if ($r->is_success()) { 3231 | $icon_type = 'error' 3232 | ; # if is success but reported, it's because of broken frags => error 3233 | } 3234 | elsif (&informational($code)) { 3235 | $icon_type = 'info'; 3236 | } 3237 | elsif ($code == 300) { 3238 | $icon_type = 'info'; 3239 | } 3240 | elsif ($code == 401) { 3241 | $icon_type = 'error'; 3242 | } 3243 | elsif ($r->is_redirect()) { 3244 | $icon_type = 'warning'; 3245 | } 3246 | elsif ($r->is_error()) { 3247 | $icon_type = 'error'; 3248 | } 3249 | else { 3250 | $icon_type = 'error'; 3251 | } 3252 | return sprintf('%s', 3253 | $Cfg{Doc_Images_URI} . 'info_icons/' . $icon_type . '.png', 3254 | $icon_type); 3255 | } 3256 | 3257 | sub bgcolor ($) 3258 | { 3259 | my ($code) = @_; 3260 | my $class; 3261 | my $r = HTTP::Response->new($code); 3262 | if ($r->is_success()) { 3263 | return ''; 3264 | } 3265 | elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) { 3266 | $class = 'dubious'; 3267 | } 3268 | elsif ($code == 300) { 3269 | $class = 'multiple'; 3270 | } 3271 | elsif ($code == 401) { 3272 | $class = 'unauthorized'; 3273 | } 3274 | elsif ($r->is_redirect()) { 3275 | $class = 'redirect'; 3276 | } 3277 | elsif ($r->is_error()) { 3278 | $class = 'broken'; 3279 | } 3280 | else { 3281 | $class = 'broken'; 3282 | } 3283 | return (' class="' . $class . '"'); 3284 | } 3285 | 3286 | sub show_url ($) 3287 | { 3288 | my ($url) = @_; 3289 | return sprintf('%s', (&encode($url)) x 2); 3290 | } 3291 | 3292 | sub html_footer () 3293 | { 3294 | printf("

%s

\n", &global_stats()) 3295 | if ($doc_count > 0 && !$Opts{Quiet}); 3296 | if (!$doc_count) { 3297 | print <<'EOF'; 3298 |
3299 |

3300 | This Link Checker looks for issues in links, anchors and referenced objects 3301 | in a Web page, CSS style sheet, or recursively on a whole Web site. For 3302 | best results, it is recommended to first ensure that the documents checked 3303 | use Valid (X)HTML Markup and 3304 | CSS. The Link Checker is 3305 | part of the W3C's validators and 3306 | Quality Web tools. 3307 |

3308 |
3309 | EOF 3310 | } 3311 | printf(<<'EOF', $Cfg{Doc_URI}, $PACKAGE, $REVISION); 3312 |
3313 | 3326 | 3327 | 3328 | EOF 3329 | return; 3330 | } 3331 | 3332 | sub print_form (\%$$) 3333 | { 3334 | my ($params, $cookie, $check_num) = @_; 3335 | 3336 | # Split params on \0, see CGI's docs on Vars() 3337 | while (my ($key, $value) = each(%$params)) { 3338 | if ($value) { 3339 | my @vals = split(/\0/, $value, 2); 3340 | $params->{$key} = $vals[0]; 3341 | } 3342 | } 3343 | 3344 | # Override undefined values from the cookie, if we got one. 3345 | my $valid_cookie = 0; 3346 | if ($cookie) { 3347 | my %cookie_values = $cookie->value(); 3348 | if (!$cookie_values{clear}) 3349 | { # XXX no easy way to check if cookie expired? 3350 | $valid_cookie = 1; 3351 | while (my ($key, $value) = each(%cookie_values)) { 3352 | $params->{$key} = $value unless defined($params->{$key}); 3353 | } 3354 | } 3355 | } 3356 | 3357 | my $chk = ' checked="checked"'; 3358 | $params->{hide_type} = 'all' unless $params->{hide_type}; 3359 | 3360 | my $requested_uri = &encode($params->{uri} || ''); 3361 | my $sum = $params->{summary} ? $chk : ''; 3362 | my $red = $params->{hide_redirects} ? $chk : ''; 3363 | my $all = ($params->{hide_type} ne 'dir') ? $chk : ''; 3364 | my $dir = $all ? '' : $chk; 3365 | my $acc = $params->{no_accept_language} ? $chk : ''; 3366 | my $ref = $params->{no_referer} ? $chk : ''; 3367 | my $rec = $params->{recursive} ? $chk : ''; 3368 | my $dep = &encode($params->{depth} || ''); 3369 | 3370 | my $cookie_options = ''; 3371 | if ($valid_cookie) { 3372 | $cookie_options = " 3373 | 3374 | 3375 | "; 3376 | } 3377 | else { 3378 | $cookie_options = " 3379 | "; 3380 | } 3381 | 3382 | print "
3384 |

3386 |

3388 |
3389 | More Options 3390 |
3391 |

3392 | 3394 |
3395 | 3398 | 3400 | 3402 |
3403 | 3406 |
3407 | 3410 |
3411 | , 3415 | 3417 |

", $cookie_options, " 3418 |

3419 |
3420 |
3421 |

3422 |
3423 |
3424 | "; 3425 | return; 3426 | } 3427 | 3428 | sub encode (@) 3429 | { 3430 | return $Opts{HTML} ? HTML::Entities::encode(@_) : @_; 3431 | } 3432 | 3433 | sub hprintf (@) 3434 | { 3435 | print_doc_header(); 3436 | if (!$Opts{HTML}) { 3437 | printf(@_); 3438 | } 3439 | else { 3440 | print HTML::Entities::encode(sprintf($_[0], @_[1 .. @_ - 1])); 3441 | } 3442 | return; 3443 | } 3444 | 3445 | # Print the document header, if it hasn't been printed already. 3446 | # This is invoked before most other output operations, in order 3447 | # to enable quiet processing that doesn't clutter the output with 3448 | # "Processing..." messages when nothing else will be reported. 3449 | sub print_doc_header () 3450 | { 3451 | if (defined($doc_header)) { 3452 | print $doc_header; 3453 | undef($doc_header); 3454 | } 3455 | } 3456 | 3457 | # Local Variables: 3458 | # mode: perl 3459 | # indent-tabs-mode: nil 3460 | # cperl-indent-level: 4 3461 | # cperl-continued-statement-offset: 4 3462 | # cperl-brace-offset: -4 3463 | # perl-indent-level: 4 3464 | # End: 3465 | # ex: ts=4 sw=4 et 3466 | -------------------------------------------------------------------------------- /bin/checklink.pod: -------------------------------------------------------------------------------- 1 | =encoding utf8 2 | 3 | =head1 NAME 4 | 5 | checklink - check the validity of links in an HTML or XHTML document 6 | 7 | =head1 SYNOPSIS 8 | 9 | B [ I ] I ... 10 | 11 | =head1 DESCRIPTION 12 | 13 | This manual page documents briefly the B command, a.k.a. the 14 | W3C® Link Checker. 15 | 16 | B is a program that reads an HTML or XHTML document, 17 | extracts a list of anchors and links and checks that no anchor is 18 | defined twice and that all the links are dereferenceable, including 19 | the fragments. It warns about HTTP redirects, including directory 20 | redirects, and can check recursively a part of a web site. 21 | 22 | The program can be used either as a command line tool or as a CGI script. 23 | 24 | =head1 OPTIONS 25 | 26 | This program follow the usual GNU command line syntax, with long options 27 | starting with two dashes (`-'). A summary of options is included below. 28 | 29 | =over 5 30 | 31 | =item B<-?, -h, --help> 32 | 33 | Show summary of options. 34 | 35 | =item B<-V, --version> 36 | 37 | Output version information. 38 | 39 | =item B<-s, --summary> 40 | 41 | Show result summary only. 42 | 43 | =item B<-b, --broken> 44 | 45 | Show only the broken links, not the redirects. 46 | 47 | =item B<-e, --dir-redirects> 48 | 49 | Hide directory redirects - e.g. L -> 50 | L. 51 | 52 | =item B<-r, --recursive> 53 | 54 | Check the documents linked from the first one. 55 | 56 | =item B<-D, --depth> I 57 | 58 | Check the documents linked from the first one to depth I 59 | (implies B<--recursive>). 60 | 61 | =item B<-l, --location> I 62 | 63 | Scope of the documents checked (implies B<--recursive>). 64 | Can be specified multiple times in order to specify multiple recursion 65 | bases. If the URI of a candidate document is downwards relative to any of 66 | the bases, it is considered to be within the scope. If not specified, the 67 | default is the base URI of the initial document, for example for 68 | L it would be 69 | L. 70 | 71 | =item B<-X, --exclude> I 72 | 73 | Do not check links whose full, canonical URIs match I. Note that 74 | this option limits recursion the same way as B<--exclude-docs> with the same 75 | regular expression would. 76 | 77 | =item B<--exclude-docs> I 78 | 79 | In recursive mode, do not check links in documents whose full, canonical 80 | URIs match I. This option may be specified multiple times. 81 | 82 | =item B<--suppress-redirect> IURI> 83 | 84 | Do not report a redirect from the first to the second URI. The "-E" is 85 | literal text. This option may be specified multiple times. Whitespace may 86 | be used instead of "-E" to separate the URIs. 87 | 88 | =item B<--suppress-redirect-prefix> IURI> 89 | 90 | Do not report a redirect from a child of the first URI to the same child of 91 | the second URI. The \"->\" is literal text. This option may be specified 92 | multiple times. Whitespace may be used instead of "-E" to separate the 93 | URIs. 94 | 95 | =item B<--suppress-temp-redirects> 96 | 97 | Do not report warnings about temporary redirects. 98 | 99 | =item B<--suppress-broken> I 100 | 101 | Do not report a broken link with the given CODE. CODE is the HTTP 102 | response, or -1 for robots exclusion. The ":" is literal text. This 103 | option may be specified multiple times. Whitespace may be used instead of 104 | ":" to separate the CODE and the URI. 105 | 106 | =item B<--suppress-fragment> I 107 | 108 | Do not report the given broken fragment URI. A fragment URI contains "#". 109 | This option may be specified multiple times. 110 | 111 | =item B<-L, --languages> I 112 | 113 | The C HTTP header to send. In command line mode, 114 | this header is not sent by default. The special value C causes 115 | a value to be detected from the C environment variable, and sent 116 | if found. In CGI mode, the default is to send the value received from 117 | the client as is. 118 | 119 | =item B<-c, --cookies> I 120 | 121 | Use cookies, load/save them in I. The special value 122 | C causes non-persistent use of cookies, i.e. they are used but 123 | only stored in memory for the duration of this link checker run. 124 | 125 | =item B<-R, --no-referer> 126 | 127 | Do not send the C HTTP header. 128 | 129 | =item B<-q, --quiet> 130 | 131 | No output if no errors are found. Implies B<--summary>. 132 | 133 | =item B<-v, --verbose> 134 | 135 | Verbose mode. 136 | 137 | =item B<-i, --indicator> 138 | 139 | Show progress while parsing as percentage of lines processed. No 140 | indicator is shown for documents containing no linefeeds. 141 | 142 | =item B<-u, --user> I 143 | 144 | Specify a username for authentication. 145 | 146 | =item B<-p, --password> I 147 | 148 | Specify a password for authentication. 149 | 150 | =item B<--hide-same-realm> 151 | 152 | Hide 401's that are in the same realm as the document checked. 153 | 154 | =item B<-S, --sleep> I 155 | 156 | Sleep the specified number of seconds between requests to each server. 157 | Defaults to 1 second, which is also the minimum allowed. 158 | 159 | =item B<-t, --timeout> I 160 | 161 | Timeout for requests, in seconds. The default is 30. 162 | 163 | =item B<-C, --connection-cache> I 164 | 165 | Maximum number of cached connections. Using this option overrides the 166 | C configuration file parameter, see its 167 | documentation below for the default value and more information. 168 | 169 | =item B<-d, --domain> I 170 | 171 | Perl regular expression describing the domain to which the authentication 172 | information (if present) will be sent. The default value can be specified 173 | in the configuration file. See the C entry in the configuration 174 | file description below for more information. 175 | 176 | =item B<--masquerade> I<"real-prefix surrogate-prefix"> 177 | 178 | Perform a simple string substitution: URIs which begin with the 179 | string C are rewritten using the C 180 | before being dereferenced. Useful for making a local 181 | directory masquerade as a remote one. For example: 182 | 183 | --masquerade "http://example.com/x/y/z/ file:///my/local/dir/" 184 | 185 | If the document being checked contains a link to 186 | http://example.com/x/y/z/foo.html, then the local file system will be 187 | checked for file:///my/local/dir/foo.html. 188 | 189 | B<--masquerade> takes a single argument consisting of two URIs, 190 | separated by whitespace. The quote marks are not part of the 191 | argument, but one usual way of providing a value with embedded 192 | whitespace is to enclose it in quotes. 193 | 194 | =item B<-H, --html> 195 | 196 | HTML output. 197 | 198 | =back 199 | 200 | =head1 FILES 201 | 202 | =over 5 203 | 204 | =item F 205 | 206 | The main configuration file. You can use the L environment 207 | variable to override the default location. 208 | 209 | C specifies a regular expression for matching trusted domains 210 | (ie. domains where HTTP basic authentication, if any, will be sent). 211 | The regular expression will be matched case insensitively against host 212 | names. The default behavior (when unset, that is) is to send the 213 | authentication information only to the host which requests it; usually 214 | you don't want to change this. For example, the following configures 215 | I the w3.org domain as trusted: 216 | 217 | Trusted = \.w3\.org$ 218 | 219 | C is a boolean flag indicating whether checking links 220 | on non-public IP addresses is allowed. The default is true in command line 221 | mode and false when run as a CGI script. For example, to disallow checking 222 | non-public IP addresses, regardless of the mode, use: 223 | 224 | Allow_Private_IPs = 0 225 | 226 | C is a comma separated list of additional protocols/URI 227 | schemes that the link checker is not allowed to use. The C and 228 | C schemes are always forbidden, and so is the C scheme when 229 | running as a CGI script. 230 | 231 | Forbidden_Protocols = javascript,mailto 232 | 233 | C and C are formatted URIs to the 234 | respective validators. The C<%s> in these will be replaced with the full 235 | "URI encoded" URI to the document being checked, and shown in the link 236 | checker results view in the online/CGI version. The defaults are: 237 | 238 | Markup_Validator_URI = 239 | http://validator.w3.org/check?uri=%s 240 | CSS_Validator_URI = 241 | http://jigsaw.w3.org/css-validator/validator?uri=%s 242 | 243 | C is a URI used for linking to the documentation, and CSS and 244 | JavaScript files in the dynamically generated content of the link checker. 245 | The default is: 246 | 247 | Doc_URI = http://validator.w3.org/docs/checklink.html 248 | 249 | C is an integer denoting the maximum number of 250 | connections the link checker will keep open at any given time. The 251 | default is: 252 | 253 | Connection_Cache_Size = 2 254 | 255 | =back 256 | 257 | =head1 ENVIRONMENT 258 | 259 | checklink uses the libwww-perl library which has a number of environment 260 | variables affecting its behaviour. See L for some 261 | pointers. 262 | 263 | =over 5 264 | 265 | =item B 266 | 267 | If set, overrides the path to the configuration file. 268 | 269 | =back 270 | 271 | =head1 SEE ALSO 272 | 273 | The documentation for this program is available on the web at 274 | L. 275 | 276 | L, L, L, L, L. 277 | 278 | =head1 AUTHOR 279 | 280 | This program was originally written by Hugo Haas Ehugo@w3.orgE, based 281 | on Renaud Bruyeron's F. It has been enhanced by Ville Skyttä 282 | and many other volunteers since. Use the Ewww-validator@w3.orgE 283 | mailing list for feedback, and see 284 | L for more information. 285 | 286 | This manual page was originally written by Frédéric Schütz 287 | Eschutz@mathgen.chE for the Debian GNU/Linux system (but may 288 | be used by others). 289 | 290 | =head1 COPYRIGHT 291 | 292 | This program is licensed under the W3C® Software License, 293 | L. 294 | 295 | =cut 296 | -------------------------------------------------------------------------------- /docs/checklink.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | W3C Link Checker Documentation 5 | 6 | 7 | 8 | 9 | 10 | 15 |
16 | 26 | 27 |
28 |

About this service

29 |

30 | In order to check the validity of the technical reports that W3C 31 | publishes, the Systems Team has developed a link checker. 32 |

33 |

34 | A first version was developed in by 35 | Renaud Bruyeron. 36 | Since it was lacking some functionalities, 37 | Hugo Haas 38 | rewrote it more or less from scratch in . 39 | It has been improved by Ville Skyttä and many other volunteers since. 40 |

41 |

42 | The source code is available publicly under the 43 | W3C IPR 44 | software notice from 45 | CPAN (released 47 | versions) and a 48 | github repository 49 | (development and archived release versions). 50 |

51 |
52 | 53 |
54 |

What it does

55 | 56 |

57 | The link checker reads an HTML or XHTML document or a CSS style sheet 58 | and extracts a list of anchors and links. 59 |

60 | 61 |

62 | It checks that no anchor is defined twice. 63 |

64 | 65 |

66 | It then checks that all the links are dereferenceable, including 67 | the fragments. It warns about HTTP redirects, including directory 68 | redirects. 69 |

70 | 71 |

72 | It can check recursively a part of a Web site. 73 |

74 | 75 |

76 | There is a command line version and a 77 | CGI version. They both 78 | support HTTP basic 79 | authentication. This is achieved in the CGI version 80 | by passing through the authorization information from the user browser 81 | to the site tested. 82 |

83 |
84 | 85 |
86 |

Use it online

87 |

88 | There is an 89 | online version 90 | of the link checker. 91 |

92 |

93 | In the online version (and in general, when run as a CGI script), 94 | the number of documents that can be checked recursively is limited. 95 |

96 |

97 | Both the command line version and the online one sleep at least one 98 | second between requests to each server to avoid abuses and target 99 | server congestion. 100 |

101 |

Access keys

102 |

103 | The following access keys are implemented throughout the 104 | site in an attempt to help users using screen readers. 105 |

106 |
    107 |
  1. Home: access key 1 leads back to the service's home page.
  2. 108 |
  3. Downloads: access key 2 leads to downloads.
  4. 109 |
  5. Documentation: access key 3 leads to the documentation index for 110 | the service.
  6. 111 |
  7. Feedback: access key 4 leads to the feedback instructions.
  8. 112 |
113 |
114 | 115 |
116 |

Install it locally

117 |

118 | The link checker is written in Perl. It is packaged as a standard 119 | CPAN distribution, and depends on 120 | a few other modules which are also available from CPAN. 121 |

122 | 123 |

Install with the CPAN utility

124 |

If you system has a working installation of Perl, you should be able to install the link checker and its dependencies with a single line from the commandline shell:

125 |

sudo perl -MCPAN -e 'install W3C::LinkChecker' (use without the sudo command if installing from an administrator account).

126 |

If this is the first time you use the CPAN utility, you may have to answer a few setup questions before the tool downloads, builds and installs the link checker.

127 | 128 |

Install by hand

129 |

If for any reason the technique described above is not working or if you prefer installing each package by hand, follow the instructions below:

130 |
    131 |
  1. 132 | Install Perl, version 5.8 133 | or newer. 134 |
  2. 135 |
  3. 136 | You will need the following CPAN 137 | distributions, as well as the distributions they possibly depend on. 138 | Depending on your Perl version, you might already have some of 139 | these installed. Also, the latest versions of these may require a 140 | recent version of Perl. As long as the minimum version requirement(s) 141 | below are satisfied, everything should be fine. The latest version 142 | should not be needed, just get an older version that works with your 143 | Perl. For an introduction to installing Perl modules, 144 | see The CPAN FAQ. 145 |
      146 |
    • W3C-LinkChecker (the link checker itself)
    • 147 |
    • CGI.pm (required for CGI mode only)
    • 148 |
    • Config-General (optional, version 2.06 or newer; required only for reading the (optional) configuration file)
    • 149 |
    • CSS-DOM (version 0.09 or newer)
    • 150 |
    • Encode-Locale (required for command line mode only)
    • 151 |
    • HTML-Parser (version 3.20 or newer)
    • 152 |
    • libwww-perl (version 5.833 or newer)
    • 153 |
    • Net-IP (optional but recommended; required for restricting access to private IP addresses)
    • 154 |
    • TermReadKey (optional but recommended; required only in command line mode for password input)
    • 155 |
    • Time-HiRes
    • 156 |
    • URI (version 1.53 or newer)
    • 157 |
    158 |
  4. 159 |
  5. 160 | Optionally install the link checker configuration file, 161 | etc/checklink.conf contained in the link checker 162 | distribution package into /etc/w3c/checklink.conf 163 | or set the W3C_CHECKLINK_CFG environment variable to the 164 | location where you installed it. 165 |
  6. 166 |
  7. 167 | Optionally, install the checklink script into a location 168 | in your web server which allows execution of CGI scripts (typically a 169 | directory named cgi-bin somewhere below your web server's 170 | root directory). 171 |
  8. 172 |
  9. 173 | See also the README and INSTALL file(s) 174 | included in the above distributions. 175 |
  10. 176 |
177 |

178 | Running checklink --help shows how to 179 | use the command line version. The distribution package also includes 180 | more extensive POD 181 | documentation, use 182 | perldoc checklink (or man checklink on Unixish systems) 183 | to view it. 184 |

185 |

186 | SSL/TLSv1 187 | support for https in the link checker needs support for 188 | it in libwww-perl; see 189 | README.SSL 190 | in the libwww-perl distribution for more information. 191 |

192 |

193 | In online mode, link checker's output should not be buffered to avoid 194 | browser timeouts. The link checker itself does not buffer its output, 195 | but in some cases output buffering needs to be explicitly disabled for 196 | it in the web server running it. One such case is Apache's mod_deflate 197 | compression module which as a side effect results in output buffering; 198 | one way to disable it for the link checker (while leaving it enabled for 199 | other resources if configured so elsewhere) is to add the following 200 | section to an appropriate place in the Apache configuration (assuming the 201 | link checker script's filename is checklink): 202 |

203 |
204 |         <Files checklink>
205 |         SetEnv no-gzip
206 |         </Files>
207 |         
208 |

209 | If you want to enable the authentication capabilities with Apache, 210 | have a look at 211 | Steven Drake's hack. 212 |

213 |

214 | The link checker honors proxy settings from the 215 | scheme_proxy environment variables. See 216 | LWP(3) and 217 | LWP::UserAgent(3)'s 218 | env_proxy method for more information. 219 |

220 |

221 | Some environment variables affect the way how the link checker uses 222 | FTP. 223 | In particular, passive mode is the default. See 224 | Net::FTP(3) 225 | for more information. 226 |

227 |

228 | There are multiple alternatives for configuring the default 229 | NNTP 230 | server for use with news: URIs without explicit hostnames, 231 | see 232 | Net::NNTP(3) 233 | for more information. 234 |

235 |
236 | 237 |
238 |

Robots exclusion

239 |

240 | The link checker honors 241 | robots exclusion 242 | rules. To place rules specific to the W3C Link Checker in 243 | /robots.txt files, sites can use the 244 | W3C-checklink user agent string. For example, to allow 245 | the link checker to access all documents on a server and to disallow 246 | all other robots, one could use the following: 247 |

248 |
249 |           User-Agent: *
250 |           Disallow: /
251 |           User-Agent: W3C-checklink
252 |           Disallow:
253 |         
254 |

255 | Robots exclusion support in the link checker is based on the 256 | LWP::RobotUA 257 | Perl module. It currently supports the 258 | "original 1994 version" 259 | of the standard. The robots META tag, ie. 260 | <meta name="robots" content="...">, is not supported. 261 | Other than that, the link checker's implementation goes all the way 262 | in trying to honor robots exclusion rules; if a 263 | /robots.txt disallows it, not even the first document 264 | submitted as the root for a link checker run is fetched. 265 |

266 |

267 | Note that /robots.txt rules affect only user agents 268 | that honor it; it is not a generic method for access control. 269 |

270 |
271 | 272 |
273 |

Comments, suggestions and bugs

274 |

275 | The current version has proven to be stable. It could however be 276 | improved, see the list of open enhancement ideas and bugs for details. 277 |

278 |
279 |
280 | 281 | 308 | 309 | 310 | -------------------------------------------------------------------------------- /docs/linkchecker.css: -------------------------------------------------------------------------------- 1 | /* 2 | Base Style Sheet for the W3C Link Checker. 3 | 4 | Copyright 2000-2011 W3C (MIT, INRIA, Keio). All Rights Reserved. 5 | See http://www.w3.org/Consortium/Legal/ipr-notice.html#Copyright 6 | */ 7 | 8 | html, body { 9 | line-height: 120%; 10 | color: black; 11 | background: white; 12 | font-family: "Bitstream Vera Sans", sans-serif; 13 | margin: 0; 14 | padding: 0; 15 | border: 0; 16 | } 17 | 18 | #main { 19 | margin: 1em 2em; 20 | } 21 | 22 | #main form { 23 | clear: both; 24 | background: #EAEBEE url(../images/round-tr.png) no-repeat top right; 25 | padding: 0.5em 1.3em; 26 | border-bottom: 1px solid #DCDDE0; 27 | } 28 | 29 | a img { 30 | border: 0; 31 | } 32 | 33 | a:link, a:visited { 34 | text-decoration: underline; 35 | color: #365D95; 36 | } 37 | 38 | a:hover, a:active { 39 | text-decoration: underline; 40 | color: #1F2126; 41 | } 42 | 43 | abbr:hover { 44 | cursor: help; 45 | } 46 | abbr[title], span[title], strong[title] { 47 | border-bottom: thin dotted; 48 | cursor: help; 49 | } 50 | 51 | pre, code { 52 | font-family: "Bitstream Vera Sans Mono", monospace; 53 | line-height: 100%; 54 | white-space: pre; 55 | } 56 | 57 | div.progress pre { 58 | height: 12em; 59 | font-size: small; 60 | overflow: auto; 61 | padding: 0.5em 1.3em; 62 | border: 1px solid #DCDDE0; 63 | margin-top: 0; 64 | } 65 | 66 | div.progress h3 { 67 | margin-bottom: 0; 68 | background: white; 69 | border: 1px solid #DCDDE0; 70 | border-bottom: 0; 71 | padding: .4em .8em; 72 | text-indent: 0; 73 | overflow: hidden; 74 | } 75 | 76 | progress { 77 | 78 | inline-size: 100%; 79 | block-size: 0.15em; 80 | border: 1px solid #DCDDE0; 81 | border-bottom: 0; 82 | } 83 | 84 | fieldset { 85 | border: 0; 86 | padding: 0; 87 | } 88 | legend { 89 | font-size: 1.1em; 90 | padding: 1em 0 0.23em; 91 | letter-spacing: 0.06em; 92 | } 93 | 94 | fieldset p { 95 | margin: 0 !important; 96 | padding: 0.7em 0 0.5em 1em; 97 | border-top: 1px solid #CBCDD5; 98 | background: #EAEBEE url(../images/double.png) left top repeat-x; 99 | } 100 | 101 | input#uri { 102 | font-family: Monaco, "Courier New", Monospace; 103 | font-size: 0.9em; 104 | border: 1px solid #BBB; 105 | border-top: 1px solid #777; 106 | border-bottom: 1px solid #DDD; 107 | background: #FEFEFE url(../images/textbg.png) no-repeat top left; 108 | padding: 0.2em 0.2em; 109 | max-width: 1000px; 110 | font-variant: normal; 111 | width: 95%; 112 | margin: 0.3em 0 0 1em; 113 | } 114 | 115 | p.submit_button input { 116 | display: block; 117 | width: 8em; 118 | text-align: center; 119 | margin: 10px auto; 120 | padding: 0 12px; 121 | -webkit-border-radius: 28px; 122 | -moz-border-radius: 28px; 123 | border-radius: 28px; 124 | -webkit-box-shadow: 0 0 2px #86888e; 125 | -moz-box-shadow: 0 0 2px #86888e; 126 | box-shadow: 0 0 2px #86888e; 127 | color: #365d95; 128 | font-size: 16px; 129 | background-color: #eaebee; 130 | border: solid #86888e 1px; 131 | text-decoration: none; 132 | line-height: 32px; 133 | cursor: pointer; 134 | } 135 | 136 | 137 | p.submit_button input:active, 138 | p.submit_button input:hover { 139 | color: #1f2126; 140 | } 141 | 142 | a:link img, a:visited img { 143 | border-style: none; 144 | } 145 | a img { 146 | color: black; /* The only way to hide the border in NS 4.x */ 147 | } 148 | 149 | ul.toc { 150 | list-style: none; 151 | } 152 | 153 | ol li { 154 | padding: .1em; 155 | } 156 | 157 | th { 158 | text-align: left; 159 | } 160 | 161 | .hideme { 162 | display: none; 163 | } 164 | 165 | @media screen { 166 | /* only hidden visually */ 167 | .progress label span { display: none } 168 | } 169 | 170 | /* These are usually targets and not links */ 171 | h1 a, h1 a:hover, h2 a, h2 a:hover, h3 a, h3 a:hover { 172 | color: inherit; 173 | background-color: inherit; 174 | } 175 | 176 | img { 177 | vertical-align: middle; 178 | } 179 | 180 | address img { 181 | float: right; 182 | width: 88px; 183 | } 184 | 185 | address { 186 | padding: 0 2em; 187 | font-size: small; 188 | text-align: center; 189 | color: #888; 190 | background-color: white; 191 | } 192 | 193 | p.copyright { 194 | margin-top: 5em; 195 | padding-top: .5em; 196 | font-size: xx-small; 197 | max-width: 85ex; 198 | text-align: justify; 199 | text-transform: uppercase; 200 | margin-left: auto; 201 | margin-right:auto; 202 | font-family: "Bitstream Vera Sans Mono", monospace; 203 | color: #888; 204 | line-height: 120%; 205 | } 206 | 207 | p.copyright a { 208 | color: #88F; 209 | text-decoration: none; 210 | } 211 | 212 | /* Various header(ish) things. Definitions cribbed from the CORE Styles. */ 213 | 214 | h1#title { 215 | font-family: "Myriad Web", "Myriad Pro", "Gill Sans", Helvetica, Arial, Sans-Serif; 216 | background-color: #365D95; 217 | color: #FDFDFD; 218 | font-size: 1.6em; 219 | font-weight: normal; 220 | background: url(../images/head-bl.png) bottom left no-repeat; 221 | padding-bottom: 0.430em; 222 | margin: 0; 223 | line-height: 1; 224 | } 225 | h1#title a, h1#title a img { 226 | background-color: #365D95; 227 | } 228 | 229 | h1 span { 230 | border-bottom: 1px solid #6383B1; 231 | border-color: #4E6F9E; 232 | } 233 | 234 | h1#title a:link, h1#title a:hover, h1#title a:visited, h1#title a:active { 235 | color: #FDFDFD !important; 236 | text-decoration: none; 237 | } 238 | 239 | h1#title img { 240 | vertical-align: middle; 241 | margin-right: 0.7em; 242 | } 243 | 244 | p#tagline { 245 | font-size: 0.7em; 246 | margin: -2em 0 0 12.1em; 247 | padding-bottom: 1em; 248 | letter-spacing: 0.1em; 249 | line-height: 100% !important; 250 | color: #D0DCEE; 251 | background-color: transparent; 252 | } 253 | 254 | #banner { 255 | background: #365D95 url(../images/head-br.png) bottom right no-repeat; 256 | margin: 1.5em 2em; 257 | } 258 | 259 | h2 { 260 | font-size: 1.5em; 261 | text-align: left; 262 | font-weight: bold; 263 | font-style: normal; 264 | text-decoration: none; 265 | margin-top: 1em; 266 | margin-bottom: 1em; 267 | line-height: 120%; 268 | } 269 | 270 | h3 { 271 | font-size: 1.3em; 272 | font-weight: normal; 273 | font-style: normal; 274 | text-decoration: none; 275 | background-color: #EEE; 276 | text-indent: 1em; 277 | padding: .2em; 278 | border-top: 1px dotted black; 279 | } 280 | 281 | /* 282 | Navbar 283 | */ 284 | 285 | ul#menu { 286 | text-align: center; 287 | margin: 1em 2em; 288 | background: #EAEBEE url(../images/round-br.png) no-repeat bottom right; 289 | padding: 0.5em 0 0.3em; 290 | border-top: 1px solid #DCDDE0; 291 | } 292 | 293 | ul#menu span { 294 | display: none; 295 | } 296 | 297 | ul#menu a:link, ul#menu a:visited { 298 | background: #EAEBEE; 299 | color: #365D95; 300 | text-decoration: none; 301 | } 302 | 303 | ul#menu a:hover, ul#menu a:active { 304 | color: #1F2126; 305 | text-decoration: underline; 306 | } 307 | 308 | ul#menu li { 309 | display: inline; 310 | margin-right: 0.8em; 311 | } 312 | 313 | /* Results */ 314 | 315 | .report { 316 | width: 100%; 317 | } 318 | 319 | table.report { 320 | border-collapse: collapse; 321 | } 322 | table.report th { 323 | padding: .5em; 324 | background-color: #FCFCFC; 325 | } 326 | table.report td { 327 | padding: .5em; 328 | } 329 | 330 | dl.report { 331 | margin-left: 0 !important; 332 | margin-right: 0 !important; 333 | padding: 0; 334 | border-bottom: 1px solid #EAEBEE; 335 | border-left: 1px solid #EAEBEE; 336 | border-right: 1px solid #EAEBEE; 337 | } 338 | 339 | dl.report dt, dl.report dd { 340 | border-bottom: 0; 341 | } 342 | dl.report dt { 343 | border-top: 1px solid #EAEBEE; 344 | margin-top: .8em; 345 | padding-left: .5em; 346 | padding-top: .5em; 347 | font-weight: bold; 348 | } 349 | dl.report dt span.msg_loc, dl.report dt span.redirected_to { 350 | font-weight: normal; 351 | } 352 | dl.report dd { 353 | border-top: 0; 354 | margin: 0; 355 | text-indent: 0; 356 | padding: 0; 357 | margin-left: 1.5em; 358 | } 359 | dl.report dd.responsecode { 360 | padding-top: 1em; 361 | font-size: smaller; 362 | } 363 | dl.report dd.message_explanation { 364 | font-size: smaller; 365 | margin-bottom: 1.5em; 366 | } 367 | 368 | dl.report dd p{ 369 | padding: 0; 370 | line-height: 150%; 371 | } 372 | 373 | div.settings { 374 | font-size: smaller; 375 | float: right; 376 | } 377 | div.settings ul { 378 | margin: 0; 379 | padding-left: 1.5em; 380 | } 381 | .unauthorized { 382 | background-color: aqua; 383 | } 384 | 385 | .redirect { 386 | font-weight: normal; 387 | font-style: italic; 388 | } 389 | .broken { 390 | color: #A00; 391 | } 392 | dl.report .broken { 393 | font-weight: bold; 394 | } 395 | .multiple { 396 | color: fuchsia; 397 | } 398 | .dubious { 399 | font-style: italic; 400 | } 401 | 402 | span.err_type img { 403 | width: 1.2em; 404 | height: 1.2em; 405 | padding-bottom: .2em; 406 | margin-right: .5em; 407 | vertical-align: middle; 408 | } 409 | 410 | /* W3C includes (donation and sponsorship program) */ 411 | #w3c-include {} 412 | 413 | -------------------------------------------------------------------------------- /docs/linkchecker.js: -------------------------------------------------------------------------------- 1 | function uriOk(num) 2 | { 3 | if (!document.getElementById) { 4 | return true; 5 | } 6 | 7 | var u = document.getElementById('uri_' + num); 8 | var ok = false; 9 | if (u.value.length > 0) { 10 | if (u.value.search) { 11 | ok = (u.value.search(/\S/) !== -1); 12 | } 13 | else { 14 | ok = true; 15 | } 16 | } 17 | if (!ok) { 18 | u.focus(); 19 | } 20 | return ok; 21 | } 22 | 23 | function show_progress(progress_id, progress_text, progress_percentage) 24 | { 25 | var div = document.getElementById("progress" + progress_id); 26 | 27 | var head = div.getElementsByTagName("h3")[0]; 28 | var text = document.createTextNode(progress_text); 29 | var span = document.createElement("span"); 30 | span.appendChild(text); 31 | head.replaceChild(span, head.getElementsByTagName("span")[0]); 32 | 33 | var label = div.getElementsByTagName("label")[0]; 34 | var labelText = label.getElementsByTagName("span")[0]; 35 | labelText.textContent = "Progress: " + progress_percentage; 36 | 37 | var bar = label.getElementsByTagName("progress")[0]; 38 | bar.value = Number.parseFloat(progress_percentage); 39 | bar.textContent = progress_percentage; 40 | 41 | var pre = div.getElementsByTagName("pre")[0]; 42 | pre.scrollTop = pre.scrollHeight; 43 | } 44 | -------------------------------------------------------------------------------- /docs/tests/links_borked.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | Sample HTML Documents with broken links (40X, bogus host, 500, etc) 9 | 10 | 11 | 12 | 13 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/tests/links_ok.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | Sample HTML Documents with no broken link 9 | 10 | 11 | 12 | 13 |

W3C Homepage

14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/tests/links_redirects.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | Sample HTML Documents with no broken link, but different types of redirects 9 | 10 | 11 | 12 | 13 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /etc/checklink.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Configuration file for the W3C Link Checker 3 | # 4 | # See Config::General(3) for the syntax; 'SplitPolicy' is 'equalsign' here. 5 | # 6 | 7 | # 8 | # Trusted is a regular expression for matching "trusted" domains. This is 9 | # used to restrict the domains where HTTP basic authentication will be sent. 10 | # This is matched case insensitively against resoures' hostnames. 11 | # 12 | # Not specifying a value here means that the basic authentication will only 13 | # be sent to the same host where the authentication was requested from. 14 | # 15 | # For example, the following would allow sending the authentication to any 16 | # host in the w3.org domain (and *only* there): 17 | # Trusted = \.w3\.org$ 18 | 19 | 20 | # 21 | # Allow_Private_IPs is a boolean flag (1/0) for specifying whether checking of 22 | # links to non-public RFC 1918 IP addresses is allowed. 23 | # 24 | # The default, ie. not specifying the value here means that checking links 25 | # on non-public IP addresses is disabled when checklink runs as a CGI script, 26 | # and allowed in command line mode. 27 | # 28 | # For example, the following would disallow private IP addresses regardless 29 | # of the mode: 30 | # Allow_Private_IPs = 0 31 | 32 | 33 | # 34 | # Markup_Validator_URI and CSS_Validator_URI are formatted URIs to the 35 | # respective validators. The %s in these will be replaced with the full 36 | # "URI encoded" URI to the document being checked, and shown in the link 37 | # checker results view in the online/CGI version. 38 | # 39 | # Defaults: 40 | # Markup_Validator_URI = http://validator.w3.org/check?uri=%s 41 | # CSS_Validator_URI = http://jigsaw.w3.org/css-validator/validator?uri=%s 42 | 43 | 44 | # 45 | # Doc_URI is the URI to the Link Checker documentation, shown in the 46 | # results report in CGI mode, and the usage message in command line mode. 47 | # The URIs to the CSS and JavaScript files in the generated HTML are also 48 | # formed using this as their base URI. If you have installed the documentation 49 | # locally somewhere, you may wish to change this to point to that location. 50 | # This must be an absolute URI. 51 | # 52 | # Default: 53 | # Doc_URI = http://validator.w3.org/docs/checklink.html 54 | 55 | 56 | # 57 | # Allowed_Protocols is a comma separated list of additional protocols/URI 58 | # schemes that the link checker is allowed to use besides http, https and ftp. 59 | # 60 | # The javascript and mailto schemes for example are forbidden, and so is 61 | # the file scheme when running as a CGI script. 62 | # 63 | # Default: 64 | # Allowed_Protocols = http,https,ftp 65 | 66 | 67 | # 68 | # Connection_Cache_Size is an integer denoting the maximum number of 69 | # connections the link checker will keep open at any given time. 70 | # 71 | # Default: 72 | # Connection_Cache_Size = 2 73 | -------------------------------------------------------------------------------- /etc/perltidyrc: -------------------------------------------------------------------------------- 1 | # perltidy(1) profile for the W3C Link Checker 2 | --standard-error-output 3 | --warning-output 4 | --output-line-ending=unix 5 | --maximum-line-length=79 6 | --indent-columns=4 7 | --continuation-indentation=4 8 | --vertical-tightness=2 9 | --paren-tightness=2 10 | --brace-tightness=2 11 | --square-bracket-tightness=2 12 | --opening-sub-brace-on-new-line 13 | --nospace-for-semicolon 14 | --nooutdent-long-lines 15 | --break-after-all-operators 16 | -------------------------------------------------------------------------------- /images/double.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/double.png -------------------------------------------------------------------------------- /images/grad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/grad.png -------------------------------------------------------------------------------- /images/head-bl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/head-bl.png -------------------------------------------------------------------------------- /images/head-br.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/head-br.png -------------------------------------------------------------------------------- /images/info_icons/README: -------------------------------------------------------------------------------- 1 | * error.png and info.png from: 2 | 3 | information icons set 4 | by: Jakub Jankiewicz 5 | license: public domain 6 | http://openclipart.org/media/files/kuba/2051 7 | 8 | 9 | * warning.png from: 10 | 11 | Warning Notification 12 | by: eastshores 13 | license: public domain 14 | http://openclipart.org/media/files/eastshores/2833 15 | -------------------------------------------------------------------------------- /images/info_icons/error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/info_icons/error.png -------------------------------------------------------------------------------- /images/info_icons/info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/info_icons/info.png -------------------------------------------------------------------------------- /images/info_icons/warning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/info_icons/warning.png -------------------------------------------------------------------------------- /images/no_w3c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/no_w3c.png -------------------------------------------------------------------------------- /images/round-br.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/round-br.png -------------------------------------------------------------------------------- /images/round-tr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/round-tr.png -------------------------------------------------------------------------------- /images/textbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/textbg.png -------------------------------------------------------------------------------- /images/w3c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/w3c.png -------------------------------------------------------------------------------- /t/00compile.t: -------------------------------------------------------------------------------- 1 | use Test::More tests => 2; # -*- perl -*- 2 | use File::Spec (); 3 | 4 | ok(system($^X, '-wTc', File::Spec->catfile('bin', 'checklink')) == 0); 5 | require_ok('W3C::LinkChecker'); 6 | -------------------------------------------------------------------------------- /w3c.json: -------------------------------------------------------------------------------- 1 | { 2 | "contacts": ["dontcallmedom"], 3 | "repo-type": "tool" 4 | } 5 | --------------------------------------------------------------------------------