├── .github
    └── workflows
    │   └── test.yml
├── MANIFEST
├── MANIFEST.SKIP
├── META.yml
├── Makefile.PL
├── NEWS
├── README.md
├── SIGNATURE
├── bin
    ├── checklink
    └── checklink.pod
├── docs
    ├── checklink.html
    ├── linkchecker.css
    ├── linkchecker.js
    └── tests
    │   ├── links_borked.html
    │   ├── links_ok.html
    │   └── links_redirects.html
├── etc
    ├── checklink.conf
    └── perltidyrc
├── images
    ├── double.png
    ├── grad.png
    ├── head-bl.png
    ├── head-br.png
    ├── info_icons
    │   ├── README
    │   ├── error.png
    │   ├── info.png
    │   └── warning.png
    ├── no_w3c.png
    ├── round-br.png
    ├── round-tr.png
    ├── textbg.png
    └── w3c.png
├── t
    └── 00compile.t
└── w3c.json


/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Run test suite
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |   pull_request: {}
 8 | 
 9 | jobs:
10 |   test:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/checkout@v4
14 |         with:
15 |           repository: w3c/linkchecker-testsuite
16 |           path: test-suite
17 |       - uses: actions/setup-python@v4
18 |         with:
19 |           python-version: '3.7'
20 |       - name: Install test suite dependencies
21 |         working-directory: test-suite/harness
22 |         run: |
23 |           python -m pip install --upgrade pip
24 |           pip install -r requirements.txt
25 |       - uses: actions/checkout@v4
26 |         with:
27 |           path: link-checker
28 |       # using https://github.com/shogo82148/actions-setup-perl created
29 |       # issue with installing libcss-dom-perl due to
30 |       # old syntax in package
31 |       # see https://rt.cpan.org/Public/Bug/Display.html?id=146661
32 |       - name: Install perl
33 |         run: sudo apt-get install -y perl cpanminus libcss-dom-perl libwww-perl
34 | 
35 |       - name: Install checker dependencies
36 |         working-directory: link-checker
37 |         run: cpanm --installdeps .
38 |       - name: run test server
39 |         working-directory: test-suite
40 |         run: python server.py &
41 |       - name: install bubblewrap
42 |         run: sudo apt-get install -y bubblewrap
43 |       - name: run test suite
44 |         working-directory: test-suite/harness
45 |         run: bwrap --dev-bind / / --ro-bind ../hosts /etc/hosts sh -c "python linktest.py --checker_path ../../link-checker/bin/checklink run"


--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
 1 | Makefile.PL
 2 | MANIFEST
 3 | META.yml
 4 | NEWS                    Overview of changes between releases
 5 | README                  Start by reading this
 6 | SIGNATURE
 7 | bin/checklink           The link checker
 8 | bin/checklink.pod       Manual page for the link checker
 9 | etc/checklink.conf      Optional configuration file
10 | etc/perltidyrc          perltidy(1) profile
11 | docs/checklink.html     Additional documentation
12 | docs/linkchecker.css    Cascading style sheet used in docs and generated HTML
13 | docs/linkchecker.js     JavaScript used in the generated HTML
14 | images/double.png
15 | images/grad.png
16 | images/head-bl.png
17 | images/head-br.png
18 | images/no_w3c.png
19 | images/round-br.png
20 | images/round-tr.png
21 | images/textbg.png
22 | images/info_icons/README
23 | images/info_icons/error.png
24 | images/info_icons/info.png
25 | images/info_icons/warning.png
26 | lib/W3C/LinkChecker.pm  Dummy *.pm for CPAN indexing purposes
27 | t/00compile.t
28 | 


--------------------------------------------------------------------------------
/MANIFEST.SKIP:
--------------------------------------------------------------------------------
 1 | \.hgignore$
 2 | \.hgtags$
 3 | \bblib\b
 4 | \bpm_to_blib\b
 5 | \.hg\b
 6 | ~$
 7 | \bMANIFEST\.SKIP$
 8 | \bMakefile$
 9 | \bMakefile\.old$
10 | \.tar\.
11 | \bREADME\.cvs$
12 | \bdocs/tests\b
13 | \bimages/w3c\.png$
14 | 


--------------------------------------------------------------------------------
/META.yml:
--------------------------------------------------------------------------------
 1 | --- #YAML:1.0
 2 | name:               W3C-LinkChecker
 3 | version:            4.81
 4 | abstract:           W3C Link Checker
 5 | author:
 6 |     - W3C QA-dev Team <public-qa-dev@w3.org>
 7 | license:            open_source
 8 | distribution_type:  module
 9 | configure_requires:
10 |     ExtUtils::MakeMaker:  0
11 | build_requires:
12 |     ExtUtils::MakeMaker:  0
13 | requires:
14 |     CGI:                  0
15 |     CGI::Carp:            0
16 |     CGI::Cookie:          0
17 |     Config::General:      2.06
18 |     CSS::DOM:             0.09
19 |     CSS::DOM::Constants:  0
20 |     CSS::DOM::Style:      0
21 |     CSS::DOM::Util:       0
22 |     Encode:               0
23 |     Encode::Locale:       0
24 |     File::Spec:           0
25 |     Getopt::Long:         2.17
26 |     HTML::Entities:       0
27 |     HTML::Parser:         3.40
28 |     HTTP::Cookies:        0
29 |     HTTP::Headers::Util:  0
30 |     HTTP::Message:        5.827
31 |     HTTP::Request:        0
32 |     HTTP::Response:       1.50
33 |     Locale::Country:      0
34 |     Locale::Language:     0
35 |     LWP::RobotUA:         1.19
36 |     LWP::UserAgent:       0
37 |     Net::hostent:         0
38 |     Net::HTTP::Methods:   5.833
39 |     Net::IP:              0
40 |     Socket:               0
41 |     Term::ReadKey:        2
42 |     Test::More:           0
43 |     Text::Wrap:           0
44 |     Time::HiRes:          0
45 |     URI:                  1.53
46 |     URI::Escape:          0
47 |     URI::file:            0
48 | resources:
49 |     bugtracker:   https://github.com/w3c/link-checker/issues/
50 |     homepage:     https://validator.w3.org/checklink
51 |     MailingList:  https://lists.w3.org/Archives/Public/www-validator/
52 |     repository:   https://github.com/w3c/link-checker/
53 | no_index:
54 |     directory:
55 |         - t
56 |         - inc
57 | generated_by:       ExtUtils::MakeMaker version 6.56
58 | meta-spec:
59 |     url:      http://module-build.sourceforge.net/META-spec-v1.4.html
60 |     version:  1.4
61 | 


--------------------------------------------------------------------------------
/Makefile.PL:
--------------------------------------------------------------------------------
  1 | use 5.008;
  2 | use ExtUtils::MakeMaker;
  3 | 
  4 | WriteMakefile(
  5 |     NAME         => 'W3C::LinkChecker',
  6 |     ABSTRACT     => 'W3C Link Checker',
  7 |     AUTHOR       => 'W3C QA-dev Team <public-qa-dev@w3.org>',
  8 |     LICENSE      => 'open_source',
  9 |     VERSION_FROM => 'bin/checklink',
 10 |     PREREQ_PM    => {
 11 | 
 12 |         # Hard dependencies:
 13 |         CSS::DOM            => 0.09,
 14 |         CSS::DOM::Constants => 0,
 15 |         CSS::DOM::Style     => 0,
 16 |         CSS::DOM::Util      => 0,
 17 |         Encode              => 0,
 18 |         HTML::Entities      => 0,
 19 |         HTML::Parser        => "3.40",
 20 |         HTTP::Headers::Util => 0,
 21 |         HTTP::Message       => 5.827,
 22 |         HTTP::Request       => 0,
 23 |         HTTP::Response      => "1.50",
 24 |         LWP::RobotUA        => 1.19,
 25 |         LWP::UserAgent      => 0,
 26 |         Net::HTTP::Methods  => 5.833,
 27 |         Time::HiRes         => 0,
 28 |         URI                 => 1.53,
 29 |         URI::Escape         => 0,
 30 | 
 31 |         # Optional, but required if using a config file:
 32 |         Config::General => 2.06,
 33 | 
 34 |         # Optional, but required if private IPs are disallowed:
 35 |         Net::hostent => 0,
 36 |         Net::IP      => 0,
 37 |         Socket       => 0,
 38 | 
 39 |         # Optional, but required in command line mode:
 40 |         File::Spec   => 0,
 41 |         Getopt::Long => 2.17,
 42 |         Text::Wrap   => 0,
 43 |         URI::file    => 0,
 44 | 
 45 |         # Optional, used for password input in command line mode:
 46 |         Term::ReadKey => 2.00,
 47 | 
 48 |         # Optional, used for guessing language in command line mode:
 49 |         Locale::Country  => 0,
 50 |         Locale::Language => 0,
 51 | 
 52 |         # Optional, used when decoding arguments in command line mode:
 53 |         Encode::Locale => 0,
 54 | 
 55 |         # Optional, but required in CGI mode:
 56 |         CGI         => 0,
 57 |         CGI::Carp   => 0,
 58 |         CGI::Cookie => 0,
 59 | 
 60 |         # Optional, required if using cookies:
 61 |         HTTP::Cookies => 0,
 62 | 
 63 |         # Required for the test suite:
 64 |         File::Spec => 0,
 65 |         Test::More => 0,
 66 |     },
 67 |     PM => {'lib/W3C/LinkChecker.pm' => '$(INST_LIB)/W3C/LinkChecker.pm'},
 68 |     EXE_FILES => ['bin/checklink'],
 69 |     MAN1PODS =>
 70 |         {'bin/checklink.pod' => '$(INST_MAN1DIR)/checklink.$(MAN1EXT)',},
 71 |     META_MERGE => {
 72 |         resources => {
 73 |             homepage   => 'https://validator.w3.org/checklink',
 74 |             bugtracker => 'https://github.com/w3c/link-checker/issues/',
 75 |             repository => 'https://github.com/w3/link-checker/',
 76 |             MailingList =>
 77 |                 'https://lists.w3.org/Archives/Public/www-validator/',
 78 |         },
 79 |     },
 80 |     depend => {distdir  => 'lib/W3C/LinkChecker.pm'},
 81 |     dist   => {TARFLAGS => '--owner=0 --group=0 -cvf'},
 82 |     clean  => {FILES    => 'Makefile.PL.bak bin/checklink.bak'},
 83 | );
 84 | 
 85 | sub MY::postamble
 86 | {
 87 |     return <<'MAKE_FRAG';
 88 | lib/W3C/LinkChecker.pm: Makefile.PL bin/checklink
 89 | 	$(MKPATH) lib/W3C
 90 | 	$(ECHO) "# Dummy module for CPAN indexing purposes."  > $@
 91 | 	$(ECHO) "package $(NAME);"                           >> $@
 92 | 	$(ECHO) "use strict;"                                >> $@
 93 | 	$(ECHO) "use vars qw(\$$VERSION);"                   >> $@
 94 | 	$(ECHO) "\$$VERSION = \"$(VERSION)\";"               >> $@
 95 | 	$(ECHO) "1;"                                         >> $@
 96 | 
 97 | PERLTIDY = perltidy --profile=etc/perltidyrc --backup-and-modify-in-place \
 98 | 	--backup-file-extension=/
 99 | 
100 | perltidy:
101 | 	@for file in Makefile.PL bin/checklink ; do \
102 | 		echo "$(PERLTIDY) $$file" ; \
103 | 		$(PERLTIDY) $$file ; \
104 | 	done
105 | MAKE_FRAG
106 | }
107 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
 1 | This document contains information about high level changes between
 2 | Link Checker releases.
 3 | 
 4 | Version 4.81 - 2011-10-16
 5 | - Work around some <base> related problems (#12720, rt.cpan.org#54361).
 6 | - Eliminate some warnings (emitted by code, not from results).
 7 | 
 8 | Version 4.8 - 2011-04-02
 9 | - Avoid some robot delays by improving the order in which links are checked.
10 | - Avoid some unnecessary HEAD requests in recursive mode.
11 | - Clarify output wrt. links that have already been checked.
12 | - Make connection cache size configurable, and increase the default to 2.
13 | - Move JavaScript to an external file.
14 | - Check applet and object archive links.
15 | 
16 | Version 4.7 - 2011-03-17
17 | - Support for IRI.
18 | - Support for more HTML5 links.
19 | - Decode query string parameters as UTF-8.
20 | - Decode command line arguments according to locale.
21 | - New dependencies: Encode-Locale (command line mode only).
22 | - Updated dependencies: libwww-perl >= 5.833, URI >= 1.53.
23 | 
24 | Version 4.6 - 2010-05-01
25 | - Support for checking links in CSS.
26 | - Results UI improvements, added "progress bar".
27 | - Support for larger variety of character and content encodings.
28 | - Support for HTTP responses with > 4kB header lines.
29 | - Additional output suppress options in command line mode.
30 | - Improved heuristics when passed non-absolute URLs.
31 | - Support for cookies (command line only for now).
32 | - More "false positive" failure avoidance efforts for "make test".
33 | - The set of forbidden protocols is now configurable.
34 | - New dependencies: CSS-DOM >= 0.09.
35 | - Updated dependencies: Perl >= 5.8.
36 | 
37 | Version 4.5 - 2009-03-30
38 | - Removed W3C trademarked icons from distribution tarball.
39 | - Avoid "false positive" failures from "make test" in certain setups.
40 | - Make quiet command line mode quieter.
41 | - Lowered default timeout to 30 seconds.
42 | 
43 | Version 4.4 - 2009-02-12
44 | - checking more elements and attributes, such as BLOCKQUOTE cite="", BODY
45 |   background="", EMBED, etc
46 | - Changes in the UI to make it match other validators more closely
47 | - in HTML/cgi output, using javascript to show checklink status as it happens
48 | - added support for HTML5 links
49 | - softer wording for broken link results
50 | - Add non-robot developer mode
51 | - many bug fixes and code cleanup
52 | 
53 | Version 4.3 - 2006-10-22
54 | - Various minor improvements to result output, both in text and HTML modes.
55 | - Fix --quiet and checking multiple documents to match documentation.
56 | - Eliminate various warnings (emitted by code, not from results).
57 | - Documentation improvements.
58 | 
59 | Version 4.2.1 - 2005-05-15
60 | - Include documentation of the reorganized access keys.
61 | 
62 | Version 4.2 - 2005-04-27
63 | - Access key reorganization, making them less likely to conflict with
64 |   browsers' "native" key bindings.
65 | - Redirects are now checked for private IP addresses too.
66 | 
67 | Version 4.1 - 2004-11-24
68 | - Added workarounds against browser timeouts in "summary only" mode.
69 | - Improved caching and reuse of fetched /robots.txt information.
70 | - Fixed a bug where a complete protocol response (including headers)
71 |   was passed to the HTML parser, which caused unexpected behaviour.
72 | - Minor user interface and installation related improvements.
73 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # W3C-LinkChecker
 2 | 
 3 | This distribution contains the W3C Link Checker.
 4 | 
 5 | The link checker can be run as a CGI script in a web server as well as
 6 | on the command line.  The CGI version provides a HTML interface as
 7 | seen at [http://validator.w3.org/checklink](http://validator.w3.org/checklink).
 8 | 
 9 | 
10 | ## Install
11 | 
12 | To install the distribution for command line use:
13 | 
14 | ```sh
15 | git clone https://github.com/w3c/link-checker.git
16 | cd link-checker
17 | #if you have cpanminus installed
18 | cpanm --installdeps .
19 | perl Makefile.PL
20 | make
21 | make test
22 | make install # as root unless you are using local::lib
23 | ```
24 | 
25 | To install the CGI version, in addition to the above, copy the
26 | bin/checklink script into a location in your web server from where
27 | execution of CGI scripts is allowed, and make sure that the web server
28 | user has execute permissions to the script.  The CGI directory is
29 | typically named "cgi-bin" somewhere under your web server root
30 | directory.
31 | 
32 | For more information, please consult the POD documentation in the
33 | checklink.pod file, typically (in the directory where you unpacked the
34 | source):
35 | 
36 | ```sh
37 | perldoc ./bin/checklink.pod
38 | ```
39 | 
40 | ...as well as the HTML documentation in docs/checklink.html.
41 | 
42 | ## As a Docker container
43 | 
44 | You may want to use [@stupchiy](https://github.com/stupchiy)'s [Dockerfile](https://github.com/stupchiy/docker-w3c-checklink/blob/master/Dockerfile), which is
45 | based on Ubuntu Linux, and follow his instructions:
46 | 
47 | ```sh
48 | $ docker build -t link-checker .                                                                        # Build an image
49 | $ docker run -it --rm link-checker                                                                      # Run a container
50 | $ docker run -it --rm link-checker checklink https://foo.bar                                            # Run script directly
51 | $ docker run -it --rm -v "$PWD":/home/checklink link-checker checklink -H https://foo.bar > report.html # Write to HTML file
52 | ```
53 | 
54 | ## Useful links
55 | 
56 | - [Archived bug list](https://www.w3.org/Bugs/Public/buglist.cgi?product=LinkChecker&component=checklink) (Decommissioned Bugzilla) 
57 | - [www-validator MailingList](https://lists.w3.org/Archives/Public/www-validator/) used to discuss the W3C Markup Validation Service, Link Checker and Log Validator. 
58 | 
59 | ## Copyright and License
60 | 
61 | Written by the following people for the W3C:
62 | 
63 | - Hugo Haas <hugo@w3.org>
64 | - Ville Skyttä <ville.skytta@iki.fi>
65 | - W3C QA-dev Team <public-qa-dev@w3.org>
66 | 
67 | Copyright (C) 1994-2023 World Wide Web Consortium Inc.
68 | All Rights Reserved. This work is
69 | distributed under the [W3C Software License](http://www.w3.org/Consortium/Legal/2015/copyright-software-and-document) in the hope that it
70 | will be useful, but WITHOUT ANY WARRANTY; without even the implied
71 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
72 | 


--------------------------------------------------------------------------------
/SIGNATURE:
--------------------------------------------------------------------------------
 1 | This file contains message digests of all files listed in MANIFEST,
 2 | signed via the Module::Signature module, version 0.68.
 3 | 
 4 | To verify the content in this distribution, first make sure you have
 5 | Module::Signature installed, then type:
 6 | 
 7 |     % cpansign -v
 8 | 
 9 | It will check each file's integrity, as well as the signature's
10 | validity.  If "==> Signature verified OK! <==" is not displayed,
11 | the distribution may already have been compromised, and you should
12 | not run its Makefile.PL or Build.PL.
13 | 
14 | -----BEGIN PGP SIGNED MESSAGE-----
15 | Hash: SHA1
16 | 
17 | SHA1 b075772a968f5694bfbb4ce33eadf26566a25f47 MANIFEST
18 | SHA1 e8175087619cebc9d0e0ead8ca06b9d8ee73b678 META.yml
19 | SHA1 ab9150095a45776c2020e5781d19054c7018da8b Makefile.PL
20 | SHA1 05b3e35c8352063f2c99efa9cd3881c208fa1bb0 NEWS
21 | SHA1 f1f868ea73db7d39ab491ebb50c84de76cce4b44 README
22 | SHA1 619d90efc63090552be8926418d69a0364989501 bin/checklink
23 | SHA1 4406433ae670dd4f7be3f2c76d55aefb239e9bc9 bin/checklink.pod
24 | SHA1 b188063249c820f0aa5a34b5f735e8f334a536e1 docs/checklink.html
25 | SHA1 fa101fed018fc8e41beca63a0a667fb94c10a557 docs/linkchecker.css
26 | SHA1 8fa71b54357c9ed6ac8e01ab600120032d35b080 docs/linkchecker.js
27 | SHA1 92d01a8a6e7edcd200d70492f4e551984b97b7a0 etc/checklink.conf
28 | SHA1 87c74944dbc80b5d6ab8aac1d09419607b15efff etc/perltidyrc
29 | SHA1 bcb7896bee3764f85a03ab14495efc233f70e215 images/double.png
30 | SHA1 ff9a7be7fee245dd81a7dc4124544d692a140119 images/grad.png
31 | SHA1 61aeb3ea5616833678f66c7baa6db373eedcd86b images/head-bl.png
32 | SHA1 bcb7bf006b79106309350bfa578e94af80aed82d images/head-br.png
33 | SHA1 11243aa6b3463dd8d6a9b2e69027e42a1d9480ab images/info_icons/README
34 | SHA1 a54abf3d12f207b81e19ea8ce783d37c6200cf40 images/info_icons/error.png
35 | SHA1 3fd2638079cd0698655614a5a5afc97a976a4af4 images/info_icons/info.png
36 | SHA1 552c52188188f560dc02a03200164de3045ac3f4 images/info_icons/warning.png
37 | SHA1 1631ed7d5b20c2c036e61225854134f0674cb10a images/no_w3c.png
38 | SHA1 401b5fba02d0d8484775a4a77503fa0d136b96ce images/round-br.png
39 | SHA1 9eb1ee6188391715284a3db080e6e92d163864d9 images/round-tr.png
40 | SHA1 cc01bd358bc1d6d42ca350ad0a4a42778ca4440e images/textbg.png
41 | SHA1 7587466f1487eb446fe5da1a70d445e7b33efd36 lib/W3C/LinkChecker.pm
42 | SHA1 962ba9fff082c4087239b55618ada2a8f1564464 t/00compile.t
43 | -----BEGIN PGP SIGNATURE-----
44 | Version: GnuPG v1.4.11 (GNU/Linux)
45 | 
46 | iQEVAwUBTpqXHId580Rxl2NsAQJqPAf/XrqTWrlZa9DFkWrnOSxIYsyDGPl14fCl
47 | ohGFL7jBYxdKndEHo2aA7bA95EOypZVxakUIFcizpC5ujbrqasGPEnxinhJQYLqA
48 | S+4G+yzen3DqbbLndd5eIWVLPS5992gXwuLaeZrNFlGv/kG892NSLGfu3JQiePlc
49 | jNNUZ4dvwe+MHSSvs3DEkAPJqeIR7bx55tp+O7n5HX3ab/sqYIaqI2V3tXP/EHFy
50 | PA/Ig9QFQmfB7SY3TFN7iUFuIDRqIQOzC/Ij/WqY1Uj9885zZJvq0GWT/huFvyVG
51 | IGFM+sZp8gr6fr/bkB7de5xCoVUpkCz+mFkIJFQCu1cwJcP9pa81+g==
52 | =W5f6
53 | -----END PGP SIGNATURE-----
54 | 


--------------------------------------------------------------------------------
/bin/checklink:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/perl -wT
   2 | #
   3 | # W3C Link Checker
   4 | # by Hugo Haas <hugo@w3.org>
   5 | # (c) 1999-2023 World Wide Web Consortium
   6 | # based on Renaud Bruyeron's checklink.pl
   7 | #
   8 | # This program is licensed under the W3C(r) Software License:
   9 | #       https://www.w3.org/Consortium/Legal/copyright-software
  10 | #
  11 | # The documentation is at:
  12 | #       https://validator.w3.org/checklink/docs/checklink.html
  13 | #
  14 | # See the Github repository at:
  15 | #       https://github.com/w3c/link-checker/
  16 | #
  17 | # An online version is available at:
  18 | #       https://validator.w3.org/checklink
  19 | #
  20 | # Comments and suggestions should be filed on github
  21 | 
  22 | use strict;
  23 | use 5.008;
  24 | 
  25 | # Get rid of potentially unsafe and unneeded environment variables.
  26 | delete(@ENV{qw(IFS CDPATH ENV BASH_ENV)});
  27 | $ENV{PATH} = undef;
  28 | 
  29 | # ...but we want PERL5?LIB honored even in taint mode, see perlsec, perl5lib,
  30 | # http://www.mail-archive.com/cpan-testers-discuss%40perl.org/msg01064.html
  31 | use Config qw(%Config);
  32 | use lib map { /(.*)/ }
  33 |     defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB}) :
  34 |     defined($ENV{PERLLIB})  ? split(/$Config{path_sep}/, $ENV{PERLLIB}) :
  35 |                               ();
  36 | 
  37 | # -----------------------------------------------------------------------------
  38 | 
  39 | package W3C::UserAgent;
  40 | 
  41 | use LWP::RobotUA 1.19 qw();
  42 | use LWP::UserAgent qw();
  43 | use Net::HTTP::Methods 5.833 qw();    # >= 5.833 for 4kB cookies (#6678)
  44 | 
  45 | # if 0, ignore robots exclusion (useful for testing)
  46 | use constant USE_ROBOT_UA => 1;
  47 | 
  48 | if (USE_ROBOT_UA) {
  49 |     @W3C::UserAgent::ISA = qw(LWP::RobotUA);
  50 | }
  51 | else {
  52 |     @W3C::UserAgent::ISA = qw(LWP::UserAgent);
  53 | }
  54 | 
  55 | sub new
  56 | {
  57 |     my $proto = shift;
  58 |     my $class = ref($proto) || $proto;
  59 |     my ($name, $from, $rules) = @_;
  60 | 
  61 |     # For security/privacy reasons, if $from was not given, do not send it.
  62 |     # Cheat by defining something for the constructor, and resetting it later.
  63 |     my $from_ok = $from;
  64 |     $from ||= 'www-validator@w3.org';
  65 | 
  66 |     my $self;
  67 |     if (USE_ROBOT_UA) {
  68 |         $self = $class->SUPER::new($name, $from, $rules);
  69 |     }
  70 |     else {
  71 |         my %cnf;
  72 |         @cnf{qw(agent from)} = ($name, $from);
  73 |         $self = LWP::UserAgent->new(%cnf);
  74 |         $self = bless $self, $class;
  75 |     }
  76 | 
  77 |     $self->from(undef) unless $from_ok;
  78 | 
  79 |     $self->env_proxy();
  80 | 
  81 |     $self->allow_private_ips(1);
  82 | 
  83 |     $self->protocols_allowed([qw(http https ftp)]);
  84 | 
  85 |     return $self;
  86 | }
  87 | 
  88 | sub allow_private_ips
  89 | {
  90 |     my $self = shift;
  91 |     if (@_) {
  92 |         $self->{Checklink_allow_private_ips} = shift;
  93 |         if (!$self->{Checklink_allow_private_ips}) {
  94 | 
  95 |             # Pull in dependencies
  96 |             require Net::IP;
  97 |             require Socket;
  98 |             require Net::hostent;
  99 |         }
 100 |     }
 101 |     return $self->{Checklink_allow_private_ips};
 102 | }
 103 | 
 104 | sub redirect_progress_callback
 105 | {
 106 |     my $self = shift;
 107 |     $self->{Checklink_redirect_callback} = shift if @_;
 108 |     return $self->{Checklink_redirect_callback};
 109 | }
 110 | 
 111 | sub simple_request
 112 | {
 113 |     my $self = shift;
 114 | 
 115 |     my $response = $self->ip_disallowed($_[0]->uri());
 116 | 
 117 |     # RFC 2616, section 15.1.3
 118 |     $_[0]->remove_header("Referer")
 119 |         if ($_[0]->referer() &&
 120 |         (!$_[0]->uri()->secure() && URI->new($_[0]->referer())->secure()));
 121 | 
 122 |     $response ||= do {
 123 |         local $SIG{__WARN__} =
 124 |             sub {    # Suppress some warnings, rt.cpan.org #18902
 125 |             warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/);
 126 |             };
 127 | 
 128 |         # @@@ Why not just $self->SUPER::simple_request?
 129 |         $self->W3C::UserAgent::SUPER::simple_request(@_);
 130 |     };
 131 | 
 132 |     if (!defined($self->{FirstResponse})) {
 133 |         $self->{FirstResponse} = $response->code();
 134 |         $self->{FirstMessage} = $response->message() || '(no message)';
 135 |     }
 136 | 
 137 |     return $response;
 138 | }
 139 | 
 140 | sub redirect_ok
 141 | {
 142 |     my ($self, $request, $response) = @_;
 143 | 
 144 |     if (my $callback = $self->redirect_progress_callback()) {
 145 | 
 146 |         # @@@ TODO: when an LWP internal robots.txt request gets redirected,
 147 |         # this will a bit confusingly fire for it too.  Would need a robust
 148 |         # way to determine whether the request is such a LWP "internal
 149 |         # robots.txt" one.
 150 |         &$callback($request->method(), $request->uri());
 151 |     }
 152 | 
 153 |     return 0 unless $self->SUPER::redirect_ok($request, $response);
 154 | 
 155 |     if (my $res = $self->ip_disallowed($request->uri())) {
 156 |         $response->previous($response->clone());
 157 |         $response->request($request);
 158 |         $response->code($res->code());
 159 |         $response->message($res->message());
 160 |         return 0;
 161 |     }
 162 | 
 163 |     return 1;
 164 | }
 165 | 
 166 | #
 167 | # Checks whether we're allowed to retrieve the document based on its IP
 168 | # address.  Takes an URI object and returns a HTTP::Response containing the
 169 | # appropriate status and error message if the IP was disallowed, 0
 170 | # otherwise.  URIs without hostname or IP address are always allowed,
 171 | # including schemes where those make no sense (eg. data:, often javascript:).
 172 | #
 173 | sub ip_disallowed
 174 | {
 175 |     my ($self, $uri) = @_;
 176 |     return 0 if $self->allow_private_ips();    # Short-circuit
 177 | 
 178 |     my $hostname = undef;
 179 |     eval { $hostname = $uri->host() };    # Not all URIs implement host()...
 180 |     return 0 unless $hostname;
 181 | 
 182 |     my $addr = my $iptype = my $resp = undef;
 183 |     if (my $host = Net::hostent::gethostbyname($hostname)) {
 184 |         $addr = Socket::inet_ntoa($host->addr()) if $host->addr();
 185 |         if ($addr && (my $ip = Net::IP->new($addr))) {
 186 |             $iptype = $ip->iptype();
 187 |         }
 188 |     }
 189 |     if ($iptype && $iptype ne 'PUBLIC') {
 190 |         $resp = HTTP::Response->new(403,
 191 |             'Checking non-public IP address disallowed by link checker configuration'
 192 |         );
 193 |         $resp->header('Client-Warning', 'Internal response');
 194 |     }
 195 | 
 196 | #    #defaults to 80
 197 |     my $port = undef;
 198 |     $port = $uri->port();
 199 |     #whitelist regex if short enough
 200 |     if ($port !~ m/^(8[0-9]|21|443)$/ && $port < 1024) {
 201 |         $resp = HTTP::Response->new(403,
 202 |             'Checking certain ports disallowed by link checker configuration'
 203 |         );
 204 |         $resp->header('Client-Warning', 'Internal response');
 205 |     }
 206 | 
 207 |     return $resp;
 208 | }
 209 | 
 210 | # -----------------------------------------------------------------------------
 211 | 
 212 | package W3C::LinkChecker;
 213 | 
 214 | use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
 215 |     $DocType $Head $Accept $ContentTypes %Cfg $CssUrl);
 216 | 
 217 | use CSS::DOM 0.09 qw();    # >= 0.09 for many bugfixes
 218 | use CSS::DOM::Constants qw(:rule);
 219 | use CSS::DOM::Style qw();
 220 | use CSS::DOM::Util qw();
 221 | use Encode qw();
 222 | use HTML::Entities qw();
 223 | use HTML::Parser 3.40 qw();    # >= 3.40 for utf8_mode()
 224 | use HTTP::Headers::Util qw();
 225 | use HTTP::Message 5.827 qw();    # >= 5.827 for content_charset()
 226 | use HTTP::Request 5.814 qw();    # >= 5.814 for accept_decodable()
 227 | use HTTP::Response 1.50 qw();    # >= 1.50 for decoded_content()
 228 | use Time::HiRes qw();
 229 | use URI 1.53 qw();               # >= 1.53 for secure()
 230 | use URI::Escape qw();
 231 | use URI::Heuristic qw();
 232 | 
 233 | # @@@ Needs also W3C::UserAgent but can't use() it here.
 234 | 
 235 | use constant RC_ROBOTS_TXT          => -1;
 236 | use constant RC_DNS_ERROR           => -2;
 237 | use constant RC_IP_DISALLOWED       => -3;
 238 | use constant RC_PROTOCOL_DISALLOWED => -4;
 239 | use constant RC_MIXEDCONTENT_BLOCK  => -5;
 240 | 
 241 | use constant LINE_UNKNOWN => -1;
 242 | 
 243 | use constant MP2 =>
 244 |     (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);
 245 | 
 246 | # Mixed content status based on https://w3c.github.io/webappsec-mixed-content/
 247 | use constant MC_NONE        => 0;
 248 | use constant MC_OPTIONAL    => 1;
 249 | use constant MC_CONDITIONAL => 2;
 250 | use constant MC_BLOCK       => 3;
 251 | 
 252 | # Tag=>[[mixedContentStatus, attribute]] mapping of things we treat as links.
 253 | # Note: meta/@http-equiv gets special treatment, see start() for details.
 254 | use constant LINK_ATTRS => {
 255 |     a => [[MC_NONE, 'href']],
 256 | 
 257 |     # base/@href intentionally not checked
 258 |     # https://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi
 259 |     area       => [[MC_NONE, 'href']],
 260 |     audio      => [[MC_OPTIONAL, 'src']],
 261 |     blockquote => [[MC_NONE, 'cite']],
 262 |     body       => [[MC_OPTIONAL, 'background']],
 263 |     command    => [[MC_OPTIONAL, 'icon']], # no spec says how to handle this
 264 |                                           # wrt to mixed content
 265 | 
 266 |     # button/@formaction not checked (side effects)
 267 |     del => [[MC_NONE, 'cite']],
 268 | 
 269 |     # @pluginspage, @pluginurl, @href: pre-HTML5 proprietary
 270 |     embed => [[MC_BLOCK, 'href'], [MC_NONE, 'pluginspage'], [MC_NONE, 'pluginurl'], [MC_BLOCK, 'src']],
 271 | 
 272 |     # form/@action not checked (side effects)
 273 |     frame  => [[MC_BLOCK, 'longdesc'], [MC_BLOCK, 'src']],
 274 |     html   => [[MC_BLOCK, 'manifest']],
 275 |     iframe => [[MC_BLOCK, 'longdesc'], [MC_BLOCK, 'src']],
 276 |     img    => [[MC_BLOCK, 'longdesc'], [MC_CONDITIONAL, 'src']],
 277 | 
 278 |     # input/@action, input/@formaction not checked (side effects)
 279 |     input  => [[MC_BLOCK,'src']],
 280 |     ins    => [[MC_NONE, 'cite']],
 281 |     link   => [[MC_CONDITIONAL, 'href']],
 282 |     object => [[MC_BLOCK, 'data']],
 283 |     q      => [[MC_NONE, 'cite']],
 284 |     script => [[MC_BLOCK, 'src']],
 285 |     source => [[MC_OPTIONAL, 'src']],
 286 |     track  => [[MC_BLOCK, 'src']],
 287 |     video  => [[MC_OPTIONAL, 'src'], [MC_BLOCK, 'poster']],
 288 | };
 289 | 
 290 | # Tag=>[separator, subseparator, attributes] mapping of things we treat as lists of links.
 291 | use constant LINK_LIST_ATTRS => {
 292 |     a      => [qr/\s+/, qr/$/,      [MC_BLOCK, 'ping']],
 293 |     applet => [qr/[\s,]+/, qr/$/,   [MC_BLOCK, 'archive']],
 294 |     area   => [qr/\s+/, qr/$/,      [MC_BLOCK, 'ping']],
 295 |     img    => [qr/\s*,\s*/, qr/\s.*$/,   [MC_BLOCK, 'srcset']],
 296 |     head   => [qr/\s+/,  qr/$/,     [MC_NONE, 'profile']],
 297 |     object => [qr/\s+/,  qr/$/,     [MC_BLOCK, 'archive']],
 298 |     source => [qr/\s*,\s*/, qr/\s.*$/, [MC_BLOCK, 'srcset']]
 299 | };
 300 | 
 301 | # TBD/TODO:
 302 | # - applet/@code?
 303 | # - bgsound/@src?
 304 | # - object/@classid?
 305 | # - isindex/@action?
 306 | # - layer/@background,@src?
 307 | # - ilayer/@background?
 308 | # - table,tr,td,th/@background?
 309 | # - xmp/@href?
 310 | 
 311 | @W3C::LinkChecker::ISA = qw(HTML::Parser);
 312 | 
 313 | BEGIN {
 314 | 
 315 |     # Version info
 316 |     $PACKAGE  = 'W3C Link Checker';
 317 |     $PROGRAM  = 'W3C-checklink';
 318 |     $VERSION  = '5.0.0';
 319 |     $REVISION = sprintf('version %s (c) 1999-2019 W3C', $VERSION);
 320 |     $AGENT    = sprintf(
 321 |         '%s/%s',
 322 |         $PROGRAM, $VERSION
 323 |     );
 324 | 
 325 |     # Pull in mod_perl modules if applicable.
 326 |     eval {
 327 |         local $SIG{__DIE__} = undef;
 328 |         require Apache2::RequestUtil;
 329 |     } if MP2();
 330 | 
 331 |     my @content_types = qw(
 332 |         text/html
 333 |         application/xhtml+xml;q=0.9
 334 |         application/vnd.wap.xhtml+xml;q=0.6
 335 |     );
 336 |     $Accept = join(', ', @content_types, '*/*;q=0.5');
 337 |     push(@content_types, 'text/css', 'text/html-sandboxed');
 338 |     my $re = join('|', map { s/;.*//; quotemeta } @content_types);
 339 |     $ContentTypes = qr{\b(?:$re)\b}io;
 340 | 
 341 |     # Regexp for matching URL values in CSS.
 342 |     $CssUrl = qr/url\(\s*(['"]?)(.*?)\1\s*\)/;
 343 | 
 344 |     #
 345 |     # Read configuration.  If the W3C_CHECKLINK_CFG environment variable has
 346 |     # been set or the default contains a non-empty file, read it.  Otherwise,
 347 |     # skip silently.
 348 |     #
 349 |     my $defaultconfig = '/etc/w3c/checklink.conf';
 350 |     if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) {
 351 | 
 352 |         require Config::General;
 353 |         Config::General->require_version(2.06);    # Need 2.06 for -SplitPolicy
 354 | 
 355 |         my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig;
 356 |         eval {
 357 |             my %config_opts = (
 358 |                 -ConfigFile        => $conffile,
 359 |                 -SplitPolicy       => 'equalsign',
 360 |                 -AllowMultiOptions => 'no',
 361 |             );
 362 |             %Cfg = Config::General->new(%config_opts)->getall();
 363 |         };
 364 |         if ($@) {
 365 |             die <<"EOF";
 366 | Failed to read configuration from '$conffile':
 367 | $@
 368 | EOF
 369 |         }
 370 |     }
 371 |     $Cfg{Markup_Validator_URI} ||= 'https://validator.w3.org/check?uri=%s';
 372 |     $Cfg{CSS_Validator_URI} ||=
 373 |         'https://jigsaw.w3.org/css-validator/validator?uri=%s';
 374 |     $Cfg{Doc_URI} ||= 'https://validator.w3.org/checklink/docs/checklink.html';
 375 |     $Cfg{Doc_Base_URI} ||= '/checklink/docs/';
 376 |     $Cfg{Doc_Images_URI} ||= '/checklink/images/';
 377 | 
 378 |     # Untaint config params that are used as the format argument to (s)printf(),
 379 |     # Perl 5.10 does not want to see that in taint mode.
 380 |     ($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/);
 381 |     ($Cfg{CSS_Validator_URI})    = ($Cfg{CSS_Validator_URI}    =~ /^(.*)$/);
 382 | 
 383 |     $DocType =
 384 |         '<!DOCTYPE html>';
 385 |     my $css_url = $Cfg{Doc_Base_URI} . 'linkchecker.css';
 386 |     my $js_url  = $Cfg{Doc_Base_URI} . 'linkchecker.js';
 387 |     $Head =
 388 |         sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url, $js_url);
 389 | <meta http-equiv="Content-Script-Type" content="text/javascript" />
 390 | <meta name="generator" content="%s" />
 391 | <link rel="stylesheet" type="text/css" href="%s" />
 392 | <script type="text/javascript" src="%s"></script>
 393 | EOF
 394 | 
 395 |     # Trusted environment variables that need laundering in taint mode.
 396 |     for (qw(NNTPSERVER NEWSHOST)) {
 397 |         ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_};
 398 |     }
 399 | 
 400 |     # Use passive FTP by default, see Net::FTP(3).
 401 |     $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
 402 | }
 403 | 
 404 | # Autoflush
 405 | $| = 1;
 406 | 
 407 | # Different options specified by the user
 408 | my $cmdline = !($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/);
 409 | my %Opts = (
 410 |     Command_Line    => $cmdline,
 411 |     Quiet           => 0,
 412 |     Summary_Only    => 0,
 413 |     Verbose         => 0,
 414 |     Progress        => 0,
 415 |     HTML            => 0,
 416 |     Timeout         => 30,
 417 |     Redirects       => 1,
 418 |     Dir_Redirects   => 1,
 419 |     Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE},
 420 |     Cookies         => undef,
 421 |     No_Referer      => 0,
 422 |     Hide_Same_Realm => 0,
 423 |     Follow_File_Links        => 0,
 424 |     Depth                    => 0,             # < 0 means unlimited recursion.
 425 |     Sleep_Time               => 1,
 426 |     Connection_Cache_Size    => 2,
 427 |     Max_Documents            => 150,           # For the online version.
 428 |     User                     => undef,
 429 |     Password                 => undef,
 430 |     Base_Locations           => [],
 431 |     Exclude                  => undef,
 432 |     Exclude_Docs             => undef,
 433 |     Suppress_Redirect        => [],
 434 |     Suppress_Redirect_Prefix => [],
 435 |     Suppress_Redirect_Regexp => [],
 436 |     Suppress_Temp_Redirects  => 1,
 437 |     Suppress_Broken          => [],
 438 |     Suppress_Fragment        => [],
 439 |     Masquerade               => 0,
 440 |     Masquerade_From          => '',
 441 |     Masquerade_To            => '',
 442 |     Trusted                  => $Cfg{Trusted},
 443 |     Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ?
 444 |         $Cfg{Allow_Private_IPs} :
 445 |         $cmdline,
 446 | );
 447 | undef $cmdline;
 448 | 
 449 | # Global variables
 450 | # What URI's did we process? (used for recursive mode)
 451 | my %processed;
 452 | 
 453 | # Result of the HTTP query
 454 | my %results;
 455 | 
 456 | # List of redirects
 457 | my %redirects;
 458 | 
 459 | # Count of the number of documents checked
 460 | my $doc_count = 0;
 461 | 
 462 | # Time stamp
 463 | my $timestamp = &get_timestamp();
 464 | 
 465 | # Per-document header; undefined if already printed.  See print_doc_header().
 466 | my $doc_header;
 467 | 
 468 | &parse_arguments() if $Opts{Command_Line};
 469 | 
 470 | my $ua = W3C::UserAgent->new($AGENT);    # @@@ TODO: admin address
 471 | 
 472 | $ua->conn_cache({total_capacity => $Opts{Connection_Cache_Size}});
 473 | if ($ua->can('delay')) {
 474 |     $ua->delay($Opts{Sleep_Time} / 60);
 475 | }
 476 | $ua->timeout($Opts{Timeout});
 477 | 
 478 | # Set up cookie stash if requested
 479 | if (defined($Opts{Cookies})) {
 480 |     require HTTP::Cookies;
 481 |     my $cookie_file = $Opts{Cookies};
 482 |     if ($cookie_file eq 'tmp') {
 483 |         $cookie_file = undef;
 484 |     }
 485 |     elsif ($cookie_file =~ /^(.*)$/) {
 486 |         $cookie_file = $1;    # untaint
 487 |     }
 488 |     $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1));
 489 | }
 490 | eval { $ua->allow_private_ips($Opts{Allow_Private_IPs}); };
 491 | if ($@) {
 492 |     die <<"EOF";
 493 | Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and
 494 | Net::hostent modules:
 495 | $@
 496 | EOF
 497 | }
 498 | 
 499 | # Add configured allowed protocols
 500 | if ($Cfg{Allowed_Protocols}) {
 501 |     my $allowed = $ua->protocols_allowed();
 502 |     push(@$allowed, split(/[,\s]+/, lc($Cfg{Allowed_Protocols})));
 503 |     $ua->protocols_allowed($allowed);
 504 | }
 505 | if ($Opts{Follow_File_Links}) {
 506 |     my $allowed = $ua->protocols_allowed();
 507 |     push(@$allowed, "file");
 508 |     $ua->protocols_allowed($allowed);
 509 | }
 510 | 
 511 | if ($Opts{Command_Line}) {
 512 | 
 513 |     require Text::Wrap;
 514 |     Text::Wrap->import('wrap');
 515 | 
 516 |     require URI::file;
 517 | 
 518 |     &usage(1) unless scalar(@ARGV);
 519 | 
 520 |     $Opts{_Self_URI} = 'https://validator.w3.org/checklink';   # For HTML output
 521 | 
 522 |     &ask_password() if ($Opts{User} && !$Opts{Password});
 523 | 
 524 |     if (!$Opts{Summary_Only}) {
 525 |         printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML};
 526 |     }
 527 |     else {
 528 |         $Opts{Verbose}  = 0;
 529 |         $Opts{Progress} = 0;
 530 |     }
 531 | 
 532 |     # Populate data for print_form()
 533 |     my %params = (
 534 |         summary            => $Opts{Summary_Only},
 535 |         hide_redirects     => !$Opts{Redirects},
 536 |         hide_type          => $Opts{Dir_Redirects} ? 'dir' : 'all',
 537 |         no_accept_language => !(
 538 |             defined($Opts{Accept_Language}) && $Opts{Accept_Language} eq 'auto'
 539 |         ),
 540 |         no_referer => $Opts{No_Referer},
 541 |         recursive  => ($Opts{Depth} != 0),
 542 |         depth      => $Opts{Depth},
 543 |     );
 544 | 
 545 |     my $check_num = 1;
 546 |     my $has_error = 0;
 547 |     my @bases     = @{$Opts{Base_Locations}};
 548 |     for my $uri (@ARGV) {
 549 | 
 550 |         # Reset base locations so that previous URI's given on the command line
 551 |         # won't affect the recursion scope for this URI (see check_uri())
 552 |         @{$Opts{Base_Locations}} = @bases;
 553 | 
 554 |         # Transform the parameter into a URI
 555 |         $uri = &urize($uri);
 556 |         $params{uri} = $uri;
 557 |         $has_error += &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1);
 558 |         $check_num++;
 559 |     }
 560 |     undef $check_num;
 561 | 
 562 |     if ($Opts{HTML}) {
 563 |         &html_footer();
 564 |     }
 565 |     elsif ($doc_count > 0 && !$Opts{Summary_Only}) {
 566 |         printf("\n%s\n", &global_stats());
 567 |     }
 568 |     exit ($has_error ? 64 : 0);
 569 | }
 570 | else {
 571 | 
 572 |     require CGI;
 573 |     require CGI::Carp;
 574 |     CGI::Carp->import(qw(fatalsToBrowser));
 575 |     require CGI::Cookie;
 576 | 
 577 |     my $query = CGI->new();
 578 | 
 579 |     for my $param ($query->param()) {
 580 |         my @values = map { Encode::decode_utf8($_) } $query->param($param);
 581 |         $query->param($param, @values);
 582 |     }
 583 | 
 584 |     # Set a few parameters in CGI mode
 585 |     $Opts{Verbose}   = 0;
 586 |     $Opts{Progress}  = 0;
 587 |     $Opts{HTML}      = 1;
 588 |     $Opts{_Self_URI} = $query->url(-relative => 1);
 589 | 
 590 |     # Backwards compatibility
 591 |     my $uri = undef;
 592 |     if ($uri = $query->param('url')) {
 593 |         $query->param('uri', $uri) unless $query->param('uri');
 594 |         $query->delete('url');
 595 |     }
 596 |     $uri = $query->param('uri');
 597 | 
 598 |     if (!$uri) {
 599 |         &html_header('', undef);    # Set cookie only from results page.
 600 |         my %cookies = CGI::Cookie->fetch();
 601 |         &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1);
 602 |         &html_footer();
 603 |         exit;
 604 |     }
 605 | 
 606 |     # Backwards compatibility
 607 |     if ($query->param('hide_dir_redirects')) {
 608 |         $query->param('hide_redirects', 'on');
 609 |         $query->param('hide_type',      'dir');
 610 |         $query->delete('hide_dir_redirects');
 611 |     }
 612 | 
 613 |     $Opts{Summary_Only} = 1 if $query->param('summary');
 614 | 
 615 |     if ($query->param('hide_redirects')) {
 616 |         $Opts{Dir_Redirects} = 0;
 617 |         if (my $type = $query->param('hide_type')) {
 618 |             $Opts{Redirects} = 0 if ($type ne 'dir');
 619 |         }
 620 |         else {
 621 |             $Opts{Redirects} = 0;
 622 |         }
 623 |     }
 624 | 
 625 |     $Opts{Accept_Language} = undef if $query->param('no_accept_language');
 626 |     $Opts{No_Referer} = $query->param('no_referer');
 627 | 
 628 |     $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0);
 629 |     if (my $depth = $query->param('depth')) {
 630 | 
 631 |         # @@@ Ignore invalid depth silently for now.
 632 |         $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/);
 633 |     }
 634 | 
 635 |     # Save, clear or leave cookie as is.
 636 |     my $cookie = undef;
 637 |     if (my $action = $query->param('cookie')) {
 638 |         if ($action eq 'clear') {
 639 | 
 640 |             # Clear the cookie.
 641 |             $cookie = CGI::Cookie->new(-name => $PROGRAM,-value => $PROGRAM);
 642 |             $cookie->value({clear => 1});
 643 |             $cookie->expires('-1M');
 644 |         }
 645 |         elsif ($action eq 'set') {
 646 | 
 647 |             # Set the options.
 648 |             $cookie = CGI::Cookie->new(-name => $PROGRAM,-value => $PROGRAM);
 649 |             my %options = $query->Vars();
 650 |             delete($options{$_})
 651 |                 for qw(url uri check cookie);    # Non-persistent.
 652 |             $cookie->value(\%options);
 653 |         }
 654 |     }
 655 |     if (!$cookie) {
 656 |         my %cookies = CGI::Cookie->fetch();
 657 |         $cookie = $cookies{$PROGRAM};
 658 |     }
 659 | 
 660 |     # Always refresh cookie expiration time.
 661 |     $cookie->expires('+1M') if ($cookie && !$cookie->expires());
 662 | 
 663 |     # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
 664 |     # If we're under mod_perl, there is a way around it...
 665 |     eval {
 666 |         local $SIG{__DIE__} = undef;
 667 |         my $auth =
 668 |             Apache2::RequestUtil->request()->headers_in()->{Authorization};
 669 |         $ENV{HTTP_AUTHORIZATION} = $auth if $auth;
 670 |     } if (MP2() && !$ENV{HTTP_AUTHORIZATION});
 671 | 
 672 |     $uri =~ s/^\s+//g;
 673 |     if ($uri =~ /:/) {
 674 |         $uri = URI->new($uri);
 675 |     }
 676 |     else {
 677 |         if ($uri =~ m|^//|) {
 678 |             $uri = URI->new("http:$uri");
 679 |         }
 680 |         else {
 681 |             local $ENV{URL_GUESS_PATTERN} = '';
 682 |             my $guess = URI::Heuristic::uf_uri($uri);
 683 |             if ($guess->scheme() && $ua->is_protocol_supported($guess)) {
 684 |                 $uri = $guess;
 685 |             }
 686 |             else {
 687 |                 $uri = URI->new("http://$uri");
 688 |             }
 689 |         }
 690 |     }
 691 |     $uri = $uri->canonical();
 692 |     $query->param("uri", $uri);
 693 | 
 694 |     &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie);
 695 |     undef $query;    # Not needed any more.
 696 |     &html_footer();
 697 | }
 698 | 
 699 | ###############################################################################
 700 | 
 701 | ################################
 702 | # Command line and usage stuff #
 703 | ################################
 704 | 
 705 | sub parse_arguments ()
 706 | {
 707 |     require Encode::Locale;
 708 |     Encode::Locale::decode_argv();
 709 | 
 710 |     require Getopt::Long;
 711 |     Getopt::Long->require_version(2.17);
 712 |     Getopt::Long->import('GetOptions');
 713 |     Getopt::Long::Configure('bundling', 'no_ignore_case');
 714 |     my $masq = '';
 715 |     my @locs = ();
 716 | 
 717 |     GetOptions(
 718 |         'help|h|?' => sub { usage(0) },
 719 |         'q|quiet'  => sub {
 720 |             $Opts{Quiet}        = 1;
 721 |             $Opts{Summary_Only} = 1;
 722 |         },
 723 |         's|summary' => \$Opts{Summary_Only},
 724 |         'b|broken'  => sub {
 725 |             $Opts{Redirects}     = 0;
 726 |             $Opts{Dir_Redirects} = 0;
 727 |         },
 728 |         'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; },
 729 |         'v|verbose'       => \$Opts{Verbose},
 730 |         'i|indicator'     => \$Opts{Progress},
 731 |         'H|html'          => \$Opts{HTML},
 732 |         'r|recursive'     => sub {
 733 |             $Opts{Depth} = -1
 734 |                 if $Opts{Depth} == 0;
 735 |         },
 736 |         'l|location=s'                => \@locs,
 737 |         'X|exclude=s@'                => \@{$Opts{Exclude}},
 738 |         'exclude-docs=s@'             => \@{$Opts{Exclude_Docs}},
 739 |         'follow-file-links'           => \$Opts{Follow_File_Links},
 740 |         'https'                       => \$Opts{Https},
 741 |         'suppress-redirect=s@'        => \@{$Opts{Suppress_Redirect}},
 742 |         'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}},
 743 |         'suppress-temp-redirects'     => \$Opts{Suppress_Temp_Redirects},
 744 |         'suppress-broken=s@'          => \@{$Opts{Suppress_Broken}},
 745 |         'suppress-fragment=s@'        => \@{$Opts{Suppress_Fragment}},
 746 |         'u|user=s'                    => \$Opts{User},
 747 |         'p|password=s'                => \$Opts{Password},
 748 |         't|timeout=i'                 => \$Opts{Timeout},
 749 |         'C|connection-cache=i'        => \$Opts{Connection_Cache_Size},
 750 |         'S|sleep=i'                   => \$Opts{Sleep_Time},
 751 |         'L|languages=s'               => \$Opts{Accept_Language},
 752 |         'c|cookies=s'                 => \$Opts{Cookies},
 753 |         'R|no-referer'                => \$Opts{No_Referer},
 754 |         'D|depth=i'                   => sub {
 755 |             $Opts{Depth} = $_[1]
 756 |                 unless $_[1] == 0;
 757 |         },
 758 |         'd|domain=s'      => \$Opts{Trusted},
 759 |         'masquerade=s'    => \$masq,
 760 |         'hide-same-realm' => \$Opts{Hide_Same_Realm},
 761 |         'V|version'       => \&version,
 762 |         ) ||
 763 |         usage(1);
 764 | 
 765 |     if ($masq) {
 766 |         $Opts{Masquerade} = 1;
 767 |         my @masq = split(/\s+/, $masq);
 768 |         if (scalar(@masq) != 2 ||
 769 |             !defined($masq[0]) ||
 770 |             $masq[0] !~ /\S/ ||
 771 |             !defined($masq[1]) ||
 772 |             $masq[1] !~ /\S/)
 773 |         {
 774 |             usage(1,
 775 |                 "Error: --masquerade takes two whitespace separated URIs.");
 776 |         }
 777 |         else {
 778 |             require URI::file;
 779 |             $Opts{Masquerade_From} = $masq[0];
 780 |             my $u = URI->new($masq[1]);
 781 |             $Opts{Masquerade_To} =
 782 |                 $u->scheme() ? $u : URI::file->new_abs($masq[1]);
 783 |         }
 784 |     }
 785 | 
 786 |     if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') {
 787 |         $Opts{Accept_Language} = &guess_language();
 788 |     }
 789 | 
 790 |     if (($Opts{Sleep_Time} || 0) < 1) {
 791 |         warn(
 792 |             "*** Warning: minimum allowed sleep time is 1 second, resetting.\n"
 793 |         );
 794 |         $Opts{Sleep_Time} = 1;
 795 |     }
 796 | 
 797 |     push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs);
 798 | 
 799 |     $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs);
 800 | 
 801 |     # Precompile/error-check regular expressions.
 802 |     for my $i (0 .. $#{$Opts{Exclude}}) {
 803 |         eval { $Opts{Exclude}->[$i] = qr/$Opts{Exclude}->[$i]/; };
 804 |         &usage(1, "Error in exclude regexp: $@") if $@;
 805 |     }
 806 |     for my $i (0 .. $#{$Opts{Exclude_Docs}}) {
 807 |         eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; };
 808 |         &usage(1, "Error in exclude-docs regexp: $@") if $@;
 809 |     }
 810 |     if (defined($Opts{Trusted})) {
 811 |         eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; };
 812 |         &usage(1, "Error in trusted domains regexp: $@") if $@;
 813 |     }
 814 | 
 815 |     # Sanity-check error-suppression arguments
 816 |     for my $i (0 .. $#{$Opts{Suppress_Redirect}}) {
 817 |         ${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/;
 818 |         my $sr_arg = ${$Opts{Suppress_Redirect}}[$i];
 819 |         if ($sr_arg !~ /.->./) {
 820 |             &usage(1,
 821 |                 "Bad suppress-redirect argument, should contain \"->\": $sr_arg"
 822 |             );
 823 |         }
 824 |     }
 825 |     for my $i (0 .. $#{$Opts{Suppress_Redirect_Prefix}}) {
 826 |         my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i];
 827 |         $srp_arg =~ s/ /->/;
 828 |         if ($srp_arg !~ /^(.*)->(.*)$/) {
 829 |             &usage(1,
 830 |                 "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg"
 831 |             );
 832 |         }
 833 | 
 834 |         # Turn prefixes into a regexp.
 835 |         ${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism;
 836 |     }
 837 |     for my $i (0 .. $#{$Opts{Suppress_Broken}}) {
 838 |         ${$Opts{Suppress_Broken}}[$i] =~ s/ /:/;
 839 |         my $sb_arg = ${$Opts{Suppress_Broken}}[$i];
 840 |         if ($sb_arg !~ /^(-1|[0-9]+)(:.)?/) {
 841 |             &usage(1,
 842 |                 "Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg"
 843 |             );
 844 |         }
 845 |     }
 846 |     for my $sf_arg (@{$Opts{Suppress_Fragment}}) {
 847 |         if ($sf_arg !~ /.#./) {
 848 |             &usage(1,
 849 |                 "Bad suppress-fragment argument, should contain \"#\": $sf_arg"
 850 |             );
 851 |         }
 852 |     }
 853 | 
 854 |     return;
 855 | }
 856 | 
 857 | sub version ()
 858 | {
 859 |     print "$PACKAGE $REVISION\n";
 860 |     exit 0;
 861 | }
 862 | 
 863 | sub usage ()
 864 | {
 865 |     my ($exitval, $msg) = @_;
 866 |     $exitval = 0 unless defined($exitval);
 867 |     $msg ||= '';
 868 |     $msg =~ s/[\r\n]*$/\n\n/ if $msg;
 869 | 
 870 |     die($msg) unless $Opts{Command_Line};
 871 | 
 872 |     my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only';
 873 | 
 874 |     select(STDERR) if $exitval;
 875 |     print "$msg$PACKAGE $REVISION
 876 | 
 877 | Usage: checklink <options> <uris>
 878 | Options:
 879 |  -s, --summary              Result summary only.
 880 |  -b, --broken               Show only the broken links, not the redirects.
 881 |  -e, --dir-redirects        Hide directory redirects, for example
 882 |                             https://www.w3.org/TR -> https://www.w3.org/TR/
 883 |  -r, --recursive            Check the documents linked from the first one.
 884 |  -D, --depth N              Check the documents linked from the first one to
 885 |                             depth N (implies --recursive).
 886 |  -l, --location URI         Scope of the documents checked in recursive mode
 887 |                             (implies --recursive).  Can be specified multiple
 888 |                             times.  If not specified, the default eg. for
 889 |                             https://www.w3.org/TR/html4/Overview.html
 890 |                             would be https://www.w3.org/TR/html4/
 891 |  -X, --exclude REGEXP       Do not check links whose full, canonical URIs
 892 |                             match REGEXP; also limits recursion the same way
 893 |                             as --exclude-docs with the same regexp would.
 894 |                             This option may be specified multiple times.
 895 |  --exclude-docs REGEXP      In recursive mode, do not check links in documents
 896 |                             whose full, canonical URIs match REGEXP.  This
 897 |                             option may be specified multiple times.
 898 |  --suppress-redirect URI->URI  Do not report a redirect from the first to the
 899 |                             second URI.  This option may be specified multiple
 900 |                             times.
 901 |  --suppress-redirect-prefix URI->URI  Do not report a redirect from a child of
 902 |                             the first URI to the same child of the second URI.
 903 |                             This option may be specified multiple times.
 904 |  --suppress-temp-redirects  Suppress warnings about temporary redirects.
 905 |  --suppress-broken CODE(:URI)  Do not report a broken link with the given CODE.
 906 |                             CODE is HTTP response, or -1 for robots exclusion.
 907 |                             This option may be specified multiple times, and
 908 |                             can optionally be limited to a single pair of CODE /
 909 |                             URI.
 910 |  --suppress-fragment URI    Do not report the given broken fragment URI.
 911 |                             A fragment URI contains \"#\".  This option may be
 912 |                             specified multiple times.
 913 |  --follow-file-links        When encountering a file: URI, try to follow it
 914 |                             (disabled by default for security reasons)
 915 |  -L, --languages LANGS      Accept-Language header to send.  The special value
 916 |                             'auto' causes autodetection from the environment.
 917 |  -c, --cookies FILE         Use cookies, load/save them in FILE.  The special
 918 |                             value 'tmp' causes non-persistent use of cookies.
 919 |  -R, --no-referer           Do not send the Referer HTTP header.
 920 |  -q, --quiet                No output if no errors are found (implies -s).
 921 |  -v, --verbose              Verbose mode.
 922 |  -i, --indicator            Show percentage of lines processed while parsing.
 923 |  -u, --user USERNAME        Specify a username for authentication.
 924 |  -p, --password PASSWORD    Specify a password.
 925 |  --hide-same-realm          Hide 401's that are in the same realm as the
 926 |                             document checked.
 927 |  -S, --sleep SECS           Sleep SECS seconds between requests to each server
 928 |                             (default and minimum: 1 second).
 929 |  -t, --timeout SECS         Timeout for requests in seconds (default: 30).
 930 |  -d, --domain DOMAIN        Regular expression describing the domain to which
 931 |                             authentication information will be sent
 932 |                             (default: $trust).
 933 |  --masquerade \"BASE1 BASE2\" Masquerade base URI BASE1 as BASE2.  See the
 934 |                             manual page for more information.
 935 |  --https                    Check the file as if it was served on https://
 936 |  -H, --html                 HTML output.
 937 |  -?, -h, --help             Show this message and exit.
 938 |  -V, --version              Output version information and exit.
 939 | 
 940 | See \"perldoc LWP\" for information about proxy server support,
 941 | \"perldoc Net::FTP\" for information about various environment variables
 942 | affecting FTP connections and \"perldoc Net::NNTP\" for setting a default
 943 | NNTP server for news: URIs.
 944 | 
 945 | The W3C_CHECKLINK_CFG environment variable can be used to set the
 946 | configuration file to use.  See details in the full manual page, it can
 947 | be displayed with: perldoc checklink
 948 | 
 949 | More documentation at: $Cfg{Doc_URI}
 950 | Please file bug reports and comments at https://github.com/w3c/link-checker/issues
 951 | ";
 952 |     exit $exitval;
 953 | }
 954 | 
 955 | sub ask_password ()
 956 | {
 957 |     eval {
 958 |         local $SIG{__DIE__} = undef;
 959 |         require Term::ReadKey;
 960 |         Term::ReadKey->require_version(2.00);
 961 |         Term::ReadKey->import(qw(ReadMode));
 962 |     };
 963 |     if ($@) {
 964 |         warn('Warning: Term::ReadKey 2.00 or newer not available, ' .
 965 |                 "password input disabled.\n");
 966 |         return;
 967 |     }
 968 |     printf(STDERR 'Enter the password for user %s: ', $Opts{User});
 969 |     ReadMode('noecho', *STDIN);
 970 |     chomp($Opts{Password} = <STDIN>);
 971 |     ReadMode('restore', *STDIN);
 972 |     print(STDERR "ok.\n");
 973 |     return;
 974 | }
 975 | 
 976 | ###############################################################################
 977 | 
 978 | ###########################################################################
 979 | # Guess an Accept-Language header based on the $LANG environment variable #
 980 | ###########################################################################
 981 | 
 982 | sub guess_language ()
 983 | {
 984 |     my $lang = $ENV{LANG} or return;
 985 | 
 986 |     $lang =~ s/[\.@].*$//;    # en_US.UTF-8, fi_FI@euro...
 987 | 
 988 |     return 'en' if ($lang eq 'C' || $lang eq 'POSIX');
 989 | 
 990 |     my $res = undef;
 991 |     eval {
 992 |         require Locale::Language;
 993 |         if (my $tmp = Locale::Language::language2code($lang)) {
 994 |             $lang = $tmp;
 995 |         }
 996 |         if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) {
 997 |             if (Locale::Language::code2language($l)) {
 998 |                 $res = $l;
 999 |                 if ($c) {
1000 |                     require Locale::Country;
1001 |                     $res .= "-$c" if Locale::Country::code2country($c);
1002 |                 }
1003 |             }
1004 |         }
1005 |     };
1006 |     return $res;
1007 | }
1008 | 
1009 | ############################
1010 | # Transform foo into a URI #
1011 | ############################
1012 | 
1013 | sub urize ($)
1014 | {
1015 |     my $arg  = shift;
1016 |     my $uarg = URI::Escape::uri_unescape($arg);
1017 |     my $uri;
1018 |     if (-d $uarg) {
1019 | 
1020 |         # look for an "index" file in dir, return it if found
1021 |         require File::Spec;
1022 |         for my $index (map { File::Spec->catfile($uarg, $_) }
1023 |             qw(index.html index.xhtml index.htm index.xhtm))
1024 |         {
1025 |             if (-e $index) {
1026 |                 $uri = URI::file->new_abs($index);
1027 |                 last;
1028 |             }
1029 |         }
1030 | 
1031 |         # return dir itself if an index file was not found
1032 |         $uri ||= URI::file->new_abs($uarg);
1033 |     }
1034 |     elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) {
1035 |         $uri = URI::file->new_abs($uarg);
1036 |     }
1037 |     else {
1038 |         my $newuri = URI->new($arg);
1039 |         if ($newuri->scheme()) {
1040 |             $uri = $newuri;
1041 |         }
1042 |         else {
1043 |             local $ENV{URL_GUESS_PATTERN} = '';
1044 |             $uri = URI::Heuristic::uf_uri($arg);
1045 |             $uri = URI::file->new_abs($uri) unless $uri->scheme();
1046 |         }
1047 |     }
1048 |     return $uri->canonical();
1049 | }
1050 | 
1051 | sub should_exclude ($)
1052 | {
1053 |     my ($canon_uri) = @_;
1054 |     for my $ex (@{$Opts{Exclude}}) {
1055 |         return 1 if ($canon_uri =~ $ex);
1056 |     }
1057 |     return 0;
1058 | }
1059 | 
1060 | ########################################
1061 | # Check for broken links in a resource #
1062 | ########################################
1063 | 
1064 | sub check_uri (\%\$$$$;\$$)
1065 | {
1066 |     my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_;
1067 |     $is_start ||= ($check_num == 1);
1068 | 
1069 |     my $start = $Opts{Summary_Only} ? 0 : &get_timestamp();
1070 |     my $error_found = 0;
1071 | 
1072 |     # Get and parse the document
1073 |     my $response = &get_document(
1074 |         'GET',   $uri,    $doc_count, \%redirects, $referer,
1075 |         $cookie, $params, $check_num, $is_start
1076 |     );
1077 | 
1078 |     # Can we check the resource? If not, we exit here...
1079 |     return 0 if defined($response->{Stop});
1080 | 
1081 |     if ($Opts{HTML}) {
1082 |         &html_header($uri, $cookie) if ($check_num == 1);
1083 |         &print_form($params, $cookie, $check_num) if $is_start;
1084 |     }
1085 | 
1086 |     if ($is_start) { # Starting point of a new check, eg. from the command line
1087 |           # Use the first URI as the recursion base unless specified otherwise.
1088 |         push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical())
1089 |             unless @{$Opts{Base_Locations}};
1090 |     }
1091 |     else {
1092 | 
1093 |         # Before fetching the document, we don't know if we'll be within the
1094 |         # recursion scope or not (think redirects).
1095 |         if (!&in_recursion_scope($response->{absolute_uri})) {
1096 |             hprintf("Not in recursion scope: %s\n")
1097 |                 if ($Opts{Verbose});
1098 |             $response->content("");
1099 |             return $error_found;
1100 |         }
1101 |     }
1102 | 
1103 |     # Define the document header, and perhaps print it.
1104 |     # (It might still be defined if the previous document had no errors;
1105 |     # just redefine it in that case.)
1106 | 
1107 |     if ($check_num != 1) {
1108 |         if ($Opts{HTML}) {
1109 |             $doc_header = "\n</section>\n";
1110 |         }
1111 |         else {
1112 |             $doc_header = "\n" . ('-' x 40) . "\n";
1113 |         }
1114 |     }
1115 | 
1116 |     if ($Opts{HTML}) {
1117 |         $doc_header .=
1118 |             ("<section data-url-checked=\"" . &encode($response->{absolute_uri}) . "\">\n\t<h2>\nProcessing\t" . &show_url($response->{absolute_uri}) .
1119 |                 "\n</h2>\n\n");
1120 |     }
1121 |     else {
1122 |         $doc_header .= "\nProcessing\t$response->{absolute_uri}\n\n";
1123 |     }
1124 | 
1125 |     if (!$Opts{Quiet}) {
1126 |         print_doc_header();
1127 |     }
1128 | 
1129 |     # We are checking a new document
1130 |     $doc_count++;
1131 | 
1132 |     my $result_anchor = 'results' . $doc_count;
1133 | 
1134 |     if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) {
1135 |         my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
1136 |         my $acclang = $Opts{Accept_Language} || '(not sent)';
1137 |         my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
1138 |         my $cookies = 'not used';
1139 |         if (defined($Opts{Cookies})) {
1140 |             $cookies = 'used, ';
1141 |             if ($Opts{Cookies} eq 'tmp') {
1142 |                 $cookies .= 'non-persistent';
1143 |             }
1144 |             else {
1145 |                 $cookies .= "file $Opts{Cookies}";
1146 |             }
1147 |         }
1148 |         printf(
1149 |             <<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s);
1150 | 
1151 | Settings used:
1152 | - Accept: %s
1153 | - Accept-Language: %s
1154 | - Referer: %s
1155 | - Cookies: %s
1156 | - Upgrade-Insecure-Requets: 1
1157 | - Sleeping %d second%s between requests to each server
1158 | EOF
1159 |         printf("- Excluding links matching %s\n",
1160 |             join(', ', @{$Opts{Exclude}}))
1161 |             if @{$Opts{Exclude}};
1162 |         printf("- Excluding links in documents whose URIs match %s\n",
1163 |             join(', ', @{$Opts{Exclude_Docs}}))
1164 |             if @{$Opts{Exclude_Docs}};
1165 |     }
1166 | 
1167 |     if ($Opts{HTML}) {
1168 |         if (!$Opts{Summary_Only}) {
1169 |             my $accept       = &encode($Accept);
1170 |             my $acclang      = &encode($Opts{Accept_Language} || '(not sent)');
1171 |             my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
1172 |             my $s            = $Opts{Sleep_Time} == 1 ? '' : 's';
1173 |             printf(
1174 |                 <<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);
1175 | <div class="settings">
1176 | Settings used:
1177 |  <ul>
1178 |   <li><code><a href="https://www.rfc-editor.org/rfc/rfc9110#field.accept">Accept</a></code>: %s</li>
1179 |   <li><code><a href="https://www.rfc-editor.org/rfc/rfc9110#field.accept-language">Accept-Language</a></code>: %s</li>
1180 |   <li><code><a href="https://www.rfc-editor.org/rfc/rfc9110#field.referer">Referer</a></code>: %s</li>
1181 |   <li>Sleeping %d second%s between requests to each server</li>
1182 |  </ul>
1183 | </div>
1184 | EOF
1185 |             printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
1186 |                 $result_anchor);
1187 |             my $esc_uri = URI::Escape::uri_escape($response->{absolute_uri},
1188 |                 "^A-Za-z0-9.");
1189 |             print "<p>For reliable link checking results, check ";
1190 | 
1191 |             if (!$response->{IsCss}) {
1192 |                 printf("<a href=\"%s\">HTML validity</a> and ",
1193 |                     &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)));
1194 |             }
1195 |             printf(
1196 |                 "<a href=\"%s\">CSS validity</a> first.</p>
1197 | <p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n",
1198 |                 &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
1199 |                 &encode($Opts{_Self_URI})
1200 |             );
1201 | 
1202 |             printf(<<'EOF', $result_anchor);
1203 | <div class="progress" id="progress%s">
1204 | <h3>Status: <span></span></h3>
1205 | <label>
1206 |     <span></span>
1207 |     <progress value="0" max="100"></progress>
1208 | </label>
1209 | <pre>
1210 | EOF
1211 |         }
1212 |     }
1213 | 
1214 |     if ($Opts{Summary_Only} && !$Opts{Quiet}) {
1215 |         print '<p>' if $Opts{HTML};
1216 |         print 'This may take some time';
1217 |         print "... (<a href=\"$Cfg{Doc_URI}#wait\">why?</a>)</p>"
1218 |             if $Opts{HTML};
1219 |         print " if the document has many links to check.\n" unless $Opts{HTML};
1220 |     }
1221 | 
1222 |     # Record that we have processed this resource
1223 |     $processed{$response->{absolute_uri}} = 1;
1224 | 
1225 |     # Parse the document
1226 |     my $p =
1227 |         &parse_document($uri, $response->base(), $response, 1, ($depth != 0));
1228 |     my $base = URI->new($p->{base});
1229 |     if (!defined($base->scheme)) {
1230 |         $base = URI->new_abs($p->{base}, $uri);
1231 |     }
1232 |     my $check_mixed_content = ($base->scheme eq 'https' || $Opts{Https});
1233 | 
1234 |     # Check anchors
1235 |     ###############
1236 | 
1237 |     print "Checking anchors...\n" unless $Opts{Summary_Only};
1238 | 
1239 |     my %errors;
1240 |     while (my ($anchor, $lines) = each(%{$p->{Anchors}})) {
1241 |         if (!length($anchor)) {
1242 | 
1243 |             # Empty IDREF's are not allowed
1244 |             $errors{$anchor} = 1;
1245 |         }
1246 |         else {
1247 |             my $times = 0;
1248 |             $times += $_ for values(%$lines);
1249 | 
1250 |             # They should appear only once
1251 |             $errors{$anchor} = 1 if ($times > 1);
1252 |         }
1253 |     }
1254 |     print " done.\n" unless $Opts{Summary_Only};
1255 | 
1256 |     # Check links
1257 |     #############
1258 | 
1259 |     &hprintf("Recording all the links found: %d\n",
1260 |         scalar(keys %{$p->{Links}}))
1261 |         if ($Opts{Verbose});
1262 |     my %links;
1263 |     my %hostlinks;
1264 | 
1265 |     # Record all the links found
1266 |     while (my ($link, $lines) = each(%{$p->{Links}})) {
1267 |         #my @hash = values(%$lines);
1268 |         #print join(',', map { keys(%$_) } @hash);
1269 |         #print $link, $mixedcontent;
1270 |         my $link_uri = URI->new($link);
1271 |         my $abs_link_uri = URI->new_abs($link_uri, $base);
1272 | 
1273 |         if ($Opts{Masquerade}) {
1274 |             if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) {
1275 |                 print_doc_header();
1276 |                 printf("processing %s in base %s\n",
1277 |                     $abs_link_uri, $Opts{Masquerade_To});
1278 |                 my $nlink = $abs_link_uri;
1279 |                 $nlink =~ s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|;
1280 |                 $abs_link_uri = URI->new($nlink);
1281 |             }
1282 |         }
1283 | 
1284 |         my $canon_uri = URI->new($abs_link_uri->canonical());
1285 |         my $fragment  = $canon_uri->fragment(undef);
1286 |         if (!should_exclude($canon_uri)) {
1287 |             if (!exists($links{$canon_uri})) {
1288 |                 my $hostport;
1289 |                 $hostport = $canon_uri->host_port()
1290 |                     if $canon_uri->can('host_port');
1291 |                 $hostport = '' unless defined $hostport;
1292 |                 push(@{$hostlinks{$hostport}}, $canon_uri);
1293 |             }
1294 |             for my $line_num (keys(%{$lines})) {
1295 |                 my @mixedcontent = keys(%{%$lines{$line_num}});
1296 |                 my $mc_block = MC_BLOCK;
1297 |                 if (grep(/^$mc_block$/, @mixedcontent)) {
1298 | 
1299 |                     push(@{$links{$canon_uri}{mixedcontent}}, $line_num);
1300 |                 }
1301 |                 # take optional into account if CSP: block-all-mixed-content is set
1302 |                 my $mc_optional = MC_OPTIONAL;
1303 |                 if ($p->{csp} && grep(qr/^block-all-mixed-content$/, $p->{csp})
1304 |                     && grep(/^$mc_optional$/, @mixedcontent)) {
1305 |                     push(@{$links{$canon_uri}{mixedcontent}}, $line_num);
1306 |                 }
1307 |                 if (!defined($fragment) || !length($fragment)) {
1308 | 
1309 |                     # Document without fragment
1310 |                     $links{$canon_uri}{location}{$line_num} = @mixedcontent;
1311 |                 }
1312 |                 else {
1313 | 
1314 |                     # Resource with a fragment
1315 |                     if (! grep { $_ eq "$canon_uri#$fragment" } @{$Opts{Suppress_Fragment}}) {
1316 |                         $links{$canon_uri}{fragments}{$fragment}{$line_num} = @mixedcontent;
1317 |                     }
1318 |                 }
1319 |             }
1320 |         }
1321 |     }
1322 | 
1323 |     my @order = &distribute_links(\%hostlinks);
1324 |     undef %hostlinks;
1325 | 
1326 |     # Build the list of broken URI's
1327 | 
1328 |     my $nlinks = scalar(@order);
1329 | 
1330 |     &hprintf("Checking %d links to build list of broken URI's\n", $nlinks)
1331 |         if ($Opts{Verbose});
1332 | 
1333 |     my %broken;
1334 |     my %blocked;
1335 |     my $link_num = 0;
1336 |     for my $u (@order) {
1337 |         my $ulinks = $links{$u};
1338 | 
1339 |         if ($Opts{Summary_Only}) {
1340 | 
1341 |             # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896
1342 |             print ' ' if ($Opts{HTML} && !$Opts{Command_Line});
1343 |         }
1344 |         else {
1345 |             &hprintf("\nChecking link %s\n", $u);
1346 |             my $progress = ($link_num / $nlinks) * 100;
1347 |             printf(
1348 |                 '<script type="text/javascript">show_progress("%s", "Checking link %s", "%.1f%%");</script>',
1349 |                 $result_anchor, &encode($u), $progress)
1350 |                 if (!$Opts{Command_Line} &&
1351 |                 $Opts{HTML} &&
1352 |                 !$Opts{Summary_Only});
1353 |         }
1354 |         $link_num++;
1355 | 
1356 |         # Check that a link is valid
1357 |         my $mixedcontent_lines = $check_mixed_content && %$ulinks{"mixedcontent"} ? %$ulinks{"mixedcontent"} : ();
1358 |         &check_validity($uri, $u, ($depth != 0 && &in_recursion_scope($u)),
1359 |             \%links, \%redirects, $mixedcontent_lines, $p->{csp});
1360 |         &hprintf("\tReturn code: %s\n", $results{$u}{location}{code})
1361 |             if ($Opts{Verbose});
1362 |         if ($results{$u}{location}{success}) {
1363 | 
1364 |             # Even though it was not broken, we might want to display it
1365 |             # on the results page (e.g. because it required authentication)
1366 |             $broken{$u}{location} = 1
1367 |                 if ($results{$u}{location}{display} >= 400);
1368 | 
1369 |             # List the broken fragments
1370 |             while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) {
1371 |                 my $fragment_broken = $results{$u}{fragments}{$fragment} == 0
1372 |                 # 100 matches what we get for ignored error codes
1373 |                 || $results{$u}{location}{code} == 100;
1374 |                 if ($Opts{Verbose}) {
1375 |                     my @line_nums = sort { $a <=> $b } keys(%$lines);
1376 |                     &hprintf(
1377 |                         "\t\t%s %s - Line%s: %s\n",
1378 |                         $fragment,
1379 |                         $fragment_broken             ? 'Not found' : ( $results{$u}{fragments}{$fragment} > 0 ? 'OK' : 'Not checked'),
1380 |                         (scalar(@line_nums) > 1) ? 's'  : '',
1381 |                         join(', ', @line_nums)
1382 |                     );
1383 |                 }
1384 | 
1385 |                 # A broken fragment?
1386 |                 $broken{$u}{fragments}{$fragment} += 2 if $fragment_broken;
1387 |             }
1388 |         }
1389 |         elsif (!($Opts{Quiet} && defined($results{$u}{location}{code}) && &informational($results{$u}{location}{code})))
1390 |         {
1391 | 
1392 |             # Couldn't find the document
1393 |             $broken{$u}{location} = 1;
1394 | 
1395 |             # All the fragments associated are hence broken
1396 |             for my $fragment (keys %{$ulinks->{fragments}}) {
1397 |                 $broken{$u}{fragments}{$fragment}++;
1398 |             }
1399 |         }
1400 |         if ($results{$u}{mixedcontent_blocked}) {
1401 |             $blocked{$u} = $results{$u}{mixedcontent_blocked};
1402 |         }
1403 |     }
1404 |     &hprintf(
1405 |         "\nProcessed in %s seconds.\n",
1406 |         &time_diff($start, &get_timestamp())
1407 |     ) unless $Opts{Summary_Only};
1408 |     printf(
1409 |         '<script type="text/javascript">show_progress("%s", "Done. Document processed in %s seconds.", "100%%");</script>',
1410 |         $result_anchor, &time_diff($start, &get_timestamp()))
1411 |         if ($Opts{HTML} && !$Opts{Summary_Only});
1412 | 
1413 |     # Display results
1414 |     if ($Opts{HTML} && !$Opts{Summary_Only}) {
1415 |         print("</pre>\n</div>\n");
1416 |         printf("<h2 id=\"%s\">Results</h2>\n", $result_anchor);
1417 |     }
1418 |     print "\n" unless $Opts{Quiet};
1419 | 
1420 |     &links_summary(\%links, \%results, \%broken, \%redirects, \%blocked);
1421 |     &anchors_summary($p->{Anchors}, \%errors);
1422 | 
1423 |     $error_found = scalar(keys(%errors)) + scalar(keys(%broken)) + scalar(keys(%blocked));
1424 | 
1425 |     # Do we want to process other documents?
1426 |     if ($depth != 0) {
1427 | 
1428 |         for my $u (map { URI->new($_) } keys %links) {
1429 | 
1430 |             next unless $results{$u}{location}{success};    # Broken link?
1431 | 
1432 |             next unless &in_recursion_scope($u);
1433 | 
1434 |             # Do we understand its content type?
1435 |             next unless (defined $results{$u}{location}{type} && $results{$u}{location}{type} =~ $ContentTypes);
1436 | 
1437 |             # Have we already processed this URI?
1438 |             next if &already_processed($u, $uri);
1439 | 
1440 |             # Do the job
1441 |             print "\n" unless $Opts{Quiet};
1442 |             if ($Opts{HTML}) {
1443 |                 if (!$Opts{Command_Line}) {
1444 |                     if ($doc_count == $Opts{Max_Documents}) {
1445 |                         print(
1446 |                             "<hr>\n<p><strong>Maximum number of documents ($Opts{Max_Documents}) reached!</strong></p>\n"
1447 |                         );
1448 |                     }
1449 |                     if ($doc_count >= $Opts{Max_Documents}) {
1450 |                         $doc_count++;
1451 |                         print("<p>Not checking <strong>$u</strong></p>\n");
1452 |                         $processed{$u} = 1;
1453 |                         next;
1454 |                     }
1455 |                 }
1456 |             }
1457 | 
1458 |             # This is an inherently recursive algorithm, so Perl's warning is not
1459 |             # helpful.  You may wish to comment this out when debugging, though.
1460 |             no warnings 'recursion';
1461 | 
1462 |             if ($depth < 0) {
1463 |                 $error_found += &check_uri($params, $u, 0, -1, $cookie, $uri);
1464 |             }
1465 |             else {
1466 |                 $error_found += &check_uri($params, $u, 0, $depth - 1, $cookie, $uri);
1467 |             }
1468 |         }
1469 |     }
1470 |     return $error_found;
1471 | }
1472 | 
1473 | ###############################################################
1474 | # Distribute links based on host:port to avoid RobotUA delays #
1475 | ###############################################################
1476 | 
1477 | sub distribute_links(\%)
1478 | {
1479 |     my $hostlinks = shift;
1480 | 
1481 |     # Hosts ordered by weight (number of links), descending
1482 |     my @order =
1483 |         sort { scalar(@{$hostlinks->{$b}}) <=> scalar(@{$hostlinks->{$a}}) }
1484 |         keys %$hostlinks;
1485 | 
1486 |     # All link list flattened into one, in host weight order
1487 |     my @all;
1488 |     push(@all, @{$hostlinks->{$_}}) for @order;
1489 | 
1490 |     return @all if (scalar(@order) < 2);
1491 | 
1492 |     # Indexes and chunk size for "zipping" the end result list
1493 |     my $num = scalar(@{$hostlinks->{$order[0]}});
1494 |     my @indexes = map { $_ * $num } (0 .. $num - 1);
1495 | 
1496 |     # Distribute them
1497 |     my @result;
1498 |     while (my @chunk = splice(@all, 0, $num)) {
1499 |         @result[@indexes] = @chunk;
1500 |         @indexes = map { $_ + 1 } @indexes;
1501 |     }
1502 | 
1503 |     # Weed out undefs
1504 |     @result = grep(defined, @result);
1505 | 
1506 |     return @result;
1507 | }
1508 | 
1509 | ##########################################
1510 | # Decode Content-Encodings in a response #
1511 | ##########################################
1512 | 
1513 | sub decode_content ($)
1514 | {
1515 |     my $response = shift;
1516 |     my $error    = undef;
1517 | 
1518 |     my $docref = $response->decoded_content(ref => 1);
1519 |     if (defined($docref)) {
1520 |         utf8::encode($$docref);
1521 |         $response->content_ref($docref);
1522 | 
1523 |         # Remove Content-Encoding so it won't be decoded again later.
1524 |         $response->remove_header('Content-Encoding');
1525 |     }
1526 |     else {
1527 |         my $ce = $response->header('Content-Encoding');
1528 |         $ce = defined($ce) ? "'$ce'" : 'undefined';
1529 |         my $ct = $response->header('Content-Type');
1530 |         $ct = defined($ct) ? "'$ct'" : 'undefined';
1531 |         my $request_uri = $response->request->url;
1532 | 
1533 |         my $cs = $response->content_charset();
1534 |         $cs = defined($cs) ? "'$cs'" : 'unknown';
1535 |         $error =
1536 |             "Error decoding document at <$request_uri>, Content-Type $ct, " .
1537 |             "Content-Encoding $ce, content charset $cs: '$@'";
1538 |     }
1539 |     return $error;
1540 | }
1541 | 
1542 | #######################################
1543 | # Get and parse a resource to process #
1544 | #######################################
1545 | 
1546 | sub get_document ($\$$;\%\$$$$$)
1547 | {
1548 |     my ($method, $uri,    $in_recursion, $redirects, $referer,
1549 |         $cookie, $params, $check_num,    $is_start
1550 |     ) = @_;
1551 | 
1552 |     # $method contains the HTTP method the use (GET or HEAD)
1553 |     # $uri object contains the identifier of the resource
1554 |     # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least
1555 |     #                        the second resource checked)
1556 |     # $redirects is a pointer to the hash containing the map of the redirects
1557 |     # $referer is the URI object of the referring document
1558 |     # $cookie, $params, $check_num, and $is_start are for printing HTTP headers
1559 |     #                  and the form if $in_recursion == 0 and not authenticating
1560 | 
1561 |     # Get the resource
1562 |     my $response;
1563 |     if (defined($results{$uri}{response}) &&
1564 |         !($method eq 'GET' && $results{$uri}{method} eq 'HEAD'))
1565 |     {
1566 |         $response = $results{$uri}{response};
1567 |     }
1568 |     else {
1569 |         $response = &get_uri($method, $uri, $referer);
1570 |         &record_results($uri, $method, $response, $referer);
1571 |         &record_redirects($redirects, $response);
1572 |     }
1573 |     if (!$response->is_success()) {
1574 |         if (!$in_recursion) {
1575 | 
1576 |             # Is it too late to request authentication?
1577 |             if ($response->code() == 401) {
1578 |                 &authentication($response, $cookie, $params, $check_num,
1579 |                     $is_start);
1580 |             }
1581 |             else {
1582 |                 if ($Opts{HTML}) {
1583 |                     &html_header($uri, $cookie) if ($check_num == 1);
1584 |                     &print_form($params, $cookie, $check_num) if $is_start;
1585 |                     print "<p>", &status_icon($response->code());
1586 |                 }
1587 |                 &hprintf("\nError: %d %s\n",
1588 |                     $response->code(), $response->message() || '(no message)');
1589 |                 print "</p>\n" if $Opts{HTML};
1590 |             }
1591 |         }
1592 |         $response->{Stop} = 1;
1593 |         $response->content("");
1594 |         return ($response);
1595 |     }
1596 | 
1597 |     # What is the URI of the resource that we are processing by the way?
1598 |     my $base_uri    = $response->base();
1599 |     my $request_uri = URI->new($response->request->url);
1600 |     $response->{absolute_uri} = $request_uri->abs($base_uri);
1601 | 
1602 |     # Can we parse the document?
1603 |     my $failed_reason;
1604 |     my $ct = $response->header('Content-Type');
1605 |     if (!$ct || $ct !~ $ContentTypes) {
1606 |         $failed_reason = "Content-Type for <$request_uri> is " .
1607 |             (defined($ct) ? "'$ct'" : 'undefined');
1608 |     }
1609 |     else {
1610 |         $failed_reason = decode_content($response);
1611 |     }
1612 |     if ($failed_reason) {
1613 | 
1614 |         # No, there is a problem...
1615 |         if (!$in_recursion) {
1616 |             if ($Opts{HTML}) {
1617 |                 &html_header($uri, $cookie) if ($check_num == 1);
1618 |                 &print_form($params, $cookie, $check_num) if $is_start;
1619 |                 print "<p>", &status_icon(406);
1620 | 
1621 |             }
1622 |             &hprintf("Can't check links: %s.\n", $failed_reason);
1623 |             print "</p>\n" if $Opts{HTML};
1624 |         }
1625 |         $response->{Stop} = 1;
1626 |         $response->content("");
1627 |     }
1628 | 
1629 |     # Ok, return the information
1630 |     return ($response);
1631 | }
1632 | 
1633 | #########################################################
1634 | # Check whether a URI is within the scope of recursion. #
1635 | #########################################################
1636 | 
1637 | sub in_recursion_scope (\$)
1638 | {
1639 |     my ($uri) = @_;
1640 |     return 0 unless $uri;
1641 | 
1642 |     my $candidate = $uri->canonical();
1643 | 
1644 |     return 0 if should_exclude($candidate);
1645 | 
1646 |     for my $excluded_doc (@{$Opts{Exclude_Docs}}) {
1647 |         return 0 if ($candidate =~ $excluded_doc);
1648 |     }
1649 | 
1650 |     for my $base (@{$Opts{Base_Locations}}) {
1651 |         my $rel = $candidate->rel($base);
1652 |         next if ($candidate eq $rel);    # Relative path not possible?
1653 |         next if ($rel =~ m|^(\.\.)?/|);  # Relative path upwards?
1654 |         return 1;
1655 |     }
1656 | 
1657 |     return 0;    # We always have at least one base location, but none matched.
1658 | }
1659 | 
1660 | #################################
1661 | # Check for content type match. #
1662 | #################################
1663 | 
1664 | sub is_content_type ($$)
1665 | {
1666 |     my ($candidate, $type) = @_;
1667 |     return 0 unless ($candidate && $type);
1668 |     my @v = HTTP::Headers::Util::split_header_words($candidate);
1669 |     return scalar(@v) ? $type eq lc($v[0]->[0]) : 0;
1670 | }
1671 | 
1672 | ##################################################
1673 | # Check whether a URI has already been processed #
1674 | ##################################################
1675 | 
1676 | sub already_processed (\$\$)
1677 | {
1678 |     my ($uri, $referer) = @_;
1679 | 
1680 |     # Don't be verbose for that part...
1681 |     my $summary_value = $Opts{Summary_Only};
1682 |     $Opts{Summary_Only} = 1;
1683 | 
1684 |     # Do a GET: if it fails, we stop, if not, the results are cached
1685 |     my $response = &get_document('GET', $uri, 1, undef, $referer);
1686 | 
1687 |     # ... but just for that part
1688 |     $Opts{Summary_Only} = $summary_value;
1689 | 
1690 |     # Can we process the resource?
1691 |     return -1 if defined($response->{Stop});
1692 | 
1693 |     # Have we already processed it?
1694 |     return 1 if defined($processed{$response->{absolute_uri}->as_string()});
1695 | 
1696 |     # It's not processed yet and it is processable: return 0
1697 |     return 0;
1698 | }
1699 | 
1700 | ############################
1701 | # Get the content of a URI #
1702 | ############################
1703 | 
1704 | sub get_uri ($\$;\$$\%$$$$)
1705 | {
1706 | 
1707 |     # Here we have a lot of extra parameters in order not to lose information
1708 |     # if the function is called several times (401's)
1709 |     my ($method, $uri,   $referer, $start, $redirects,
1710 |         $code,   $realm, $message, $auth
1711 |     ) = @_;
1712 | 
1713 |     # $method contains the method used
1714 |     # $uri object contains the target of the request
1715 |     # $referer is the URI object of the referring document
1716 |     # $start is a timestamp (not defined the first time the function is called)
1717 |     # $redirects is a map of redirects
1718 |     # $code is the first HTTP return code
1719 |     # $realm is the realm of the request
1720 |     # $message is the HTTP message received
1721 |     # $auth equals 1 if we want to send out authentication information
1722 | 
1723 |     # For timing purposes
1724 |     $start = &get_timestamp() unless defined($start);
1725 | 
1726 |     # Prepare the query
1727 | 
1728 |     # Do we want printouts of progress?
1729 |     my $verbose_progress =
1730 |         !($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}));
1731 | 
1732 |     &hprintf("%s %s ", $method, $uri) if $verbose_progress;
1733 | 
1734 |     my $request = HTTP::Request->new($method, $uri);
1735 | 
1736 |     $request->header('Accept-Language' => $Opts{Accept_Language})
1737 |         if $Opts{Accept_Language};
1738 |     $request->header('Accept', $Accept);
1739 |     $request->header('Upgrade-Insecure-Requests', 1);
1740 |     $request->accept_decodable();
1741 | 
1742 |     # Are we providing authentication info?
1743 |     if ($auth && $request->url()->host() =~ $Opts{Trusted}) {
1744 |         if (defined($ENV{HTTP_AUTHORIZATION})) {
1745 |             $request->header(Authorization => $ENV{HTTP_AUTHORIZATION});
1746 |         }
1747 |         elsif (defined($Opts{User}) && defined($Opts{Password})) {
1748 |             $request->authorization_basic($Opts{User}, $Opts{Password});
1749 |         }
1750 |     }
1751 | 
1752 |     # Tell the user agent if we want progress reports for redirects or not.
1753 |     $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); })
1754 |         if $verbose_progress;
1755 | 
1756 |     # Set referer
1757 |     $request->referer($referer) if (!$Opts{No_Referer} && $referer);
1758 | 
1759 |     # Telling caches in the middle we want a fresh copy (Bug 4998)
1760 |     $request->header(Cache_Control => "max-age=0");
1761 | 
1762 |     # Do the query
1763 |     my $response = $ua->request($request);
1764 | 
1765 |     # Get the results
1766 |     # Record the very first response
1767 |     if (!defined($code)) {
1768 |         ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)});
1769 |     }
1770 | 
1771 |     # Authentication requested?
1772 |     if ($response->code() == 401 &&
1773 |         !defined($auth) &&
1774 |         (defined($ENV{HTTP_AUTHORIZATION}) ||
1775 |             (defined($Opts{User}) && defined($Opts{Password})))
1776 |         )
1777 |     {
1778 | 
1779 |         # Set host as trusted domain unless we already have one.
1780 |         if (!$Opts{Trusted}) {
1781 |             my $re = sprintf('^%s$', quotemeta($response->base()->host()));
1782 |             $Opts{Trusted} = qr/$re/io;
1783 |         }
1784 | 
1785 |         # Deal with authentication and avoid loops
1786 |         if (!defined($realm) &&
1787 |             $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/)
1788 |         {
1789 |             $realm = $1;
1790 |         }
1791 | 
1792 |         print "\n" if $verbose_progress;
1793 |         return &get_uri($method, $response->request()->url(),
1794 |             $referer, $start, $redirects, $code, $realm, $message, 1);
1795 |     }
1796 | 
1797 |     # @@@ subtract robot delay from the "fetched in" time?
1798 |     &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp()))
1799 |         if $verbose_progress;
1800 | 
1801 |     $response->{IsCss} =
1802 |         is_content_type($response->content_type(), "text/css");
1803 |     $response->{Realm} = $realm if defined($realm);
1804 | 
1805 |     return $response;
1806 | }
1807 | 
1808 | #########################################
1809 | # Record the results of an HTTP request #
1810 | #########################################
1811 | 
1812 | sub record_results (\$$$$\@)
1813 | {
1814 |     my ($uri, $method, $response, $referer, $mixedcontent_lines, @csp) = @_;
1815 |     $results{$uri}{referer}        = $referer;
1816 |     $results{$uri}{response}       = $response;
1817 |     $results{$uri}{method}         = $method;
1818 |     $results{$uri}{location}{code} = $response->code();
1819 |     $results{$uri}{location}{code} = RC_ROBOTS_TXT()
1820 |         if ($results{$uri}{location}{code} == 403 &&
1821 |         $response->message() =~ /Forbidden by robots\.txt/);
1822 |     $results{$uri}{location}{code} = RC_IP_DISALLOWED()
1823 |         if ($results{$uri}{location}{code} == 403 &&
1824 |         $response->message() =~ /non-public IP/);
1825 |     $results{$uri}{location}{code} = RC_DNS_ERROR()
1826 |         if ($results{$uri}{location}{code} == 500 &&
1827 |         $response->message() =~ /Bad hostname '[^\']*'/);
1828 |     $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED()
1829 |         if ($results{$uri}{location}{code} == 500 &&
1830 |         $response->message() =~ /Access to '[^\']*' URIs has been disabled/);
1831 | 
1832 |     $results{$uri}{location}{type}    = $response->header('Content-type');
1833 |     $results{$uri}{location}{display} = $results{$uri}{location}{code};
1834 | 
1835 |     # Rewind, check for the original code and message.
1836 |     for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) {
1837 |         $results{$uri}{location}{orig}         = $tmp->code();
1838 |         $results{$uri}{location}{orig_message} = $tmp->message() ||
1839 |             '(no message)';
1840 |     }
1841 |     # dealing with mixed content
1842 |     if ($mixedcontent_lines && $uri->scheme ne 'https') {
1843 |         # Check headers of CSP: upgrade-insecure-request
1844 |         if (! ($uri->scheme eq 'http' && $#csp != 0 && grep(/^upgrade-insecure-requests$/, @csp))) {
1845 |             # Set this the lines where the URL is used in mixedcontent_block context
1846 |             $results{$uri}{mixedcontent_blocked} = $mixedcontent_lines;
1847 |         }
1848 |     }
1849 |     $results{$uri}{location}{success} = $response->is_success();
1850 | 
1851 |     # If a suppressed broken link, fill the data structure like a typical success.
1852 |     # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n";
1853 |     if (!$results{$uri}{location}{success}) {
1854 |         my $code = $results{$uri}{location}{code};
1855 |         my $matchCodeUri = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}};
1856 |         my $matchCode = grep { $_ eq "$code" } @{$Opts{Suppress_Broken}};
1857 |         if ($matchCodeUri || $matchCode) {
1858 |             $results{$uri}{location}{success} = 1;
1859 |             $results{$uri}{location}{code}    = 100;
1860 |             $results{$uri}{location}{display} = 100;
1861 |         }
1862 |     }
1863 | 
1864 |     # Stores the authentication information
1865 |     if (defined($response->{Realm})) {
1866 |         $results{$uri}{location}{realm} = $response->{Realm};
1867 |         $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm};
1868 |     }
1869 | 
1870 |     # What type of broken link is it? (stored in {record} - the {display}
1871 |     #              information is just for visual use only)
1872 |     if ($results{$uri}{location}{display} == 401 &&
1873 |         $results{$uri}{location}{code} == 404)
1874 |     {
1875 |         $results{$uri}{location}{record} = 404;
1876 |     }
1877 |     else {
1878 |         $results{$uri}{location}{record} = $results{$uri}{location}{display};
1879 |     }
1880 | 
1881 |     # Did it fail?
1882 |     $results{$uri}{location}{message} = $response->message() || '(no message)';
1883 |     if (!$results{$uri}{location}{success}) {
1884 |         &hprintf(
1885 |             "Error: %d %s\n",
1886 |             $results{$uri}{location}{code},
1887 |             $results{$uri}{location}{message}
1888 |         ) if ($Opts{Verbose});
1889 |     }
1890 |     return;
1891 | }
1892 | 
1893 | ####################
1894 | # Parse a document #
1895 | ####################
1896 | 
1897 | sub parse_document (\$\$$$$)
1898 | {
1899 |     my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_;
1900 | 
1901 |     print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n")
1902 |         if $Opts{Verbose};
1903 | 
1904 |     my $p;
1905 | 
1906 |     if (defined($results{$uri}{parsing})) {
1907 | 
1908 |         # We have already done the job. Woohoo!
1909 |         $p->{base}    = $results{$uri}{parsing}{base};
1910 |         $p->{Anchors} = $results{$uri}{parsing}{Anchors};
1911 |         $p->{Links}   = $results{$uri}{parsing}{Links};
1912 |         return $p;
1913 |     }
1914 | 
1915 |     $p = W3C::LinkChecker->new();
1916 |     $p->{base} = $base_uri;
1917 | 
1918 |     $p->{csp} = $response->header('Content-Security-Policy');
1919 | 
1920 |     my $stype = $response->header("Content-Style-Type");
1921 |     $p->{style_is_css} = !$stype || is_content_type($stype, "text/css");
1922 | 
1923 |     my $start;
1924 |     if (!$Opts{Summary_Only}) {
1925 |         $start = &get_timestamp();
1926 |         print("Parsing...\n");
1927 |     }
1928 | 
1929 |     # Content-Encoding etc already decoded in get_document().
1930 |     my $docref = $response->content_ref();
1931 | 
1932 |     # Count lines beforehand if needed (for progress indicator, or CSS while
1933 |     # we don't get any line context out of the parser).  In case of HTML, the
1934 |     # actual final number of lines processed shown is populated by our
1935 |     # end_document handler.
1936 |     $p->{Total} = ($$docref =~ tr/\n//)
1937 |         if ($response->{IsCss} || $Opts{Progress});
1938 | 
1939 |     # We only look for anchors if we are not interested in the links
1940 |     # obviously, or if we are running a recursive checking because we
1941 |     # might need this information later
1942 |     $p->{only_anchors} = !($links || $rec_needs_links);
1943 | 
1944 |     if ($response->{IsCss}) {
1945 | 
1946 |         # Parse as CSS
1947 | 
1948 |         $p->parse_css($$docref, LINE_UNKNOWN());
1949 |     }
1950 |     else {
1951 | 
1952 |         # Parse as HTML
1953 | 
1954 |         # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
1955 |         # Processing instructions are not parsed by process, but in this case
1956 |         # it should be. It's expensive, it's horrible, but it's the easiest way
1957 |         # for right now.
1958 |         $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
1959 |             unless $p->{only_anchors};
1960 | 
1961 |         $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
1962 | 
1963 |         $p->parse($$docref)->eof();
1964 |     }
1965 | 
1966 |     $response->content("");
1967 | 
1968 |     if (!$Opts{Summary_Only}) {
1969 |         my $stop = &get_timestamp();
1970 |         print "\r" if $Opts{Progress};
1971 |         &hprintf(" done (%d lines in %s seconds).\n",
1972 |             $p->{Total}, &time_diff($start, $stop));
1973 |     }
1974 | 
1975 |     # Save the results before exiting
1976 |     $results{$uri}{parsing}{base}    = $p->{base};
1977 |     $results{$uri}{parsing}{Anchors} = $p->{Anchors};
1978 |     $results{$uri}{parsing}{Links}   = $p->{Links};
1979 | 
1980 |     return $p;
1981 | }
1982 | 
1983 | ####################################
1984 | # Constructor for W3C::LinkChecker #
1985 | ####################################
1986 | 
1987 | sub new
1988 | {
1989 |     my $p = HTML::Parser::new(@_, api_version => 3);
1990 |     $p->utf8_mode(1);
1991 | 
1992 |     # Set up handlers
1993 | 
1994 |     $p->handler(start => 'start', 'self, tagname, attr, line');
1995 |     $p->handler(end   => 'end',   'self, tagname, line');
1996 |     $p->handler(text  => 'text',  'self, dtext, line');
1997 |     $p->handler(
1998 |         declaration => sub {
1999 |             my $self = shift;
2000 |             $self->declaration(substr($_[0], 2, -1));
2001 |         },
2002 |         'self, text, line'
2003 |     );
2004 |     $p->handler(end_document => 'end_document', 'self, line');
2005 |     if ($Opts{Progress}) {
2006 |         $p->handler(default => 'parse_progress', 'self, line');
2007 |         $p->{last_percentage} = 0;
2008 |     }
2009 | 
2010 |     # Check <a [..] name="...">?
2011 |     $p->{check_name} = 1;
2012 | 
2013 |     # Check <[..] id="..">?
2014 |     $p->{check_id} = 1;
2015 | 
2016 |     # Don't interpret comment loosely
2017 |     $p->strict_comment(1);
2018 | 
2019 |     return $p;
2020 | }
2021 | 
2022 | #################################################
2023 | # Record or return  the doctype of the document #
2024 | #################################################
2025 | 
2026 | sub doctype
2027 | {
2028 |     my ($self, $dc) = @_;
2029 |     return $self->{doctype} unless $dc;
2030 |     $_ = $self->{doctype} = $dc;
2031 | 
2032 |     # What to look for depending on the doctype
2033 | 
2034 |     # Check for <a name="...">?
2035 |     $self->{check_name} = 0
2036 |         if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %;
2037 | 
2038 |     # Check for <* id="...">?
2039 |     $self->{check_id} = 0
2040 |         if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%);
2041 | 
2042 |     # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...)
2043 |     $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%);
2044 | 
2045 |     return;
2046 | }
2047 | 
2048 | ###################################
2049 | # Print parse progress indication #
2050 | ###################################
2051 | 
2052 | sub parse_progress
2053 | {
2054 |     my ($self, $line) = @_;
2055 |     return unless defined($line) && $line > 0 && $self->{Total} > 0;
2056 | 
2057 |     my $percentage = int($line / $self->{Total} * 100);
2058 |     if ($percentage != $self->{last_percentage}) {
2059 |         printf("\r%4d%%", $percentage);
2060 |         $self->{last_percentage} = $percentage;
2061 |     }
2062 | 
2063 |     return;
2064 | }
2065 | 
2066 | #############################
2067 | # Extraction of the anchors #
2068 | #############################
2069 | 
2070 | sub get_anchor
2071 | {
2072 |     my ($self, $tag, $attr) = @_;
2073 | 
2074 |     my $anchor = $self->{check_id} ? $attr->{id} : undef;
2075 |     if ($self->{check_name} && ($tag eq 'a')) {
2076 | 
2077 |         # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
2078 |         # Force an error if it's not the case (or if id's and name's values
2079 |         #                                      are different)
2080 |         # If id is defined, name if defined must have the same value
2081 |         $anchor ||= $attr->{name};
2082 |     }
2083 | 
2084 |     return $anchor;
2085 | }
2086 | 
2087 | #############################
2088 | # W3C::LinkChecker handlers #
2089 | #############################
2090 | 
2091 | sub add_link
2092 | {
2093 |     my ($self, $uri, $base, $line, $mixedcontent) = @_;
2094 |     if (defined($uri)) {
2095 | 
2096 |         # Remove repeated slashes after the . or .. in relative links, to avoid
2097 |         # duplicated checking or infinite recursion.
2098 |         $uri =~ s|^(\.\.?/)/+|$1|o;
2099 |         $uri = Encode::decode_utf8($uri);
2100 |         $uri = URI->new_abs($uri, $base) if defined($base);
2101 |         $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}{$mixedcontent}++;
2102 |     }
2103 |     return;
2104 | }
2105 | 
2106 | sub start
2107 | {
2108 |     my ($self, $tag, $attr, $line) = @_;
2109 |     $line = LINE_UNKNOWN() unless defined($line);
2110 | 
2111 |     # Anchors
2112 |     my $anchor = $self->get_anchor($tag, $attr);
2113 |     $self->{Anchors}{$anchor}{$line}++ if defined($anchor);
2114 | 
2115 |     # Links
2116 |     if (!$self->{only_anchors}) {
2117 | 
2118 |         my $tag_local_base = undef;
2119 | 
2120 |         # Special case: base/@href
2121 |         # @@@TODO: The reason for handling <base href> ourselves is that LWP's
2122 |         # head parsing magic fails at least for responses that have
2123 |         # Content-Encodings: https://rt.cpan.org/Ticket/Display.html?id=54361
2124 |         if ($tag eq 'base') {
2125 | 
2126 |             # Ignore <base> with missing/empty href.
2127 |             $self->{base} = $attr->{href}
2128 |                 if (defined($attr->{href}) && length($attr->{href}));
2129 |         }
2130 | 
2131 |         # Special case: meta[@http-equiv=Refresh]/@content
2132 |         elsif ($tag eq 'meta') {
2133 |             if ($attr->{'http-equiv'} &&
2134 |                 lc($attr->{'http-equiv'}) eq 'refresh')
2135 |             {
2136 |                 my $content = $attr->{content};
2137 |                 if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) {
2138 |                     $self->add_link($1, undef, $line, MC_BLOCK);
2139 |                 }
2140 |             }
2141 |         }
2142 | 
2143 |         # Special case: tags that have "local base"
2144 |         elsif ($tag eq 'applet' || $tag eq 'object') {
2145 |             if (my $codebase = $attr->{codebase}) {
2146 | 
2147 |                 # Applet codebases are directories, append trailing slash
2148 |                 # if it's not there so that new_abs does the right thing.
2149 |                 $codebase .= "/" if ($tag eq 'applet' && $codebase !~ m|/$|);
2150 | 
2151 |                 # TODO: HTML 4 spec says applet/@codebase may only point to
2152 |                 # subdirs of the directory containing the current document.
2153 |                 # Should we do something about that?
2154 |                 $tag_local_base = URI->new_abs($codebase, $self->{base});
2155 |             }
2156 |         }
2157 | 
2158 |         # Link attributes:
2159 |         if (my $link_attrs = LINK_ATTRS()->{$tag}) {
2160 |             for my $attrdesc (@$link_attrs) {
2161 |                 my ($mixedcontent, $la) = @$attrdesc;
2162 |                 my $attributes = [];
2163 |                 my $skip = 0;
2164 |                 if ($mixedcontent == MC_CONDITIONAL) {
2165 |                     if ($tag eq 'img') {
2166 |                         if ($attr->{crossorigin}) {
2167 |                             $mixedcontent = MC_BLOCK;
2168 |                         } else {
2169 |                             $mixedcontent = MC_OPTIONAL;
2170 |                         }
2171 |                     } elsif ($tag eq 'link') {
2172 |                         my @blocked_link_types = ('stylesheet', 'prefetch', 'preload', 'preconnect', 'prerender', 'modulepreload', 'icon', 'dns-prefetch');
2173 |                         my @link_types = map{ lc($_)} split(/\s+/, $attr->{rel});
2174 |                         for my $type (@link_types) {
2175 |                             if ( grep (/^$type$/, @blocked_link_types)) {
2176 |                                 $mixedcontent = MC_BLOCK;
2177 |                                 last;
2178 |                             }
2179 |                         }
2180 |                         if ($mixedcontent == MC_CONDITIONAL) {
2181 |                             $mixedcontent = MC_NONE;
2182 |                         }
2183 |                     }
2184 |                 }
2185 |                 if ($tag eq 'link') {
2186 |                     my @skipped_link_types = ('preconnect', 'dns-prefetch');
2187 |                     my @link_types = map{ lc($_)} split(/\s+/, $attr->{rel});
2188 |                     for my $type (@link_types) {
2189 |                         if ( grep (/^$type$/, @skipped_link_types)) {
2190 |                             $skip = 1;
2191 |                             last;
2192 |                         }
2193 |                     }
2194 |                 }
2195 |                 if (!$skip) {
2196 |                     $self->add_link($attr->{$la}, $tag_local_base, $line, $mixedcontent);
2197 |                 }
2198 |             }
2199 |         }
2200 | 
2201 |         # List of links attributes:
2202 |         if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) {
2203 |             my ($sep, $subsep, $attrdesc) = @$link_attrs;
2204 |             my ($mixedcontent, $la) = @$attrdesc;
2205 |             if (defined(my $value = $attr->{$la})) {
2206 |                 for my $comp (split($sep, $value)) {
2207 |                     for my $link (split($subsep, $comp)) {
2208 |                         $self->add_link($link, $tag_local_base, $line, $mixedcontent);
2209 |                     }
2210 |                 }
2211 |             }
2212 |         }
2213 | 
2214 |         # Inline CSS:
2215 |         delete $self->{csstext};
2216 |         if ($tag eq 'style') {
2217 |             $self->{csstext} = ''
2218 |                 if ((!$attr->{type} && $self->{style_is_css}) ||
2219 |                 is_content_type($attr->{type}, "text/css"));
2220 |         }
2221 |         elsif ($self->{style_is_css} && (my $style = $attr->{style})) {
2222 |             $style = CSS::DOM::Style::parse($style);
2223 |             $self->parse_style($style, $line);
2224 |         }
2225 |     }
2226 | 
2227 |     $self->parse_progress($line) if $Opts{Progress};
2228 |     return;
2229 | }
2230 | 
2231 | sub end
2232 | {
2233 |     my ($self, $tagname, $line) = @_;
2234 | 
2235 |     $self->parse_css($self->{csstext}, $line) if ($tagname eq 'style');
2236 |     delete $self->{csstext};
2237 | 
2238 |     $self->parse_progress($line) if $Opts{Progress};
2239 |     return;
2240 | }
2241 | 
2242 | sub parse_css
2243 | {
2244 |     my ($self, $css, $line) = @_;
2245 |     return unless $css;
2246 | 
2247 |     my $sheet = CSS::DOM::parse($css);
2248 |     for my $rule (@{$sheet->cssRules()}) {
2249 |         if ($rule->type() == IMPORT_RULE()) {
2250 |             $self->add_link($rule->href(), $self->{base}, $line, MC_BLOCK);
2251 |         }
2252 |         elsif ($rule->type == STYLE_RULE()) {
2253 |             $self->parse_style($rule->style(), $line);
2254 |         }
2255 |     }
2256 |     return;
2257 | }
2258 | 
2259 | sub parse_style
2260 | {
2261 |     my ($self, $style, $line) = @_;
2262 |     return unless $style;
2263 | 
2264 |     for (my $i = 0, my $len = $style->length(); $i < $len; $i++) {
2265 |         my $prop = $style->item($i);
2266 |         my $val  = $style->getPropertyValue($prop);
2267 | 
2268 |         while ($val =~ /$CssUrl/go) {
2269 |             my $url = CSS::DOM::Util::unescape($2);
2270 |             # FIXME: are there CSS rules with URLs that aren't mixed content blockable?
2271 |             $self->add_link($url, $self->{base}, $line, MC_OPTIONAL);
2272 |         }
2273 |     }
2274 | 
2275 |     return;
2276 | }
2277 | 
2278 | sub declaration
2279 | {
2280 |     my ($self, $text, $line) = @_;
2281 | 
2282 |     # Extract the doctype
2283 |     my @declaration = split(/\s+/, $text, 4);
2284 |     if ($#declaration >= 3 &&
2285 |         $declaration[0] eq 'DOCTYPE' &&
2286 |         lc($declaration[1]) eq 'html')
2287 |     {
2288 | 
2289 |         # Parse the doctype declaration
2290 |         if ($text =~
2291 |             m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i
2292 |             )
2293 |         {
2294 | 
2295 |             # Store the doctype
2296 |             $self->doctype($1) if $1;
2297 | 
2298 |             # If there is a link to the DTD, record it
2299 |             $self->add_link($3, undef, $line, MC_NONE)
2300 |                 if (!$self->{only_anchors} && $3);
2301 |         }
2302 |     }
2303 | 
2304 |     $self->text($text) unless $self->{only_anchors};
2305 | 
2306 |     return;
2307 | }
2308 | 
2309 | sub text
2310 | {
2311 |     my ($self, $text, $line) = @_;
2312 |     $self->{csstext} .= $text if defined($self->{csstext});
2313 |     $self->parse_progress($line) if $Opts{Progress};
2314 |     return;
2315 | }
2316 | 
2317 | sub end_document
2318 | {
2319 |     my ($self, $line) = @_;
2320 |     $self->{Total} = $line;
2321 |     delete $self->{csstext};
2322 |     return;
2323 | }
2324 | 
2325 | sub validate_pdf_fragment
2326 | {
2327 |     my ($uri, $fragments) = @_;
2328 |     mark_fragments_as_not_checked($uri, $fragments);
2329 |     # TODO: Check the rules defined in
2330 |     # https://pdfobject.com/pdf/pdf_open_parameters_acro8.pdf
2331 |     return;
2332 | }
2333 | 
2334 | sub mark_fragments_as_not_checked
2335 | {
2336 |     my ($uri, $fragments) = @_;
2337 |     for my $fragment (keys %$fragments) {
2338 |         $results{$uri}{fragments}{$fragment} = -1;
2339 |     }
2340 |     return;
2341 | }
2342 | 
2343 | ################################
2344 | # Check the validity of a link #
2345 | ################################
2346 | 
2347 | sub check_validity (\$\$$\%\%\$\$)
2348 | {
2349 |     my ($referer, $uri, $want_links, $links, $redirects, $mixedcontent_lines, $csp) = @_;
2350 | 
2351 |     # $referer is the URI object of the document checked
2352 |     # $uri is the URI object of the target that we are verifying
2353 |     # $want_links is true if we're interested in links in the target doc
2354 |     # $links is a hash of the links in the documents checked
2355 |     # $redirects is a map of the redirects encountered
2356 |     # $mixedcontent_lines is an array of lines where the link appears
2357 |     #                     in a mixedcontent context (if any)
2358 |     # $csp is an array of values from the content-security-policy header
2359 |     #                     of the page
2360 | 
2361 |     # Get the document with the appropriate method: GET if there are
2362 |     # fragments to check or links are wanted, HEAD is enough otherwise.
2363 |     my $fragments = $links->{$uri}{fragments} || {};
2364 |     my $method = ($want_links || %$fragments) ? 'GET' : 'HEAD';
2365 | 
2366 |     my $response;
2367 |     my $being_processed = 0;
2368 |     if (!defined($results{$uri}) ||
2369 |         ($method eq 'GET' && $results{$uri}{method} eq 'HEAD'))
2370 |     {
2371 |         $being_processed = 1;
2372 |         $response = &get_uri($method, $uri, $referer);
2373 | 
2374 |         # Get the information back from get_uri()
2375 |         &record_results($uri, $method, $response, $referer, $mixedcontent_lines, $csp);
2376 | 
2377 |         # Record the redirects
2378 |         &record_redirects($redirects, $response);
2379 |     }
2380 |     elsif (!($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}))) {
2381 |         my $ref = $results{$uri}{referer};
2382 |         &hprintf("Already checked%s\n", $ref ? ", referrer $ref" : ".");
2383 |     }
2384 | 
2385 |     # We got the response of the HTTP request. Stop here if it was a HEAD.
2386 |     return if ($method eq 'HEAD');
2387 | 
2388 |     # There are fragments. Parse the document.
2389 |     my $p;
2390 |     if ($being_processed) {
2391 |         if (!defined($results{$uri}{location}{type})) {
2392 |             &hprintf("Can't check content: Content-Type for '%s' is undefined.\n",
2393 |                 $uri)
2394 |                 if ($Opts{Verbose});
2395 |             $response->content("");
2396 |             return;
2397 |         } elsif ($results{$uri}{location}{type} =~ "application/pdf") {
2398 |             validate_pdf_fragment($uri, $fragments);
2399 |             $response->content("");
2400 |             return;
2401 |         } elsif ($results{$uri}{location}{type} !~ $ContentTypes) {
2402 |             &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
2403 |                 $uri, $results{$uri}{location}{type})
2404 |                 if ($Opts{Verbose});
2405 |             # TODO: find content types for which fragment are defined
2406 |             # Maybe starting from https://en.wikipedia.org/wiki/URI_fragment#Examples
2407 |             mark_fragments_as_not_checked($uri, $fragments);
2408 |             $response->content("");
2409 |             return;
2410 |         }
2411 | 
2412 |         # Do it then
2413 |         if (my $error = decode_content($response)) {
2414 |             &hprintf("%s\n.", $error);
2415 |         }
2416 |         # @@@TODO: this isn't the best thing to do if a decode error occurred
2417 |         $p =
2418 |             &parse_document($uri, $response->base(), $response, 0,
2419 |             $want_links);
2420 |     }
2421 |     else {
2422 | 
2423 |         # We already had the information
2424 |         $p->{Anchors} = $results{$uri}{parsing}{Anchors};
2425 |     }
2426 | 
2427 |     # Check that the fragments exist
2428 |     for my $fragment (keys %$fragments) {
2429 |         if (defined($p->{Anchors}{$fragment}) ||
2430 |             &escape_match($fragment, $p->{Anchors}) ||
2431 |             # per https://html.spec.whatwg.org/multipage/browsing-the-web.html#scroll-to-fragid:ascii-case-insensitive
2432 |             # 'top' is always a valid fragment in HTML
2433 |             (!$response->{IsCss} && $fragment eq 'top') ||
2434 |             # TODO #top is OK in HTML documents
2435 |             grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}})
2436 |         {
2437 |             $results{$uri}{fragments}{$fragment} = 1;
2438 |         }
2439 |         else {
2440 |             $results{$uri}{fragments}{$fragment} = 0;
2441 |         }
2442 |     }
2443 |     return;
2444 | }
2445 | 
2446 | sub escape_match ($\%)
2447 | {
2448 |     my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
2449 |     for my $b (keys %$hash) {
2450 |         return 1 if ($a eq URI::Escape::uri_unescape($b));
2451 |     }
2452 |     return 0;
2453 | }
2454 | 
2455 | ##########################
2456 | # Ask for authentication #
2457 | ##########################
2458 | 
2459 | sub authentication ($;$$$$)
2460 | {
2461 |     my ($response, $cookie, $params, $check_num, $is_start) = @_;
2462 | 
2463 |     my $realm = '';
2464 |     if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) {
2465 |         $realm = $1;
2466 |     }
2467 | 
2468 |     if ($Opts{Command_Line}) {
2469 |         printf STDERR <<'EOF', $response->request()->url(), $realm;
2470 | 
2471 | Authentication is required for %s.
2472 | The realm is "%s".
2473 | Use the -u and -p options to specify a username and password and the -d option
2474 | to specify trusted domains.
2475 | EOF
2476 |     }
2477 |     else {
2478 | 
2479 |         printf(
2480 |             "Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n",
2481 |             $response->www_authenticate(),
2482 |             $cookie ? "Set-Cookie: $cookie\n" : "",
2483 |         );
2484 | 
2485 |         printf(
2486 |             "%s
2487 | <html lang=\"en\">
2488 | <head>
2489 | <title>W3C Link Checker: 401 Authorization Required</title>
2490 | %s</head>
2491 | <body>", $DocType, $Head
2492 |         );
2493 |         &banner(': 401 Authorization Required');
2494 |         &print_form($params, $cookie, $check_num) if $is_start;
2495 |         printf(
2496 |             '<p>
2497 |   %s
2498 |   You need "%s" access to <a href="%s">%s</a> to perform link checking.<br>
2499 | ',
2500 |             &status_icon(401),
2501 |             &encode($realm), (&encode($response->request()->url())) x 2
2502 |         );
2503 | 
2504 |         my $host = $response->request()->url()->host();
2505 |         if ($Opts{Trusted} && $host !~ $Opts{Trusted}) {
2506 |             printf <<'EOF', &encode($Opts{Trusted}), &encode($host);
2507 |   This service has been configured to send authentication only to hostnames
2508 |   matching the regular expression <code>%s</code>, but the hostname
2509 |   <code>%s</code> does not match it.
2510 | EOF
2511 |         }
2512 | 
2513 |         print "</p>\n";
2514 |     }
2515 |     return;
2516 | }
2517 | 
2518 | ##################
2519 | # Get statistics #
2520 | ##################
2521 | 
2522 | sub get_timestamp ()
2523 | {
2524 |     return pack('LL', Time::HiRes::gettimeofday());
2525 | }
2526 | 
2527 | sub time_diff ($$)
2528 | {
2529 |     my @start = unpack('LL', $_[0]);
2530 |     my @stop  = unpack('LL', $_[1]);
2531 |     for ($start[1], $stop[1]) {
2532 |         $_ /= 1_000_000;
2533 |     }
2534 |     return (sprintf("%.2f", ($stop[0] + $stop[1]) - ($start[0] + $start[1])));
2535 | }
2536 | 
2537 | ########################
2538 | # Handle the redirects #
2539 | ########################
2540 | 
2541 | # Record the redirects in a hash
2542 | sub record_redirects (\%$)
2543 | {
2544 |     my ($redirects, $response) = @_;
2545 |     for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
2546 | 
2547 |         # Check for redirect match.
2548 |         my $from = $prev->request()->url();
2549 |         my $to   = $response->request()->url();  # same on every loop iteration
2550 |         my $from_to = $from . '->' . $to;
2551 |         my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}};
2552 | 
2553 |         # print STDERR "Result $match of redirect checking $from_to\n";
2554 |         if ($match) { next; }
2555 | 
2556 |         $match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}};
2557 | 
2558 |         # print STDERR "Result $match of regexp checking $from_to\n";
2559 |         if ($match) { next; }
2560 | 
2561 |         my $c = $prev->code();
2562 |         if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) {
2563 |             next;
2564 |         }
2565 | 
2566 |         $redirects->{$prev->request()->url()} = $response->request()->url();
2567 |     }
2568 |     return;
2569 | }
2570 | 
2571 | # Determine if a request is redirected
2572 | sub is_redirected ($%)
2573 | {
2574 |     my ($uri, %redirects) = @_;
2575 |     return (defined($redirects{$uri}));
2576 | }
2577 | 
2578 | # Get a list of redirects for a URI
2579 | sub get_redirects ($%)
2580 | {
2581 |     my ($uri, %redirects) = @_;
2582 |     my @history = ($uri);
2583 |     my %seen    = ($uri => 1);    # for tracking redirect loops
2584 |     my $loop    = 0;
2585 |     while ($redirects{$uri}) {
2586 |         $uri = $redirects{$uri};
2587 |         push(@history, $uri);
2588 |         if ($seen{$uri}) {
2589 |             $loop = 1;
2590 |             last;
2591 |         }
2592 |         else {
2593 |             $seen{$uri}++;
2594 |         }
2595 |     }
2596 |     return ($loop, @history);
2597 | }
2598 | 
2599 | ####################################################
2600 | # Tool for sorting the unique elements of an array #
2601 | ####################################################
2602 | 
2603 | sub sort_unique (@)
2604 | {
2605 |     my %saw;
2606 |     @saw{@_} = ();
2607 |     # Treat "(N/A)" as 0
2608 |     return sort { (($a =~ /(\d+)/)[0] || 0) <=> (($b =~ /(\d+)/)[0] || 0) } keys %saw;
2609 | }
2610 | 
2611 | #####################
2612 | # Print the results #
2613 | #####################
2614 | 
2615 | sub line_number ($)
2616 | {
2617 |     my $line = shift;
2618 |     return $line if ($line >= 0);
2619 |     return "(N/A)";
2620 | }
2621 | 
2622 | sub http_rc ($)
2623 | {
2624 |     my $rc = shift;
2625 |     return $rc if ($rc >= 0);
2626 |     return "(N/A)";
2627 | }
2628 | 
2629 | # returns true if the given code is informational
2630 | sub informational ($)
2631 | {
2632 |     my $rc = shift;
2633 |     return $rc == RC_ROBOTS_TXT() ||
2634 |         $rc == RC_IP_DISALLOWED() ||
2635 |         $rc == RC_PROTOCOL_DISALLOWED();
2636 | }
2637 | 
2638 | sub anchors_summary (\%\%)
2639 | {
2640 |     my ($anchors, $errors) = @_;
2641 | 
2642 |     # Number of anchors found.
2643 |     my $n = scalar(keys(%$anchors));
2644 |     if (!$Opts{Quiet}) {
2645 |         if ($Opts{HTML}) {
2646 |             print("<h3>Anchors</h3>\n<p>");
2647 |         }
2648 |         else {
2649 |             print("Anchors\n\n");
2650 |         }
2651 |         &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's');
2652 |         print("</p>\n") if $Opts{HTML};
2653 |     }
2654 | 
2655 |     # List of the duplicates, if any.
2656 |     my @errors = keys %{$errors};
2657 |     if (!scalar(@errors)) {
2658 |         print("<p>Valid anchors!</p>\n")
2659 |             if (!$Opts{Quiet} && $Opts{HTML} && $n);
2660 |         return;
2661 |     }
2662 |     undef $n;
2663 | 
2664 |     print_doc_header();
2665 |     print('<p>') if $Opts{HTML};
2666 |     print('List of duplicate and empty anchors');
2667 |     print <<'EOF' if $Opts{HTML};
2668 | </p>
2669 | <table class="report" border="1" summary="List of duplicate and empty anchors.">
2670 | <thead>
2671 | <tr>
2672 | <th>Anchor</th>
2673 | <th>Lines</th>
2674 | </tr>
2675 | </thead>
2676 | <tbody>
2677 | EOF
2678 |     print("\n");
2679 | 
2680 |     for my $anchor (@errors) {
2681 |         my $format;
2682 |         my @unique = &sort_unique(
2683 |             map { line_number($_) }
2684 |                 keys %{$anchors->{$anchor}}
2685 |         );
2686 |         if ($Opts{HTML}) {
2687 |             $format = "<tr><td class=\"broken\">%s</td><td>%s</td></tr>\n";
2688 |         }
2689 |         else {
2690 |             my $s = (scalar(@unique) > 1) ? 's' : '';
2691 |             $format = "\t%s\tLine$s: %s\n";
2692 |         }
2693 |         printf($format,
2694 |             &encode(length($anchor) ? $anchor : 'Empty anchor'),
2695 |             join(', ', @unique));
2696 |     }
2697 | 
2698 |     print("</tbody>\n</table>\n") if $Opts{HTML};
2699 | 
2700 |     return;
2701 | }
2702 | 
2703 | sub show_link_report (\%\%\%\%\@;$\%)
2704 | {
2705 |     my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
2706 | 
2707 |     print("\n<dl class=\"report\">") if $Opts{HTML};
2708 |     print("\n") if (!$Opts{Quiet});
2709 | 
2710 |     # Process each URL
2711 |     my ($c, $previous_c);
2712 |     for my $u (@$urls) {
2713 |         my @fragments = keys %{$broken->{$u}{fragments}};
2714 | 
2715 |         # Did we get a redirect?
2716 |         my $redirected = &is_redirected($u, %$redirects);
2717 | 
2718 |         # List of lines
2719 |         my @total_lines;
2720 |         push(@total_lines, keys(%{$links->{$u}{location}}));
2721 |         for my $f (@fragments) {
2722 |             push(@total_lines, keys(%{$links->{$u}{fragments}{$f}}))
2723 |                 unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()}));
2724 |         }
2725 | 
2726 |         my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects);
2727 |         my $currloc = $results->{$u}{location};
2728 | 
2729 |         # Error type
2730 |         $c = &code_shown($u, $results);
2731 | 
2732 |         # What to do
2733 |         my $whattodo = '';
2734 |         my $redirect_too;
2735 |         if ($todo) {
2736 |             if ($u =~ m/^javascript:/) {
2737 |                 if ($Opts{HTML}) {
2738 |                     $whattodo =
2739 |                         'You should change this link: people using a browser without JavaScript support
2740 | will <em>not</em> be able to follow this link.';
2741 |                 }
2742 |                 else {
2743 |                     $whattodo =
2744 |                         'Change this link: people using a browser without JavaScript support will not be able to follow this link.';
2745 |                 }
2746 |             }
2747 |             elsif ($c == RC_ROBOTS_TXT()) {
2748 |                 $whattodo =
2749 |                     'The link was not checked due to robots exclusion ' .
2750 |                     'rules. Check the link manually.';
2751 |             }
2752 |             elsif ($redirect_loop) {
2753 |                 $whattodo =
2754 |                     'Retrieving the URI results in a redirect loop, that should be '
2755 |                     . 'fixed.  Examine the redirect sequence to see where the loop '
2756 |                     . 'occurs.';
2757 |             }
2758 |             else {
2759 |                 $whattodo = $todo->{$c};
2760 |             }
2761 |         }
2762 |         elsif (defined($redirects{$u})) {
2763 | 
2764 |             # Redirects
2765 |             if (($u . '/') eq $redirects{$u}) {
2766 |                 $whattodo =
2767 |                     'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.';
2768 |             }
2769 |             elsif ($c == 307 || $c == 302) {
2770 |                 $whattodo =
2771 |                     'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.';
2772 |             }
2773 |             elsif ($c == 301 || $c == 308) {
2774 |                 $whattodo =
2775 |                     'This is a permanent redirect. The link should be updated.';
2776 |             }
2777 |         }
2778 | 
2779 |         my @unique = &sort_unique(map { line_number($_) } @total_lines);
2780 |         my $lines_list = join(', ', @unique);
2781 |         my $s = (scalar(@unique) > 1) ? 's' : '';
2782 |         undef @unique;
2783 | 
2784 |         my @http_codes = ($currloc->{code});
2785 |         unshift(@http_codes, $currloc->{orig}) if $currloc->{orig};
2786 |         @http_codes = map { http_rc($_) } @http_codes;
2787 | 
2788 |         if ($Opts{HTML}) {
2789 | 
2790 |             # Style stuff
2791 |             my $idref = '';
2792 |             if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
2793 |                 $idref      = ' id="d' . $doc_count . 'code_' . $c . '"';
2794 |                 $previous_c = $c;
2795 |             }
2796 | 
2797 |             # Main info
2798 |             for (@redirects_urls) {
2799 |                 $_ = &show_url($_);
2800 |             }
2801 | 
2802 |             # HTTP message
2803 |             my $http_message;
2804 |             if ($currloc->{message}) {
2805 |                 $http_message = &encode($currloc->{message});
2806 |                 if ($c == 404 || $c == 500) {
2807 |                     $http_message =
2808 |                         '<span class="broken">' . $http_message . '</span>';
2809 |                 }
2810 |             }
2811 |             my $redirmsg =
2812 |                 $redirect_loop ? ' <em>redirect loop detected</em>' : '';
2813 |             printf("
2814 | <dt%s>%s <span class='msg_loc'>Line%s: %s</span> %s</dt>
2815 | <dd class='responsecode'><strong>Status</strong>: %s %s %s</dd>
2816 | <dd class='message_explanation'><p>%s %s</p></dd>\n",
2817 | 
2818 |                 # Anchor for return codes
2819 |                 $idref,
2820 | 
2821 |                 # Color
2822 |                 &status_icon($c),
2823 |                 $s,
2824 | 
2825 |                 # List of lines
2826 |                 $lines_list,
2827 | 
2828 |                 # List of redirects
2829 |                 $redirected ?
2830 |                     join(' redirected to ', @redirects_urls) . $redirmsg :
2831 |                     &show_url($u),
2832 | 
2833 |                 # Realm
2834 |                 defined($currloc->{realm}) ?
2835 |                     sprintf('Realm: %s<br>', &encode($currloc->{realm})) :
2836 |                     '',
2837 | 
2838 |                 # HTTP original message
2839 |                 # defined($currloc->{orig_message})
2840 |                 # ? &encode($currloc->{orig_message}).
2841 |                 # ' <span title="redirected to">-&gt;</span> '
2842 |                 # : '',
2843 | 
2844 |                 # Response code chain
2845 |                 join(
2846 |                     ' <span class="redirected_to" title="redirected to">-&gt;</span> ',
2847 |                     map { &encode($_) } @http_codes),
2848 | 
2849 |                 # HTTP final message
2850 |                 $http_message,
2851 | 
2852 |                 # What to do
2853 |                 $whattodo || "?",
2854 | 
2855 |                 # Redirect too?
2856 |                 $redirect_too ?
2857 |                     sprintf(' <span %s>%s</span>',
2858 |                     &bgcolor(301), $redirect_too) :
2859 |                     '',
2860 |             );
2861 |             if ($#fragments >= 0) {
2862 |                 printf("<dd>Broken fragments: <ul>\n");
2863 |             }
2864 |         }
2865 |         else {
2866 |             my $redirmsg = $redirect_loop ? ' redirect loop detected' : '';
2867 |             printf(
2868 |                 "\n%s\t%s\n  Code: %s %s\n%s\n",
2869 | 
2870 |                 # List of redirects
2871 |                 $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u,
2872 | 
2873 |                 # List of lines
2874 |                 $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '',
2875 | 
2876 |                 # Response code chain
2877 |                 join(' -> ', @http_codes),
2878 | 
2879 |                 # HTTP message
2880 |                 $currloc->{message} || '',
2881 | 
2882 |                 # What to do
2883 |                 wrap(' To do: ', '        ', $whattodo)
2884 |             );
2885 |             if ($#fragments >= 0) {
2886 |                 if ($currloc->{code} == 200) {
2887 |                     print("The following fragments need to be fixed:\n");
2888 |                 }
2889 |                 else {
2890 |                     print("Fragments:\n");
2891 |                 }
2892 |             }
2893 |         }
2894 | 
2895 |         # Fragments
2896 |         for my $f (@fragments) {
2897 |             my @unique_lines =
2898 |                 &sort_unique(keys %{$links->{$u}{fragments}{$f}});
2899 |             my $plural = (scalar(@unique_lines) > 1) ? 's' : '';
2900 |             my $unique_lines = join(', ', @unique_lines);
2901 |             if ($Opts{HTML}) {
2902 |                 printf("<li>%s<em>#%s</em> (line%s %s)</li>\n",
2903 |                     &encode($u), &encode($f), $plural, $unique_lines);
2904 |             }
2905 |             else {
2906 |                 printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines);
2907 |             }
2908 |         }
2909 | 
2910 |         print("</ul></dd>\n") if ($Opts{HTML} && scalar(@fragments));
2911 |     }
2912 | 
2913 |     # End of the table
2914 |     print("</dl>\n") if $Opts{HTML};
2915 | 
2916 |     return;
2917 | }
2918 | 
2919 | sub show_mixedcontent_report (\@\%)
2920 | {
2921 |     my ($mc_blocked_urls, $blocked) = @_;
2922 | 
2923 |     print("\n<dl class=\"report\">") if $Opts{HTML};
2924 |     print("\n") if (!$Opts{Quiet});
2925 |     print("\n<dl class=\"report\">") if $Opts{HTML};
2926 |     for my $url (@$mc_blocked_urls) {
2927 |         if ($Opts{HTML}) {
2928 |             printf("<dt>%s</dt>\n", &show_url($url)) ;
2929 |             printf("<dd>Blocked in mixed content context on lines: <span class='msg_loc'>%s</span></dd>\n", join("</span>, <span class='msg_loc'>", @{%$blocked{$url}}));
2930 |         } else {
2931 |             printf("\n%s\n", $url) ;
2932 |             print "Blocked in mixed content context on lines: ";
2933 |             print join(", ", @{%$blocked{$url}});
2934 |             print "\n\n";
2935 | 
2936 |         }
2937 |     }
2938 | }
2939 | 
2940 | sub code_shown ($$)
2941 | {
2942 |     my ($u, $results) = @_;
2943 | 
2944 |     if ($results->{$u}{location}{record} == 200) {
2945 |         return $results->{$u}{location}{orig} ||
2946 |             $results->{$u}{location}{record};
2947 |     }
2948 |     else {
2949 |         return $results->{$u}{location}{record};
2950 |     }
2951 | }
2952 | 
2953 | sub links_summary (\%\%\%\%\%)
2954 | {
2955 | 
2956 |     # Advices to fix the problems
2957 | 
2958 |     my %todo = (
2959 |         200 =>
2960 |             'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).',
2961 |         300 =>
2962 |             'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.',
2963 |         301 =>
2964 |             'This is a permanent redirect. The link should be updated to point to the more recent URI.',
2965 |         302 =>
2966 |             'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
2967 |         303 =>
2968 |             'This rare status code points to a "See Other" resource. There is generally nothing to be done.',
2969 |         307 =>
2970 |             'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
2971 |         308 =>
2972 |             'This is a permanent redirect. The link should be updated to point to the more recent URI.',
2973 |         400 =>
2974 |             'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.',
2975 |         401 =>
2976 |             "The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.",
2977 |         403 =>
2978 |             'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
2979 |         404 =>
2980 |             'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.',
2981 |         405 =>
2982 |             'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically.  Check the link manually.',
2983 |         406 =>
2984 |             "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.",
2985 |         407 => 'The link is a proxy, but requires Authentication.',
2986 |         408 => 'The request timed out.',
2987 |         410 => 'The resource is gone. You should remove this link.',
2988 |         415 => 'The media type is not supported.',
2989 |         500 => 'This is a server side problem. Check the URI.',
2990 |         501 =>
2991 |             'Could not check this link: method not implemented or scheme not supported.',
2992 |         503 =>
2993 |             'The server cannot service the request, for some unknown reason.',
2994 | 
2995 |         # Non-HTTP codes:
2996 |         RC_ROBOTS_TXT() => sprintf(
2997 |             'The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.',
2998 |             $Opts{HTML} ? (
2999 |                 '<a href="http://www.robotstxt.org/robotstxt.html">', '</a>',
3000 |                 "<a href=\"$Cfg{Doc_URI}#bot\">",                     '</a>'
3001 |                 ) : ('') x 4
3002 |         ),
3003 |         RC_DNS_ERROR() =>
3004 |             'The hostname could not be resolved. Check the link for typos.',
3005 |         RC_IP_DISALLOWED() =>
3006 |             sprintf(
3007 |             'The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.',
3008 |             $Opts{HTML} ?
3009 |                 ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') :
3010 |                 ('') x 2),
3011 |         RC_PROTOCOL_DISALLOWED() =>
3012 |             'Accessing links with this URI scheme has been disabled in link checker.',
3013 |         RC_MIXEDCONTENT_BLOCK() =>
3014 |             'The link is blocked as mixed-content. The resource needs to be fetched over https.',
3015 |     );
3016 |     my %priority = (
3017 |         410 => 1,
3018 |         404 => 2,
3019 |         403 => 5,
3020 |         200 => 10,
3021 |         300 => 15,
3022 |         401 => 20
3023 |     );
3024 | 
3025 |     my ($links, $results, $broken, $redirects, $blocked) = @_;
3026 | 
3027 |     # List of the broken links
3028 |     my @urls              = keys %{$broken};
3029 |     my @mc_blocked_urls      = keys %{$blocked};
3030 |     my @dir_redirect_urls = ();
3031 |     if ($Opts{Redirects}) {
3032 | 
3033 |         # Add the redirected URI's to the report
3034 |         for my $l (keys %$redirects) {
3035 |             next
3036 |                 unless (defined($results->{$l}) &&
3037 |                 defined($links->{$l}) &&
3038 |                 !defined($broken->{$l}));
3039 | 
3040 |             # Check whether we have a "directory redirect"
3041 |             # e.g. https://www.w3.org/TR -> https://www.w3.org/TR/
3042 |             my ($redirect_loop, @redirects) = get_redirects($l, %$redirects);
3043 |             if ($#redirects == 1) {
3044 |                 push(@dir_redirect_urls, $l);
3045 |                 next;
3046 |             }
3047 |             push(@urls, $l);
3048 |         }
3049 |     }
3050 | 
3051 |     @urls = grep { defined($results->{$_}{location}{record}) } @urls;
3052 | 
3053 |     # Broken links and redirects
3054 |     if ($#urls < 0) {
3055 |         if (!$Opts{Quiet}) {
3056 |             print_doc_header();
3057 |             if ($Opts{HTML}) {
3058 |                 print "<h3>Links</h3>\n<p>Valid links!</p>\n";
3059 |             }
3060 |             else {
3061 |                 print "\nValid links.\n";
3062 |             }
3063 |         }
3064 |     }
3065 |     else {
3066 |         print_doc_header();
3067 |         print('<h3>') if $Opts{HTML};
3068 |         print("\nList of broken links and other issues");
3069 | 
3070 |         #print(' and redirects') if $Opts{Redirects};
3071 | 
3072 |         # Sort the URI's by HTTP Code
3073 |         my %code_summary;
3074 |         my @idx;
3075 |         for my $u (@urls) {
3076 |             if (defined($results->{$u}{location}{record})) {
3077 |                 my $c = &code_shown($u, $results);
3078 |                 $code_summary{$c}++;
3079 |                 push(@idx, $c);
3080 |             }
3081 |         }
3082 |         my @sorted = @urls[
3083 |             sort {
3084 |                 defined($priority{$idx[$a]}) ?
3085 |                     defined($priority{$idx[$b]}) ?
3086 |                     $priority{$idx[$a]} <=> $priority{$idx[$b]} :
3087 |                         -1 :
3088 |                     defined($priority{$idx[$b]}) ? 1 :
3089 |                     $idx[$a] <=> $idx[$b]
3090 |             } 0 .. $#idx
3091 |         ];
3092 |         @urls = @sorted;
3093 |         undef(@sorted);
3094 |         undef(@idx);
3095 | 
3096 |         if ($Opts{HTML}) {
3097 | 
3098 |             # Print a summary
3099 |             print <<'EOF';
3100 | </h3>
3101 | <p><em>There are issues with the URLs listed below. The table summarizes the
3102 | issues and suggested actions by HTTP response status code.</em></p>
3103 | <table class="report" border="1" summary="List of issues and suggested actions.">
3104 | <thead>
3105 | <tr>
3106 | <th>Code</th>
3107 | <th>Occurrences</th>
3108 | <th>What to do</th>
3109 | </tr>
3110 | </thead>
3111 | <tbody>
3112 | EOF
3113 |             for my $code (sort(keys(%code_summary))) {
3114 |                 printf('<tr%s>', &bgcolor($code));
3115 |                 printf('<td><a href="#d%scode_%s">%s</a></td>',
3116 |                     $doc_count, $code, http_rc($code));
3117 |                 printf('<td>%s</td>', $code_summary{$code});
3118 |                 printf('<td>%s</td>', $todo{$code} || "?");
3119 |                 print "</tr>\n";
3120 |             }
3121 |             print "</tbody>\n</table>\n";
3122 |         }
3123 |         else {
3124 |             print(':');
3125 |         }
3126 |         &show_link_report($links, $results, $broken, $redirects, \@urls, 1,
3127 |             \%todo);
3128 |     }
3129 | 
3130 |     # Show directory redirects
3131 |     if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) {
3132 |         print_doc_header();
3133 |         print('<h3>') if $Opts{HTML};
3134 |         print("\nList of redirects");
3135 |         print(
3136 |             "</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>"
3137 |         ) if $Opts{HTML};
3138 |         &show_link_report($links, $results, $broken, $redirects,
3139 |             \@dir_redirect_urls);
3140 |     }
3141 | 
3142 |     # Show mixedcontent blocked URLs
3143 |     if ($#mc_blocked_urls > -1) {
3144 |         print_doc_header();
3145 |         print('<h3>') if $Opts{HTML};
3146 |         print("\nList of URLs blocked because they're mixed content");
3147 |         print(
3148 |             "</h3>\n<p>The URLs below are not necessarily broken, but they will be not loaded correctly since they’re  <code>http:</code> URLs loaded in an <code>https:</code> page and will be blocked as mixed content.</p>"
3149 |         ) if $Opts{HTML};
3150 |         &show_mixedcontent_report(\@mc_blocked_urls, \%$blocked);
3151 |     }
3152 |     return;
3153 | }
3154 | 
3155 | ###############################################################################
3156 | 
3157 | ################
3158 | # Global stats #
3159 | ################
3160 | 
3161 | sub global_stats ()
3162 | {
3163 |     my $stop = &get_timestamp();
3164 |     my $n_docs =
3165 |         ($doc_count <= $Opts{Max_Documents}) ? $doc_count :
3166 |                                                $Opts{Max_Documents};
3167 |     return sprintf(
3168 |         'Checked %d document%s in %s seconds.',
3169 |         $n_docs,
3170 |         ($n_docs == 1) ? '' : 's',
3171 |         &time_diff($timestamp, $stop)
3172 |     );
3173 | }
3174 | 
3175 | ##################
3176 | # HTML interface #
3177 | ##################
3178 | 
3179 | sub html_header ($$)
3180 | {
3181 |     my ($uri, $cookie) = @_;
3182 | 
3183 |     my $title = defined($uri) ? $uri : '';
3184 |     $title = ': ' . $title if ($title =~ /\S/);
3185 | 
3186 |     my $headers = '';
3187 |     if (!$Opts{Command_Line}) {
3188 |         $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $uri;
3189 |         $headers .= "Content-Type: text/html; charset=utf-8\n";
3190 |         $headers .= "Set-Cookie: $cookie\n"                       if $cookie;
3191 | 
3192 |         # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same
3193 |         # print() statement as the last header
3194 |         $headers .= "Content-Language: en\n\n";
3195 |     }
3196 | 
3197 |     my $onload = $uri ? '' :
3198 |           ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"';
3199 | 
3200 |     print $headers, $DocType, "
3201 | <html lang=\"en\">
3202 | <head>
3203 | <title>W3C Link Checker", &encode($title), "</title>
3204 | ",      $Head,   "</head>
3205 | <body", $onload, '>';
3206 |     &banner($title);
3207 |     return;
3208 | }
3209 | 
3210 | sub banner ($)
3211 | {
3212 |     my $tagline = "Check links and anchors in Web pages or full Web sites";
3213 | 
3214 |     printf(
3215 |         <<'EOF', $Cfg{Doc_Images_URI} . 'w3c.png', $tagline);
3216 | <header id="banner"><h1 id="title"><a href="https://www.w3.org/" title="W3C"><img alt="W3C" id="logo" src="%s" width="110" height="61"></a>
3217 | <a href="checklink"><span>Link Checker</span></a></h1>
3218 | <p id="tagline">%s</p>
3219 | </header>
3220 | <main id="main">
3221 | EOF
3222 |     return;
3223 | }
3224 | 
3225 | sub status_icon($)
3226 | {
3227 |     my ($code) = @_;
3228 |     my $icon_type;
3229 |     my $r = HTTP::Response->new($code);
3230 |     if ($r->is_success()) {
3231 |         $icon_type = 'error'
3232 |             ; # if is success but reported, it's because of broken frags => error
3233 |     }
3234 |     elsif (&informational($code)) {
3235 |         $icon_type = 'info';
3236 |     }
3237 |     elsif ($code == 300) {
3238 |         $icon_type = 'info';
3239 |     }
3240 |     elsif ($code == 401) {
3241 |         $icon_type = 'error';
3242 |     }
3243 |     elsif ($r->is_redirect()) {
3244 |         $icon_type = 'warning';
3245 |     }
3246 |     elsif ($r->is_error()) {
3247 |         $icon_type = 'error';
3248 |     }
3249 |     else {
3250 |         $icon_type = 'error';
3251 |     }
3252 |     return sprintf('<span class="err_type"><img src="%s" alt="%s"></span>',
3253 |         $Cfg{Doc_Images_URI} . 'info_icons/' . $icon_type . '.png',
3254 |         $icon_type);
3255 | }
3256 | 
3257 | sub bgcolor ($)
3258 | {
3259 |     my ($code) = @_;
3260 |     my $class;
3261 |     my $r = HTTP::Response->new($code);
3262 |     if ($r->is_success()) {
3263 |         return '';
3264 |     }
3265 |     elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) {
3266 |         $class = 'dubious';
3267 |     }
3268 |     elsif ($code == 300) {
3269 |         $class = 'multiple';
3270 |     }
3271 |     elsif ($code == 401) {
3272 |         $class = 'unauthorized';
3273 |     }
3274 |     elsif ($r->is_redirect()) {
3275 |         $class = 'redirect';
3276 |     }
3277 |     elsif ($r->is_error()) {
3278 |         $class = 'broken';
3279 |     }
3280 |     else {
3281 |         $class = 'broken';
3282 |     }
3283 |     return (' class="' . $class . '"');
3284 | }
3285 | 
3286 | sub show_url ($)
3287 | {
3288 |     my ($url) = @_;
3289 |     return sprintf('<a href="%s">%s</a>', (&encode($url)) x 2);
3290 | }
3291 | 
3292 | sub html_footer ()
3293 | {
3294 |     printf("<p>%s</p>\n", &global_stats())
3295 |         if ($doc_count > 0 && !$Opts{Quiet});
3296 |     if (!$doc_count) {
3297 |         print <<'EOF';
3298 | <div class="intro">
3299 |   <p>
3300 |     This Link Checker looks for issues in links, anchors and referenced objects
3301 |     in a Web page, CSS style sheet, or recursively on a whole Web site. For
3302 |     best results, it is recommended to first ensure that the documents checked
3303 |     use Valid <a href="https://validator.w3.org/">(X)HTML Markup</a> and
3304 |     <a href="https://jigsaw.w3.org/css-validator/">CSS</a>. The Link Checker is
3305 |     part of the W3C's <a href="https://www.w3.org/QA/Tools/">validators and
3306 |     Quality Web tools</a>.
3307 |   </p>
3308 | </div>
3309 | EOF
3310 |     }
3311 |     printf(<<'EOF', $Cfg{Doc_URI}, $PACKAGE, $REVISION);
3312 | </main><!-- main -->
3313 | <footer>
3314 | <nav>
3315 | <ul class="navbar" id="menu">
3316 |   <li><a href="%s" accesskey="3" title="Documentation for this Link Checker Service">Docs</a></li>
3317 |   <li><a href="http://search.cpan.org/dist/W3C-LinkChecker/" accesskey="2" title="Download the source / Install this service">Download</a></li>
3318 |   <li><a href="https://github.com/w3c/link-checker" title="feedback: comments, suggestions and bugs" accesskey="4">Feedback</a></li>
3319 |   <li><a href="https://validator.w3.org/" title="Validate your markup with the W3C Markup Validation Service">Validator</a></li>
3320 | </ul>
3321 | </nav>
3322 | <address>
3323 | %s<br> %s
3324 | </address>
3325 | </footer>
3326 | </body>
3327 | </html>
3328 | EOF
3329 |     return;
3330 | }
3331 | 
3332 | sub print_form (\%$$)
3333 | {
3334 |     my ($params, $cookie, $check_num) = @_;
3335 | 
3336 |     # Split params on \0, see CGI's docs on Vars()
3337 |     while (my ($key, $value) = each(%$params)) {
3338 |         if ($value) {
3339 |             my @vals = split(/\0/, $value, 2);
3340 |             $params->{$key} = $vals[0];
3341 |         }
3342 |     }
3343 | 
3344 |     # Override undefined values from the cookie, if we got one.
3345 |     my $valid_cookie = 0;
3346 |     if ($cookie) {
3347 |         my %cookie_values = $cookie->value();
3348 |         if (!$cookie_values{clear})
3349 |         {    # XXX no easy way to check if cookie expired?
3350 |             $valid_cookie = 1;
3351 |             while (my ($key, $value) = each(%cookie_values)) {
3352 |                 $params->{$key} = $value unless defined($params->{$key});
3353 |             }
3354 |         }
3355 |     }
3356 | 
3357 |     my $chk = ' checked="checked"';
3358 |     $params->{hide_type} = 'all' unless $params->{hide_type};
3359 | 
3360 |     my $requested_uri = &encode($params->{uri} || '');
3361 |     my $sum = $params->{summary}        ? $chk : '';
3362 |     my $red = $params->{hide_redirects} ? $chk : '';
3363 |     my $all = ($params->{hide_type} ne 'dir') ? $chk : '';
3364 |     my $dir = $all                            ? ''   : $chk;
3365 |     my $acc = $params->{no_accept_language}   ? $chk : '';
3366 |     my $ref = $params->{no_referer}           ? $chk : '';
3367 |     my $rec = $params->{recursive}            ? $chk : '';
3368 |     my $dep = &encode($params->{depth} || '');
3369 | 
3370 |     my $cookie_options = '';
3371 |     if ($valid_cookie) {
3372 |         $cookie_options = "
3373 |     <label for=\"cookie1_$check_num\"><input type=\"radio\" id=\"cookie1_$check_num\" name=\"cookie\" value=\"nochanges\" checked=\"checked\"> Don't modify saved options</label>
3374 |     <label for=\"cookie2_$check_num\"><input type=\"radio\" id=\"cookie2_$check_num\" name=\"cookie\" value=\"set\"> Save these options</label>
3375 |     <label for=\"cookie3_$check_num\"><input type=\"radio\" id=\"cookie3_$check_num\" name=\"cookie\" value=\"clear\"> Clear saved options</label>";
3376 |     }
3377 |     else {
3378 |         $cookie_options = "
3379 |     <label for=\"cookie_$check_num\"><input type=\"checkbox\" id=\"cookie_$check_num\" name=\"cookie\" value=\"set\"> Save options in a <a href=\"https://tools.ietf.org/html/rfc6265\">cookie</a></label>";
3380 |     }
3381 | 
3382 |     print "<form action=\"", $Opts{_Self_URI},
3383 |         "\" method=\"get\" onsubmit=\"return uriOk($check_num)\" accept-charset=\"UTF-8\">
3384 | <p><label for=\"uri_$check_num\">Enter the address (<a href=\"https://www.w3.org/Addressing/\">URL</a>)
3385 | of a document that you would like to check:</label></p>
3386 | <p><input type=\"text\" size=\"50\" id=\"uri_$check_num\" name=\"uri\" value=\"",
3387 |         $requested_uri, "\"></p>
3388 | <fieldset id=\"extra_opt_uri_$check_num\" class=\"moreoptions\">
3389 | 	<legend class=\"toggletext\">More Options</legend>
3390 | 	<div class=\"options\">
3391 |   <p>
3392 |     <label for=\"summary_$check_num\"><input type=\"checkbox\" id=\"summary_$check_num\" name=\"summary\" value=\"on\"",
3393 |         $sum, "> Summary only</label>
3394 |     <br>
3395 |     <label for=\"hide_redirects_$check_num\"><input type=\"checkbox\" id=\"hide_redirects_$check_num\" name=\"hide_redirects\" value=\"on\"",
3396 |         $red,
3397 |         "> Hide <a href=\"https://tools.ietf.org/html/rfc7231#section-6.4\">redirects</a>:</label>
3398 |     <label for=\"hide_type_all_$check_num\"><input type=\"radio\" id=\"hide_type_all_$check_num\" name=\"hide_type\" value=\"all\"",
3399 |         $all, "> all</label>
3400 |     <label for=\"hide_type_dir_$check_num\"><input type=\"radio\" id=\"hide_type_dir_$check_num\" name=\"hide_type\" value=\"dir\"",
3401 |         $dir, "> for directories only</label>
3402 |     <br>
3403 |     <label for=\"no_accept_language_$check_num\"><input type=\"checkbox\" id=\"no_accept_language_$check_num\" name=\"no_accept_language\" value=\"on\"",
3404 |         $acc,
3405 |         "> Don't send the <code><a href=\"https://www.rfc-editor.org/rfc/rfc9110#field.accept-language\">Accept-Language</a></code> header</label>
3406 |     <br>
3407 |     <label for=\"no_referer_$check_num\"><input type=\"checkbox\" id=\"no_referer_$check_num\" name=\"no_referer\" value=\"on\"",
3408 |         $ref,
3409 |         "> Don't send the <code><a href=\"https://www.rfc-editor.org/rfc/rfc9110#field.referer\">Referer</a></code> header</label>
3410 |     <br>
3411 |     <label title=\"Check linked documents recursively (maximum: ",
3412 |         $Opts{Max_Documents},
3413 |         " documents)\" for=\"recursive_$check_num\"><input type=\"checkbox\" id=\"recursive_$check_num\" name=\"recursive\" value=\"on\"",
3414 |         $rec, "> Check linked documents recursively</label>,
3415 |     <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth_$check_num\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth_$check_num\" name=\"depth\" value=\"",
3416 |         $dep, "\"></label>
3417 |     <br><br>", $cookie_options, "
3418 |   </p>
3419 |   </div>
3420 | </fieldset>
3421 | <p class=\"submit_button\"><input type=\"submit\" name=\"check\" value=\"Check\"></p>
3422 | </form>
3423 | <div id=\"w3c-include\"><script type=\"text/javascript\" src=\"//www.w3.org/QA/Tools/w3c-include.js\"></script></div>
3424 | ";
3425 |     return;
3426 | }
3427 | 
3428 | sub encode (@)
3429 | {
3430 |     return $Opts{HTML} ? HTML::Entities::encode(@_) : @_;
3431 | }
3432 | 
3433 | sub hprintf (@)
3434 | {
3435 |     print_doc_header();
3436 |     if (!$Opts{HTML}) {
3437 |         printf(@_);
3438 |     }
3439 |     else {
3440 |         print HTML::Entities::encode(sprintf($_[0], @_[1 .. @_ - 1]));
3441 |     }
3442 |     return;
3443 | }
3444 | 
3445 | # Print the document header, if it hasn't been printed already.
3446 | # This is invoked before most other output operations, in order
3447 | # to enable quiet processing that doesn't clutter the output with
3448 | # "Processing..." messages when nothing else will be reported.
3449 | sub print_doc_header ()
3450 | {
3451 |     if (defined($doc_header)) {
3452 |         print $doc_header;
3453 |         undef($doc_header);
3454 |     }
3455 | }
3456 | 
3457 | # Local Variables:
3458 | # mode: perl
3459 | # indent-tabs-mode: nil
3460 | # cperl-indent-level: 4
3461 | # cperl-continued-statement-offset: 4
3462 | # cperl-brace-offset: -4
3463 | # perl-indent-level: 4
3464 | # End:
3465 | # ex: ts=4 sw=4 et
3466 | 


--------------------------------------------------------------------------------
/bin/checklink.pod:
--------------------------------------------------------------------------------
  1 | =encoding utf8
  2 | 
  3 | =head1 NAME
  4 | 
  5 | checklink - check the validity of links in an HTML or XHTML document
  6 | 
  7 | =head1 SYNOPSIS
  8 | 
  9 | B<checklink>  [ I<options> ] I<uri> ...
 10 | 
 11 | =head1 DESCRIPTION
 12 | 
 13 | This manual page documents briefly the B<checklink> command, a.k.a. the
 14 | W3C® Link Checker.
 15 | 
 16 | B<checklink> is a program that reads an HTML or XHTML document,
 17 | extracts a list of anchors and links and checks that no anchor is
 18 | defined twice and that all the links are dereferenceable, including
 19 | the fragments. It warns about HTTP redirects, including directory
 20 | redirects, and can check recursively a part of a web site.
 21 | 
 22 | The program can be used either as a command line tool or as a CGI script.
 23 | 
 24 | =head1 OPTIONS
 25 | 
 26 | This program follow the usual GNU command line syntax, with long options
 27 | starting with two dashes (`-'). A summary of options is included below.
 28 | 
 29 | =over 5
 30 | 
 31 | =item B<-?, -h, --help>
 32 | 
 33 | Show summary of options.
 34 | 
 35 | =item B<-V, --version>
 36 | 
 37 | Output version information.
 38 | 
 39 | =item B<-s, --summary>
 40 | 
 41 | Show result summary only.
 42 | 
 43 | =item B<-b, --broken>
 44 | 
 45 | Show only the broken links, not the redirects.
 46 | 
 47 | =item B<-e, --dir-redirects>
 48 | 
 49 | Hide directory redirects - e.g. L<http://www.w3.org/TR> ->
 50 | L<http://www.w3.org/TR/>.
 51 | 
 52 | =item B<-r, --recursive>
 53 | 
 54 | Check the documents linked from the first one.
 55 | 
 56 | =item B<-D, --depth> I<n>
 57 | 
 58 | Check the documents linked from the first one to depth I<n>
 59 | (implies B<--recursive>).
 60 | 
 61 | =item B<-l, --location> I<uri>
 62 | 
 63 | Scope of the documents checked (implies B<--recursive>).
 64 | Can be specified multiple times in order to specify multiple recursion
 65 | bases.  If the URI of a candidate document is downwards relative to any of
 66 | the bases, it is considered to be within the scope.  If not specified, the
 67 | default is the base URI of the initial document, for example for
 68 | L<http://www.w3.org/TR/html4/Overview.html> it would be
 69 | L<http://www.w3.org/TR/html4/>.
 70 | 
 71 | =item B<-X, --exclude> I<regexp>
 72 | 
 73 | Do not check links whose full, canonical URIs match I<regexp>.  Note that
 74 | this option limits recursion the same way as B<--exclude-docs> with the same
 75 | regular expression would.
 76 | 
 77 | =item B<--exclude-docs> I<regexp>
 78 | 
 79 | In recursive mode, do not check links in documents whose full, canonical
 80 | URIs match I<regexp>.  This option may be specified multiple times.
 81 | 
 82 | =item B<--suppress-redirect> I<URI-E<gt>URI>
 83 | 
 84 | Do not report a redirect from the first to the second URI.  The "-E<gt>" is
 85 | literal text.  This option may be specified multiple times.  Whitespace may
 86 | be used instead of "-E<gt>" to separate the URIs.
 87 | 
 88 | =item B<--suppress-redirect-prefix> I<URI-E<gt>URI>
 89 | 
 90 | Do not report a redirect from a child of the first URI to the same child of
 91 | the second URI.  The \"->\" is literal text.  This option may be specified
 92 | multiple times.  Whitespace may be used instead of "-E<gt>" to separate the
 93 | URIs.
 94 | 
 95 | =item B<--suppress-temp-redirects>
 96 | 
 97 | Do not report warnings about temporary redirects.
 98 | 
 99 | =item B<--suppress-broken> I<CODE:URI>
100 | 
101 | Do not report a broken link with the given CODE.  CODE is the HTTP
102 | response, or -1 for robots exclusion.  The ":" is literal text.  This
103 | option may be specified multiple times.  Whitespace may be used instead of
104 | ":" to separate the CODE and the URI.
105 | 
106 | =item B<--suppress-fragment> I<URI>
107 | 
108 | Do not report the given broken fragment URI.  A fragment URI contains "#".
109 | This option may be specified multiple times.
110 | 
111 | =item B<-L, --languages> I<accept-language>
112 | 
113 | The C<Accept-Language> HTTP header to send.  In command line mode,
114 | this header is not sent by default.  The special value C<auto> causes
115 | a value to be detected from the C<LANG> environment variable, and sent
116 | if found.  In CGI mode, the default is to send the value received from
117 | the client as is.
118 | 
119 | =item B<-c, --cookies> I<cookie-file>
120 | 
121 | Use cookies, load/save them in I<cookie-file>.  The special value
122 | C<tmp> causes non-persistent use of cookies, i.e. they are used but
123 | only stored in memory for the duration of this link checker run.
124 | 
125 | =item B<-R, --no-referer>
126 | 
127 | Do not send the C<Referer> HTTP header.
128 | 
129 | =item B<-q, --quiet>
130 | 
131 | No output if no errors are found.  Implies B<--summary>.
132 | 
133 | =item B<-v, --verbose>
134 | 
135 | Verbose mode.
136 | 
137 | =item B<-i, --indicator>
138 | 
139 | Show progress while parsing as percentage of lines processed.  No
140 | indicator is shown for documents containing no linefeeds.
141 | 
142 | =item B<-u, --user> I<username>
143 | 
144 | Specify a username for authentication.
145 | 
146 | =item B<-p, --password> I<password>
147 | 
148 | Specify a password for authentication.
149 | 
150 | =item B<--hide-same-realm>
151 | 
152 | Hide 401's that are in the same realm as the document checked.
153 | 
154 | =item B<-S, --sleep> I<secs>
155 | 
156 | Sleep the specified number of seconds between requests to each server.
157 | Defaults to 1 second, which is also the minimum allowed.
158 | 
159 | =item B<-t, --timeout> I<secs>
160 | 
161 | Timeout for requests, in seconds.  The default is 30.
162 | 
163 | =item B<-C, --connection-cache> I<number>
164 | 
165 | Maximum number of cached connections.  Using this option overrides the
166 | C<Connection_Cache_Size> configuration file parameter, see its
167 | documentation below for the default value and more information.
168 | 
169 | =item B<-d, --domain> I<domain>
170 | 
171 | Perl regular expression describing the domain to which the authentication
172 | information (if present) will be sent.  The default value can be specified
173 | in the configuration file.  See the C<Trusted> entry in the configuration
174 | file description below for more information.
175 | 
176 | =item B<--masquerade> I<"real-prefix surrogate-prefix">
177 | 
178 | Perform a simple string substitution: URIs which begin with the
179 | string C<real-prefix> are rewritten using the C<surrogate-prefix>
180 | before being dereferenced.  Useful for making a local
181 | directory masquerade as a remote one. For example:
182 | 
183 |   --masquerade "http://example.com/x/y/z/ file:///my/local/dir/"
184 | 
185 | If the document being checked contains a link to
186 | http://example.com/x/y/z/foo.html, then the local file system will be
187 | checked for file:///my/local/dir/foo.html.
188 | 
189 | B<--masquerade> takes a single argument consisting of two URIs,
190 | separated by whitespace.  The quote marks are not part of the
191 | argument, but one usual way of providing a value with embedded
192 | whitespace is to enclose it in quotes.
193 | 
194 | =item B<-H, --html>
195 | 
196 | HTML output.
197 | 
198 | =back
199 | 
200 | =head1 FILES
201 | 
202 | =over 5
203 | 
204 | =item F</etc/w3c/checklink.conf>
205 | 
206 | The main configuration file.  You can use the L<W3C_CHECKLINK_CFG> environment
207 | variable to override the default location.
208 | 
209 | C<Trusted> specifies a regular expression for matching trusted domains
210 | (ie. domains where HTTP basic authentication, if any, will be sent).
211 | The regular expression will be matched case insensitively against host
212 | names.  The default behavior (when unset, that is) is to send the
213 | authentication information only to the host which requests it; usually
214 | you don't want to change this.  For example, the following configures
215 | I<only> the w3.org domain as trusted:
216 | 
217 |     Trusted = \.w3\.org$
218 | 
219 | C<Allow_Private_IPs> is a boolean flag indicating whether checking links
220 | on non-public IP addresses is allowed.  The default is true in command line
221 | mode and false when run as a CGI script.  For example, to disallow checking
222 | non-public IP addresses, regardless of the mode, use:
223 | 
224 |    Allow_Private_IPs = 0
225 | 
226 | C<Forbidden_Protocols> is a comma separated list of additional protocols/URI
227 | schemes that the link checker is not allowed to use.  The C<javascript> and
228 | C<mailto> schemes are always forbidden, and so is the C<file> scheme when
229 | running as a CGI script.
230 | 
231 |    Forbidden_Protocols = javascript,mailto
232 | 
233 | C<Markup_Validator_URI> and C<CSS_Validator_URI> are formatted URIs to the
234 | respective validators.  The C<%s> in these will be replaced with the full
235 | "URI encoded" URI to the document being checked, and shown in the link
236 | checker results view in the online/CGI version.  The defaults are:
237 | 
238 |    Markup_Validator_URI =
239 |      http://validator.w3.org/check?uri=%s
240 |    CSS_Validator_URI =
241 |      http://jigsaw.w3.org/css-validator/validator?uri=%s
242 | 
243 | C<Doc_URI> is a URI used for linking to the documentation, and CSS and
244 | JavaScript files in the dynamically generated content of the link checker.
245 | The default is:
246 | 
247 |    Doc_URI = http://validator.w3.org/docs/checklink.html
248 | 
249 | C<Connection_Cache_Size> is an integer denoting the maximum number of
250 | connections the link checker will keep open at any given time.  The
251 | default is:
252 | 
253 |    Connection_Cache_Size = 2
254 | 
255 | =back
256 | 
257 | =head1 ENVIRONMENT
258 | 
259 | checklink uses the libwww-perl library which has a number of environment
260 | variables affecting its behaviour.  See L</"SEE ALSO"> for some
261 | pointers.
262 | 
263 | =over 5
264 | 
265 | =item B<W3C_CHECKLINK_CFG>
266 | 
267 | If set, overrides the path to the configuration file.
268 | 
269 | =back
270 | 
271 | =head1 SEE ALSO
272 | 
273 | The documentation for this program is available on the web at
274 | L<http://validator.w3.org/docs/checklink.html>.
275 | 
276 | L<LWP>, L<Net::FTP>, L<Net::NNTP>, L<Net::IP>, L<perlre>.
277 | 
278 | =head1 AUTHOR
279 | 
280 | This program was originally written by Hugo Haas E<lt>hugo@w3.orgE<gt>, based
281 | on Renaud Bruyeron's F<checklink.pl>.  It has been enhanced by Ville Skyttä
282 | and many other volunteers since.  Use the E<lt>www-validator@w3.orgE<gt>
283 | mailing list for feedback, and see
284 | L<http://validator.w3.org/docs/checklink.html#csb> for more information.
285 | 
286 | This manual page was originally written by Frédéric Schütz
287 | E<lt>schutz@mathgen.chE<gt> for the Debian GNU/Linux system (but may
288 | be used by others).
289 | 
290 | =head1 COPYRIGHT
291 | 
292 | This program is licensed under the W3C® Software License,
293 | L<http://www.w3.org/Consortium/Legal/copyright-software>.
294 | 
295 | =cut
296 | 


--------------------------------------------------------------------------------
/docs/checklink.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 |   <head>
  4 |     <title>W3C Link Checker Documentation</title>
  5 |     <link rel="author" href="mailto:www-validator@w3.org" />
  6 |     <style media="all">@import "linkchecker.css";</style>
  7 |   </head>
  8 | 
  9 |   <body>
 10 | 	  <header id="banner">
 11 |       <h1 id="title"><a href="https://www.w3.org/" title="W3C"><img alt="W3C" id="logo" src="../images/no_w3c.png" width="110" height="61" /></a>
 12 | 	    <a href="../checklink"><span>Link Checker</span></a></h1>
 13 | 	    <p id="tagline">Check links and anchors in Web pages or full Web sites</p>
 14 |     </header>
 15 |     <main id="main">
 16 |       <nav>
 17 |         <ul>
 18 |           <li><a href="#about">About this service</a></li>
 19 |           <li><a href="#what">What it does</a></li>
 20 |           <li><a href="#online">Use it online</a></li>
 21 |           <li><a href="#install">Install it locally</a></li>
 22 |           <li><a href="#bot">Robots exclusion</a></li>
 23 |           <li><a href="#csb">Comments, suggestions and bugs</a></li>
 24 |         </ul>
 25 |       </nav>
 26 | 
 27 |       <section id="about">
 28 |         <h2>About this service</h2>
 29 |         <p>
 30 |           In order to check the validity of the technical reports that W3C
 31 |           publishes, the Systems Team has developed a link checker.
 32 |         </p>
 33 |         <p>
 34 |           A first version was developed in <time datetime="1998-08">August 1998</time> by
 35 |           <a href="https://www.w3.org/People/Renaud/">Renaud Bruyeron</a>.
 36 |           Since it was lacking some functionalities,
 37 |           <a href="https://www.w3.org/People/Hugo/">Hugo Haas</a>
 38 |           rewrote it more or less from scratch in <time datetime="1999-11">November 1999</time>.
 39 |           It has been improved by Ville Skyttä and many other volunteers since.
 40 |         </p>
 41 |         <p>
 42 |           The source code is available publicly under the
 43 |           <a href="https://www.w3.org/Consortium/Legal/copyright-software">W3C IPR
 44 |           software notice</a> from
 45 |           <a href="https://search.cpan.org/dist/W3C-LinkChecker/"><abbr
 46 |           title="Comprehensive Perl Archive Network">CPAN</abbr></a> (released
 47 |           versions) and a
 48 |           <a href="https://github.com/w3c/link-checker/">github repository</a>
 49 |           (development and archived release versions).
 50 |         </p>
 51 |       </section>
 52 | 
 53 |       <section id="what">
 54 |         <h2>What it does</h2>
 55 | 
 56 |         <p>
 57 |           The link checker reads an HTML or XHTML document or a CSS style sheet
 58 |           and extracts a list of anchors and links.
 59 |         </p>
 60 | 
 61 |         <p>
 62 |           It checks that no anchor is defined twice.
 63 |         </p>
 64 | 
 65 |         <p>
 66 |           It then checks that all the links are dereferenceable, including
 67 |           the fragments. It warns about HTTP redirects, including directory
 68 |           redirects.
 69 |         </p>
 70 | 
 71 |         <p>
 72 |           It can check recursively a part of a Web site.
 73 |         </p>
 74 | 
 75 |         <p>
 76 |           There is a command line version and a
 77 |           <abbr title="Common Gateway Interface">CGI</abbr> version. They both
 78 |           support <a href="https://www.ietf.org/rfc/rfc2617.txt">HTTP basic
 79 |           authentication</a>. This is achieved in the CGI version
 80 |           by passing through the authorization information from the user browser
 81 |           to the site tested.
 82 |         </p>
 83 |       </section>
 84 | 
 85 |       <section id="online">
 86 |         <h2>Use it online</h2>
 87 |         <p>
 88 |           There is an
 89 |           <a href="https://validator.w3.org/checklink">online version</a>
 90 |           of the link checker.
 91 |         </p>
 92 |         <p>
 93 |           In the online version (and in general, when run as a CGI script),
 94 |           the number of documents that can be checked recursively is limited.
 95 |         </p>
 96 |         <p id="wait">
 97 |           Both the command line version and the online one sleep at least one
 98 |           second between requests to each server to avoid abuses and target
 99 |           server congestion.
100 |         </p>
101 |         <h3>Access keys</h3>
102 |         <p>
103 |           The following access keys are implemented throughout the
104 |           site in an attempt to help users using screen readers.
105 |         </p>
106 |         <ol>
107 |           <li>Home: access key <kbd>1</kbd> leads back to the service's home page.</li>
108 |           <li>Downloads: access key <kbd>2</kbd> leads to downloads.</li>
109 |           <li>Documentation: access key <kbd>3</kbd> leads to the documentation index for
110 |           the service.</li>
111 |           <li>Feedback: access key <kbd>4</kbd> leads to the feedback instructions.</li>
112 |         </ol>
113 |       </section>
114 | 
115 |       <section id="install">
116 |         <h2>Install it locally</h2>
117 |         <p>
118 |           The link checker is written in Perl. It is packaged as a standard
119 |           <a href="https://www.cpan.org/">CPAN</a> distribution, and depends on
120 |           a few other modules which are also available from CPAN.
121 |         </p>
122 | 
123 |         <h3 id="install-CPAN">Install with the CPAN utility</h3>
124 |         <p>If you system has a working installation of Perl, you should be able to install the link checker and its dependencies with a single line from the commandline shell:</p>
125 |         <p><code>sudo perl -MCPAN -e 'install W3C::LinkChecker'</code> (use without the <code>sudo</code> command if installing from an administrator account).</p>
126 |         <p>If this is the first time you use the CPAN utility, you may have to answer a few setup questions before the tool downloads, builds and installs the link checker.</p>
127 | 
128 |         <h3 id="install-manual">Install by hand</h3>
129 |         <p>If for any reason the technique described above is not working or if you prefer installing each package by hand, follow the instructions below:</p>
130 |         <ol>
131 |           <li>
132 |           Install <a href="https://www.perl.org/get.html">Perl</a>, version 5.8
133 |           or newer.
134 |           </li>
135 |           <li>
136 |           You will need the following <a href="https://www.cpan.org/">CPAN</a>
137 |           distributions, as well as the distributions they possibly depend on.
138 |           Depending on your Perl version, you might already have some of
139 |           these installed.  Also, the latest versions of these may require a
140 |           recent version of Perl.	 As long as the minimum version requirement(s)
141 |           below are satisfied, everything should be fine.	 The latest version
142 |           should not be needed, just get an older version that works with your
143 |           Perl.  For an introduction to installing Perl modules,
144 |           see <a href="https://www.cpan.org/misc/cpan-faq.html#How_install_Perl_modules">The CPAN FAQ</a>.
145 |           <ul>
146 |             <li><a href="https://search.cpan.org/dist/W3C-LinkChecker/">W3C-LinkChecker</a> (the link checker itself)</li>
147 |             <li><a href="https://search.cpan.org/dist/CGI.pm/">CGI.pm</a> (required for CGI mode only)</li>
148 |             <li><a href="https://search.cpan.org/dist/Config-General/">Config-General</a> (optional, version 2.06 or newer; required only for reading the (optional) configuration file)</li>
149 |             <li><a href="https://search.cpan.org/dist/CSS-DOM/">CSS-DOM</a> (version 0.09 or newer)</li>
150 |             <li><a href="https://search.cpan.org/dist/Encode-Locale/">Encode-Locale</a> (required for command line mode only)</li>
151 |             <li><a href="https://search.cpan.org/dist/HTML-Parser/">HTML-Parser</a> (version 3.20 or newer)</li>
152 |             <li><a href="https://search.cpan.org/dist/libwww-perl/">libwww-perl</a> (version 5.833 or newer)</li>
153 |             <li><a href="https://search.cpan.org/dist/Net-IP/">Net-IP</a> (optional but recommended; required for restricting access to <a href="https://www.ietf.org/rfc/rfc1918.txt">private IP addresses</a>)</li>
154 |             <li><a href="https://search.cpan.org/dist/TermReadKey/">TermReadKey</a> (optional but recommended; required only in command line mode for password input)</li>
155 |             <li><a href="https://search.cpan.org/dist/Time-HiRes/">Time-HiRes</a></li>
156 |             <li><a href="https://search.cpan.org/dist/URI/">URI</a> (version 1.53 or newer)</li>
157 |           </ul>
158 |           </li>
159 |           <li>
160 |           Optionally install the link checker configuration file,
161 |           <code>etc/checklink.conf</code> contained in the link checker
162 |           distribution package into <code>/etc/w3c/checklink.conf</code>
163 |           or set the <code>W3C_CHECKLINK_CFG</code> environment variable to the
164 |           location where you installed it.
165 |           </li>
166 |           <li>
167 |           Optionally, install the <code>checklink</code> script into a location
168 |           in your web server which allows execution of CGI scripts (typically a
169 |           directory named <code>cgi-bin</code> somewhere below your web server's
170 |           root directory).
171 |           </li>
172 |           <li>
173 |           See also the <code>README</code> and <code>INSTALL</code> file(s)
174 |           included in the above distributions.
175 |           </li>
176 |         </ol>
177 |         <p>
178 |           Running <code>checklink --help</code> shows how to
179 |           use the command line version.  The distribution package also includes
180 |           more extensive <abbr title="Plain Old Documentation">POD</abbr>
181 |           documentation, use
182 |           <code><a href="https://search.cpan.org/dist/Pod-Perldoc/lib/perldoc.pod">perldoc</a> checklink</code> (or <kbd>man checklink</kbd> on Unixish systems)
183 |           to view it.
184 |         </p>
185 |         <p>
186 |           <abbr title="Secure Sockets Layer">SSL</abbr>/<abbr title="Transport Layer Security">TLS</abbr>v1
187 |           support for <code>https</code> in the link checker needs support for
188 |           it in libwww-perl; see
189 |           <a href="https://search.cpan.org/dist/libwww-perl/README.SSL">README.SSL</a>
190 |           in the libwww-perl distribution for more information.
191 |         </p>
192 |         <p>
193 |           In online mode, link checker's output should not be buffered to avoid
194 |           browser timeouts.  The link checker itself does not buffer its output,
195 |           but in some cases output buffering needs to be explicitly disabled for
196 |           it in the web server running it.  One such case is Apache's <code>mod_deflate</code>
197 |           compression module which as a side effect results in output buffering;
198 |           one way to disable it for the link checker (while leaving it enabled for
199 |           other resources if configured so elsewhere) is to add the following
200 |           section to an appropriate place in the Apache configuration (assuming the
201 |           link checker script's filename is <code>checklink</code>):
202 |         </p>
203 |         <code><pre>
204 |         &lt;Files checklink&gt;
205 |         SetEnv no-gzip
206 |         &lt;/Files&gt;
207 |         </pre></code>
208 |         <p>
209 |           If you want to enable the authentication capabilities with Apache,
210 |           have a look at
211 |           <a href="https://lists.w3.org/Archives/Public/www-validator/1999JulSep/0140.html">Steven Drake's hack</a>.
212 |         </p>
213 |         <p>
214 |           The link checker honors proxy settings from the
215 |           <code><var>scheme</var>_proxy</code> environment variables.  See
216 |           <a href="https://search.cpan.org/dist/libwww-perl/lib/LWP.pm#ENVIRONMENT">LWP(3)</a> and
217 |           <a href="https://search.cpan.org/dist/libwww-perl/lib/LWP/UserAgent.pm#%24ua-%3Eenv_proxy">LWP::UserAgent(3)'s
218 |           <code>env_proxy</code></a> method for more information.
219 |         </p>
220 |         <p>
221 |           Some environment variables affect the way how the link checker uses
222 |           <a href="https://www.ietf.org/rfc/rfc959.txt"><abbr title="File Transfer Protocol">FTP</abbr></a>.
223 |           In particular, passive mode is the default.  See
224 |           <a href="https://search.cpan.org/dist/libnet/Net/FTP.pm#CONSTRUCTOR">Net::FTP(3)</a>
225 |           for more information.
226 |         </p>
227 |         <p>
228 |           There are multiple alternatives for configuring the default
229 |           <a href="https://www.ietf.org/rfc/rfc977.txt"><abbr title="Network News Transfer Protocol">NNTP</abbr></a>
230 |           server for use with <code>news:</code> URIs without explicit hostnames,
231 |           see
232 |           <a href="https://search.cpan.org/dist/libnet/Net/NNTP.pm#CONSTRUCTOR">Net::NNTP(3)</a>
233 |           for more information.
234 |         </p>
235 |       </section>
236 | 
237 |       <section id="bot">
238 |         <h2>Robots exclusion</h2>
239 |         <p>
240 |           The link checker honors
241 |           <a href="https://www.robotstxt.org/robotstxt.html">robots exclusion
242 |             rules</a>.  To place rules specific to the W3C Link Checker in
243 |           <code>/robots.txt</code> files, sites can use the
244 |           <code>W3C-checklink</code> user agent string.  For example, to allow
245 |           the link checker to access all documents on a server and to disallow
246 |           all other robots, one could use the following:
247 |         </p>
248 |         <code><pre>
249 |           User-Agent: *
250 |           Disallow: /
251 |           User-Agent: W3C-checklink
252 |           Disallow:
253 |         </pre></code>
254 |         <p>
255 |           Robots exclusion support in the link checker is based on the
256 |           <a href="https://search.cpan.org/dist/libwww-perl/lib/LWP/RobotUA.pm">LWP::RobotUA</a>
257 |           Perl module.  It currently supports the
258 |           "<a href="https://www.robotstxt.org/orig.html">original 1994 version</a>"
259 |           of the standard.  The robots META tag, ie.
260 |           <code>&lt;meta name="robots" content="..."&gt;</code>, is not supported.
261 |           Other than that, the link checker's implementation goes all the way
262 |           in trying to honor robots exclusion rules; if a
263 |           <code>/robots.txt</code> disallows it, not even the first document
264 |           submitted as the root for a link checker run is fetched.
265 |         </p>
266 |         <p>
267 |           Note that <code>/robots.txt</code> rules affect only user agents
268 |           that honor it; it is not a generic method for access control.
269 |         </p>
270 |       </section>
271 | 
272 |       <section id="csb">
273 |         <h2>Comments, suggestions and bugs</h2>
274 |         <p>
275 |           The current version has proven to be stable. It could however be
276 |           improved, see the <a href="https://github.com/w3c/link-checker/issues">list of open enhancement ideas and bugs</a> for details.
277 |         </p>
278 |       </section>
279 |     </main>
280 | 
281 |     <footer>
282 |       <nav>
283 |         <ul class="navbar" id="menu">
284 |           <li><a href="https://validator.w3.org/checklink" accesskey="1" title="The Link Checker Service at W3C">Link Checker</a></li>
285 |           <li><a href="https://search.cpan.org/dist/W3C-LinkChecker/" accesskey="2" title="Download the source / Install this service">Download</a></li>
286 |           <li><a href="#csb" title="feedback: comments, suggestions and bugs" accesskey="4">Feedback</a></li>
287 |           <li><a href="https://validator.w3.org" title="Validate your markup with the W3C Markup Validation Service">Validator</a></li>
288 |         </ul>
289 |       </nav>
290 |       <address>
291 |         <a title="Send Feedback for the W3C Link Checker"
292 |         href="https://validator.w3.org/feedback.html">The W3C QA-dev Team</a>
293 |       </address>
294 |       <p class="copyright">
295 |         <a rel="Copyright" href="https://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> &copy; 1994-2023
296 |         <a href="https://www.w3.org/"><abbr title="World Wide Web Consortium">W3C</abbr></a>&reg;,
297 |         All Rights Reserved.
298 |         W3C <a href="https://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>,
299 |         <a href="https://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a>,
300 |         <a rel="Copyright" href="https://www.w3.org/Consortium/Legal/copyright-documents">document use</a>
301 |         and <a rel="Copyright" href="https://www.w3.org/Consortium/Legal/copyright-software">software licensing</a>
302 |         rules apply. Your interactions with this site are in accordance
303 |         with our <a href="https://www.w3.org/Consortium/Legal/privacy-statement#Public">public</a> and
304 |         <a href="https://www.w3.org/Consortium/Legal/privacy-statement#Members">Member</a> privacy
305 |         statements.
306 |       </p>
307 |     </footer>
308 |   </body>
309 | </html>
310 | 


--------------------------------------------------------------------------------
/docs/linkchecker.css:
--------------------------------------------------------------------------------
  1 | /*
  2 |    Base Style Sheet for the W3C Link Checker.
  3 | 
  4 |    Copyright 2000-2011 W3C (MIT, INRIA, Keio). All Rights Reserved.
  5 |    See http://www.w3.org/Consortium/Legal/ipr-notice.html#Copyright
  6 | */
  7 | 
  8 | html, body {
  9 |     line-height: 120%;
 10 |     color: black;
 11 |     background: white;
 12 |     font-family: "Bitstream Vera Sans", sans-serif;
 13 |     margin: 0;
 14 |     padding: 0;
 15 |     border: 0;
 16 | }
 17 | 
 18 | #main {
 19 |     margin: 1em 2em;
 20 | }
 21 | 
 22 | #main form {
 23 |     clear: both;
 24 |     background: #EAEBEE url(../images/round-tr.png) no-repeat top right;
 25 |     padding: 0.5em 1.3em;
 26 |     border-bottom: 1px solid #DCDDE0;
 27 | }
 28 | 
 29 | a img {
 30 |     border: 0;
 31 | }
 32 | 
 33 | a:link, a:visited {
 34 |     text-decoration: underline;
 35 |     color: #365D95;
 36 | }
 37 | 
 38 | a:hover, a:active {
 39 |     text-decoration: underline;
 40 |     color: #1F2126;
 41 | }
 42 | 
 43 | abbr:hover {
 44 |     cursor: help;
 45 | }
 46 | abbr[title], span[title], strong[title] {
 47 |     border-bottom: thin dotted;
 48 |     cursor: help;
 49 | }
 50 | 
 51 | pre, code {
 52 |     font-family: "Bitstream Vera Sans Mono", monospace;
 53 |     line-height: 100%;
 54 |     white-space: pre;
 55 | }
 56 | 
 57 | div.progress pre {
 58 |     height: 12em;
 59 |     font-size: small;
 60 |     overflow: auto;
 61 |     padding: 0.5em 1.3em;
 62 |     border: 1px solid #DCDDE0;
 63 |     margin-top: 0;
 64 | }
 65 | 
 66 | div.progress h3 {
 67 |     margin-bottom: 0;
 68 |     background: white;
 69 |     border: 1px solid #DCDDE0;
 70 |     border-bottom: 0;
 71 |     padding: .4em .8em;
 72 |     text-indent: 0;
 73 |     overflow: hidden;
 74 | }
 75 | 
 76 | progress {
 77 | 
 78 |     inline-size:    100%;
 79 |     block-size:     0.15em;
 80 |     border:         1px solid #DCDDE0;
 81 |     border-bottom:  0;
 82 | }
 83 | 
 84 | fieldset {
 85 |     border: 0;
 86 |     padding: 0;
 87 | }
 88 | legend {
 89 |     font-size: 1.1em;
 90 |     padding: 1em 0 0.23em;
 91 |     letter-spacing: 0.06em;
 92 | }
 93 | 
 94 | fieldset p {
 95 |     margin: 0 !important;
 96 |     padding: 0.7em 0 0.5em 1em;
 97 |     border-top: 1px solid #CBCDD5;
 98 |     background: #EAEBEE url(../images/double.png) left top repeat-x;
 99 | }
100 | 
101 | input#uri {
102 |     font-family: Monaco, "Courier New", Monospace;
103 |     font-size: 0.9em;
104 |     border: 1px solid #BBB;
105 |     border-top: 1px solid #777;
106 |     border-bottom: 1px solid #DDD;
107 |     background: #FEFEFE url(../images/textbg.png) no-repeat top left;
108 |     padding: 0.2em 0.2em;
109 |     max-width: 1000px;
110 |     font-variant: normal;
111 |     width: 95%;
112 |     margin: 0.3em 0 0 1em;
113 | }
114 | 
115 | p.submit_button input {
116 | 	display: block;
117 | 	width: 8em;
118 | 	text-align: center;
119 | 	margin: 10px auto;
120 | 	padding: 0 12px;
121 | 	-webkit-border-radius: 28px;
122 | 	-moz-border-radius: 28px;
123 | 	border-radius: 28px;
124 | 	-webkit-box-shadow: 0 0 2px #86888e;
125 | 	-moz-box-shadow: 0 0 2px #86888e;
126 | 	box-shadow: 0 0 2px #86888e;
127 | 	color: #365d95;
128 | 	font-size: 16px;
129 | 	background-color: #eaebee;
130 | 	border: solid #86888e 1px;
131 | 	text-decoration: none;
132 | 	line-height: 32px;
133 | 	cursor: pointer;
134 | }
135 | 
136 | 
137 | p.submit_button input:active,
138 | p.submit_button input:hover {
139 | 	color: #1f2126;
140 | }
141 | 
142 | a:link img, a:visited img {
143 |     border-style: none;
144 | }
145 | a img {
146 |     color: black; /* The only way to hide the border in NS 4.x */
147 | }
148 | 
149 | ul.toc {
150 |     list-style: none;
151 | }
152 | 
153 | ol li {
154 |     padding: .1em;
155 | }
156 | 
157 | th {
158 |     text-align: left;
159 | }
160 | 
161 | .hideme {
162 |     display: none;
163 | }
164 | 
165 | @media screen {
166 |     /* only hidden visually */
167 |     .progress label span { display: none }
168 | }
169 | 
170 | /* These are usually targets and not links */
171 | h1 a, h1 a:hover, h2 a, h2 a:hover, h3 a, h3 a:hover {
172 |     color: inherit;
173 |     background-color: inherit;
174 | }
175 | 
176 | img {
177 |     vertical-align: middle;
178 | }
179 | 
180 | address img {
181 |     float: right;
182 |     width: 88px;
183 | }
184 | 
185 | address {
186 |     padding: 0 2em;
187 |     font-size: small;
188 |     text-align: center;
189 |     color: #888;
190 |     background-color: white;
191 | }
192 | 
193 | p.copyright {
194 |     margin-top: 5em;
195 |     padding-top: .5em;
196 |     font-size: xx-small;
197 |     max-width: 85ex;
198 |     text-align: justify;
199 |     text-transform: uppercase;
200 |     margin-left: auto;
201 |     margin-right:auto;
202 |     font-family: "Bitstream Vera Sans Mono", monospace;
203 |     color: #888;
204 |     line-height: 120%;
205 | }
206 | 
207 | p.copyright a {
208 |     color: #88F;
209 |     text-decoration: none;
210 | }
211 | 
212 | /* Various header(ish) things. Definitions cribbed from the CORE Styles. */
213 | 
214 | h1#title {
215 |     font-family: "Myriad Web", "Myriad Pro", "Gill Sans", Helvetica, Arial, Sans-Serif;
216 |     background-color: #365D95;
217 |     color: #FDFDFD;
218 |     font-size: 1.6em;
219 |     font-weight: normal;
220 |     background: url(../images/head-bl.png) bottom left no-repeat;
221 |     padding-bottom: 0.430em;
222 |     margin: 0;
223 |     line-height: 1;
224 | }
225 | h1#title a, h1#title a img {
226 |     background-color: #365D95;
227 | }
228 | 
229 | h1 span {
230 |     border-bottom: 1px solid #6383B1;
231 |     border-color: #4E6F9E;
232 | }
233 | 
234 | h1#title a:link, h1#title a:hover, h1#title a:visited, h1#title a:active {
235 |     color: #FDFDFD !important;
236 |     text-decoration: none;
237 | }
238 | 
239 | h1#title img {
240 |     vertical-align: middle;
241 |     margin-right: 0.7em;
242 | }
243 | 
244 | p#tagline {
245 |     font-size: 0.7em;
246 |     margin: -2em 0 0 12.1em;
247 |     padding-bottom: 1em;
248 |     letter-spacing: 0.1em;
249 |     line-height: 100% !important;
250 |     color: #D0DCEE;
251 |     background-color: transparent;
252 | }
253 | 
254 | #banner {
255 |     background: #365D95 url(../images/head-br.png) bottom right no-repeat;
256 |     margin: 1.5em 2em;
257 | }
258 | 
259 | h2 {
260 |     font-size: 1.5em;
261 |     text-align: left;
262 |     font-weight: bold;
263 |     font-style: normal;
264 |     text-decoration: none;
265 |     margin-top: 1em;
266 |     margin-bottom: 1em;
267 |     line-height: 120%;
268 | }
269 | 
270 | h3 {
271 |     font-size: 1.3em;
272 |     font-weight: normal;
273 |     font-style: normal;
274 |     text-decoration: none;
275 |     background-color: #EEE;
276 |     text-indent: 1em;
277 |     padding: .2em;
278 |     border-top: 1px dotted black;
279 | }
280 | 
281 | /*
282 |    Navbar
283 | */
284 | 
285 | ul#menu {
286 |     text-align: center;
287 |     margin: 1em 2em;
288 |     background: #EAEBEE url(../images/round-br.png) no-repeat bottom right;
289 |     padding: 0.5em 0 0.3em;
290 |     border-top: 1px solid #DCDDE0;
291 | }
292 | 
293 | ul#menu span {
294 |     display: none;
295 | }
296 | 
297 | ul#menu a:link, ul#menu a:visited {
298 |     background: #EAEBEE;
299 |     color: #365D95;
300 |     text-decoration: none;
301 | }
302 | 
303 | ul#menu a:hover, ul#menu a:active {
304 |     color: #1F2126;
305 |     text-decoration: underline;
306 | }
307 | 
308 | ul#menu li {
309 |     display: inline;
310 |     margin-right: 0.8em;
311 | }
312 | 
313 | /* Results */
314 | 
315 | .report {
316 |     width: 100%;
317 | }
318 | 
319 | table.report {
320 |     border-collapse: collapse;
321 | }
322 | table.report th {
323 |     padding: .5em;
324 |     background-color: #FCFCFC;
325 | }
326 | table.report td {
327 |     padding: .5em;
328 | }
329 | 
330 | dl.report {
331 |     margin-left: 0 !important;
332 |     margin-right: 0 !important;
333 |     padding: 0;
334 |     border-bottom: 1px solid #EAEBEE;
335 |     border-left: 1px solid #EAEBEE;
336 |     border-right: 1px solid #EAEBEE;
337 | }
338 | 
339 | dl.report dt, dl.report dd {
340 |     border-bottom: 0;
341 | }
342 | dl.report dt {
343 |     border-top: 1px solid #EAEBEE;
344 |     margin-top: .8em;
345 |     padding-left: .5em;
346 |     padding-top: .5em;
347 |     font-weight: bold;
348 | }
349 | dl.report dt span.msg_loc, dl.report dt span.redirected_to {
350 |     font-weight: normal;
351 | }
352 | dl.report dd {
353 |     border-top: 0;
354 |     margin: 0;
355 |     text-indent: 0;
356 |     padding: 0;
357 |     margin-left: 1.5em;
358 | }
359 | dl.report dd.responsecode {
360 |     padding-top: 1em;
361 |     font-size: smaller;
362 | }
363 | dl.report dd.message_explanation {
364 |     font-size: smaller;
365 |     margin-bottom: 1.5em;
366 | }
367 | 
368 | dl.report dd p{
369 |     padding: 0;
370 |     line-height: 150%;
371 | }
372 | 
373 | div.settings {
374 |     font-size: smaller;
375 |     float: right;
376 | }
377 | div.settings ul {
378 |     margin: 0;
379 |     padding-left: 1.5em;
380 | }
381 | .unauthorized {
382 |     background-color: aqua;
383 | }
384 | 
385 | .redirect {
386 |     font-weight: normal;
387 |     font-style: italic;
388 | }
389 | .broken {
390 |     color: #A00;
391 | }
392 | dl.report .broken {
393 |     font-weight: bold;
394 | }
395 | .multiple {
396 |     color: fuchsia;
397 | }
398 | .dubious {
399 |     font-style: italic;
400 | }
401 | 
402 | span.err_type img {
403 |     width: 1.2em;
404 |     height: 1.2em;
405 |     padding-bottom: .2em;
406 |     margin-right: .5em;
407 |     vertical-align: middle;
408 | }
409 | 
410 | /* W3C includes (donation and sponsorship program) */
411 | #w3c-include {}
412 | 
413 | 


--------------------------------------------------------------------------------
/docs/linkchecker.js:
--------------------------------------------------------------------------------
 1 | function uriOk(num)
 2 | {
 3 |     if (!document.getElementById) {
 4 |         return true;
 5 |     }
 6 | 
 7 |     var u = document.getElementById('uri_' + num);
 8 |     var ok = false;
 9 |     if (u.value.length > 0) {
10 |         if (u.value.search) {
11 |             ok = (u.value.search(/\S/) !== -1);
12 |         }
13 |         else {
14 |             ok = true;
15 |         }
16 |     }
17 |     if (!ok) {
18 |         u.focus();
19 |     }
20 |     return ok;
21 | }
22 | 
23 | function show_progress(progress_id, progress_text, progress_percentage)
24 | {
25 |     var div = document.getElementById("progress" + progress_id);
26 | 
27 |     var head = div.getElementsByTagName("h3")[0];
28 |     var text = document.createTextNode(progress_text);
29 |     var span = document.createElement("span");
30 |     span.appendChild(text);
31 |     head.replaceChild(span, head.getElementsByTagName("span")[0]);
32 | 
33 |     var label = div.getElementsByTagName("label")[0];
34 |     var labelText = label.getElementsByTagName("span")[0];
35 |     labelText.textContent = "Progress: " + progress_percentage;
36 | 
37 |     var bar = label.getElementsByTagName("progress")[0];
38 |     bar.value = Number.parseFloat(progress_percentage);
39 |     bar.textContent = progress_percentage;
40 | 
41 |     var pre = div.getElementsByTagName("pre")[0];
42 |     pre.scrollTop = pre.scrollHeight;
43 | }
44 | 


--------------------------------------------------------------------------------
/docs/tests/links_borked.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 2 | 	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 3 | 
 4 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 5 | <head>
 6 | 	<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
 7 | 
 8 | 	<title>Sample HTML Documents with broken links (40X, bogus host, 500, etc)</title>
 9 | 	
10 | </head>
11 | 
12 | <body>
13 |   <ul>
14 |     <li><a href="http://bogus.example">bogus hostname</a></li>
15 |     <li><a href="http://www.w3.org/wrongwrongwrong">404</a></li>
16 |     <li><a href="http://qa-dev.w3.org/wmvs/#bogus_frag">bogus #frag</a></li>
17 |     <li><a href="http://qa-dev.w3.org/wlc/#bogus_frag1">bogus #frag (multiple)</a></li>
18 |     <li><a href="http://qa-dev.w3.org/wlc/#bogus_frag2">bogus #frag (multiple)</a></li>
19 |     
20 |   </ul>
21 | </body>
22 | </html>
23 | 


--------------------------------------------------------------------------------
/docs/tests/links_ok.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 2 | 	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 3 | 
 4 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 5 | <head>
 6 | 	<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
 7 | 
 8 | 	<title>Sample HTML Documents with no broken link</title>
 9 | 	
10 | </head>
11 | 
12 | <body>
13 | <p><a href="http://www.w3.org/">W3C Homepage</a></p>
14 | 
15 | </body>
16 | </html>
17 | 


--------------------------------------------------------------------------------
/docs/tests/links_redirects.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 2 | 	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 3 | 
 4 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 5 | <head>
 6 | 	<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
 7 | 
 8 | 	<title>Sample HTML Documents with no broken link, but different types of redirects</title>
 9 | 	
10 | </head>
11 | 
12 | <body>
13 | <ul>
14 |   <li><a href="http://www.w3.org/QA">W3C /QA (directory redirect)</a></li>
15 |   <li><a href="http://www.w3.org/QA/weekinqa">/QA/weekinqa (permanent redirect)</a></li>
16 |   <li><a href="http://www.w3.org/QA/monthinqa">/QA/monthinqa (temp redirect)</a></li>
17 | </ul>
18 | 
19 | </body>
20 | </html>
21 | 


--------------------------------------------------------------------------------
/etc/checklink.conf:
--------------------------------------------------------------------------------
 1 | #
 2 | # Configuration file for the W3C Link Checker
 3 | #
 4 | # See Config::General(3) for the syntax; 'SplitPolicy' is 'equalsign' here.
 5 | #
 6 | 
 7 | #
 8 | # Trusted is a regular expression for matching "trusted" domains.  This is
 9 | # used to restrict the domains where HTTP basic authentication will be sent.
10 | # This is matched case insensitively against resoures' hostnames.
11 | #
12 | # Not specifying a value here means that the basic authentication will only
13 | # be sent to the same host where the authentication was requested from.
14 | #
15 | # For example, the following would allow sending the authentication to any
16 | # host in the w3.org domain (and *only* there):
17 | # Trusted = \.w3\.org$
18 | 
19 | 
20 | #
21 | # Allow_Private_IPs is a boolean flag (1/0) for specifying whether checking of
22 | # links to non-public RFC 1918 IP addresses is allowed.
23 | #
24 | # The default, ie. not specifying the value here means that checking links
25 | # on non-public IP addresses is disabled when checklink runs as a CGI script,
26 | # and allowed in command line mode.
27 | #
28 | # For example, the following would disallow private IP addresses regardless
29 | # of the mode:
30 | # Allow_Private_IPs = 0
31 | 
32 | 
33 | #
34 | # Markup_Validator_URI and CSS_Validator_URI are formatted URIs to the
35 | # respective validators.  The %s in these will be replaced with the full
36 | # "URI encoded" URI to the document being checked, and shown in the link
37 | # checker results view in the online/CGI version.
38 | #
39 | # Defaults:
40 | # Markup_Validator_URI = http://validator.w3.org/check?uri=%s
41 | # CSS_Validator_URI = http://jigsaw.w3.org/css-validator/validator?uri=%s
42 | 
43 | 
44 | #
45 | # Doc_URI is the URI to the Link Checker documentation, shown in the
46 | # results report in CGI mode, and the usage message in command line mode.
47 | # The URIs to the CSS and JavaScript files in the generated HTML are also
48 | # formed using this as their base URI.  If you have installed the documentation
49 | # locally somewhere, you may wish to change this to point to that location.
50 | # This must be an absolute URI.
51 | #
52 | # Default:
53 | # Doc_URI = http://validator.w3.org/docs/checklink.html
54 | 
55 | 
56 | #
57 | # Allowed_Protocols is a comma separated list of additional protocols/URI
58 | # schemes that the link checker is allowed to use besides http, https and ftp.
59 | #
60 | # The javascript and mailto schemes for example are forbidden, and so is
61 | # the file scheme when running as a CGI script.
62 | #
63 | # Default:
64 | # Allowed_Protocols = http,https,ftp
65 | 
66 | 
67 | #
68 | # Connection_Cache_Size is an integer denoting the maximum number of
69 | # connections the link checker will keep open at any given time.
70 | #
71 | # Default:
72 | # Connection_Cache_Size = 2
73 | 


--------------------------------------------------------------------------------
/etc/perltidyrc:
--------------------------------------------------------------------------------
 1 | # perltidy(1) profile for the W3C Link Checker
 2 | --standard-error-output
 3 | --warning-output
 4 | --output-line-ending=unix
 5 | --maximum-line-length=79
 6 | --indent-columns=4
 7 | --continuation-indentation=4
 8 | --vertical-tightness=2
 9 | --paren-tightness=2
10 | --brace-tightness=2
11 | --square-bracket-tightness=2
12 | --opening-sub-brace-on-new-line
13 | --nospace-for-semicolon
14 | --nooutdent-long-lines
15 | --break-after-all-operators
16 | 


--------------------------------------------------------------------------------
/images/double.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/double.png


--------------------------------------------------------------------------------
/images/grad.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/grad.png


--------------------------------------------------------------------------------
/images/head-bl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/head-bl.png


--------------------------------------------------------------------------------
/images/head-br.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/head-br.png


--------------------------------------------------------------------------------
/images/info_icons/README:
--------------------------------------------------------------------------------
 1 | * error.png and info.png from:
 2 | 
 3 | information icons set
 4 | by:	 Jakub Jankiewicz
 5 | license: public domain
 6 | http://openclipart.org/media/files/kuba/2051
 7 | 
 8 | 
 9 | * warning.png from:
10 | 
11 | Warning Notification
12 | by:	 eastshores
13 | license: public domain
14 | http://openclipart.org/media/files/eastshores/2833
15 | 


--------------------------------------------------------------------------------
/images/info_icons/error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/info_icons/error.png


--------------------------------------------------------------------------------
/images/info_icons/info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/info_icons/info.png


--------------------------------------------------------------------------------
/images/info_icons/warning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/info_icons/warning.png


--------------------------------------------------------------------------------
/images/no_w3c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/no_w3c.png


--------------------------------------------------------------------------------
/images/round-br.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/round-br.png


--------------------------------------------------------------------------------
/images/round-tr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/round-tr.png


--------------------------------------------------------------------------------
/images/textbg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/textbg.png


--------------------------------------------------------------------------------
/images/w3c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/w3c/link-checker/c944a90c0bc7d2561b0b1e7a5a256d354badab0b/images/w3c.png


--------------------------------------------------------------------------------
/t/00compile.t:
--------------------------------------------------------------------------------
1 | use Test::More tests => 2;                                       # -*- perl -*-
2 | use File::Spec ();
3 | 
4 | ok(system($^X, '-wTc', File::Spec->catfile('bin', 'checklink')) == 0);
5 | require_ok('W3C::LinkChecker');
6 | 


--------------------------------------------------------------------------------
/w3c.json:
--------------------------------------------------------------------------------
1 | {
2 |     "contacts": ["dontcallmedom"],
3 |     "repo-type": "tool"
4 | }
5 | 


--------------------------------------------------------------------------------