├── .gitignore ├── .travis.yml ├── AUTHORS.txt ├── CHANGELOG.rst ├── LICENSE.txt ├── README.rst ├── pylinkvalidator ├── __init__.py ├── api.py ├── bin │ └── pylinkvalidate.py ├── compat.py ├── crawler.py ├── included │ ├── __init__.py │ └── bs4 │ │ ├── __init__.py │ │ ├── builder │ │ ├── __init__.py │ │ ├── _html5lib.py │ │ ├── _htmlparser.py │ │ └── _lxml.py │ │ ├── dammit.py │ │ ├── diagnose.py │ │ └── element.py ├── models.py ├── reporter.py ├── testfiles │ ├── a.html │ ├── alone.html │ ├── badtel.html │ ├── c.html │ ├── d.html │ ├── depth │ │ ├── 0.html │ │ ├── 0b.html │ │ ├── 1.html │ │ ├── 2.html │ │ ├── 3.html │ │ └── root.html │ ├── f.html │ ├── index.html │ ├── robots.txt │ ├── sub │ │ ├── b.html │ │ ├── e.html │ │ ├── small_image.gif │ │ ├── style.css │ │ └── test.js │ ├── à.html │ └── é.html ├── tests.py └── urlutil.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | pylinkvalidator.egg-info/ 3 | dist/ 4 | build/ 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | - "3.4" 6 | - "3.6" 7 | install: 8 | - "pip install ." 9 | script: nosetests 10 | sudo: false 11 | -------------------------------------------------------------------------------- /AUTHORS.txt: -------------------------------------------------------------------------------- 1 | Pylinkvalidator was originally created as part of pylinkchecker in 2013 by 2 | Barthelemy Dagenais while he was working at Xprima Inc. It has been forked on 3 | June 24th 2014 with the name pylinkvalidator. 4 | 5 | Here is an inevitably incomplete list of MUCH-APPRECIATED CONTRIBUTORS -- 6 | people who have submitted patches, reported bugs, and generally made 7 | pylinkvalidator that much better: 8 | 9 | Arun Elias 10 | Jim Priest 11 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | CHANGELOG 2 | ========= 3 | 4 | 0.3 (to be published) 5 | -------------------- 6 | 7 | - Added --ignore-bad-tel-urls option to ignore phone URLs that do not conform 8 | to the telephone number URI RFC 3966. 9 | - Added --allow-insecure-content option to crawl pages with HTTPS errors (e.g., 10 | self signed certificate). 11 | 12 | 0.2 (July 22th 2015) 13 | -------------------- 14 | 15 | - Added the --depth option to limit crawling to certain depths. 16 | 17 | 0.1 (June 24th 2014) 18 | -------------------- 19 | 20 | Initial fork of pylinkchecker 21 | 22 | - Changed pylinkchecker to pylinkvalidator 23 | - Changed pylinkcheck.py to pylinkvalidate 24 | - Updated license 25 | - PEP 8 compliance 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2015 Barthelemy Dagenais and individual contributors. All 2 | rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | - Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 
9 | 10 | - Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | - The name of the author may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | Pylinkvalidator is a fork of Pylinkchecker, which was licensed at the date of 31 | the fork, 24 June 2014, under these conditions: 32 | 33 | Copyright (c) 2013, Technologies Xprima Inc All rights reserved. 34 | 35 | Redistribution and use in source and binary forms, with or without 36 | modification, are permitted provided that the following conditions are met: 37 | 38 | - Redistributions of source code must retain the above copyright notice, this 39 | list of conditions and the following disclaimer. 40 | 41 | - Redistributions in binary form must reproduce the above copyright notice, 42 | this list of conditions and the following disclaimer in the documentation 43 | and/or other materials provided with the distribution. 44 | 45 | - The name of the author may not be used to endorse or promote products 46 | derived from this software without specific prior written permission. 47 | 48 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 49 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 52 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 53 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 54 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 55 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 56 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 57 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 58 | POSSIBILITY OF SUCH DAMAGE. 59 | 60 | 61 | Pylinkvalidator includes a copy of BeautifulSoup which is licensed under the 62 | MIT License. 63 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | pylinkvalidator 2 | =============== 3 | 4 | :Version: 0.3 5 | 6 | pylinkvalidator is a standalone and pure python link validator and crawler that 7 | traverses a web site and reports errors (e.g., 500 and 404 errors) encountered. 8 | The crawler can also download resources such as images, scripts and 9 | stylesheets. 
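For example, the following illustrative command (any starting URL can be substituted) limits crawling to links and images, skipping scripts and stylesheets; the ``--types`` option is described in the Usage section below: ``pylinkvalidate.py --types=a,img http://www.example.com/``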
10 | 11 | pylinkvalidator's performance can be improved by installing additional libraries 12 | that require a C compiler, but these libraries are optional. 13 | 14 | We created pylinkvalidator so that it could be executed in environments without 15 | access to a compiler (e.g., Microsoft Windows, some posix production 16 | environments) or with an old version of python (e.g., Centos). 17 | 18 | pylinkvalidator is highly modular and has many configuration options, but the 19 | only required parameter is the starting url: pylinkvalidate.py 20 | http://www.example.com/ 21 | 22 | pylinkvalidator can also be used programmatically by calling one of the functions 23 | in ``pylinkvalidator.api`` 24 | 25 | .. image:: https://api.travis-ci.org/bartdag/pylinkvalidator.png 26 | 27 | 28 | Quick Start 29 | ----------- 30 | 31 | Install pylinkvalidator with pip or easy_install: 32 | 33 | :: 34 | 35 | pip install pylinkvalidator 36 | 37 | 38 | Crawl all pages from a site and show progress: 39 | 40 | :: 41 | 42 | pylinkvalidate.py -P http://www.example.com/ 43 | 44 | 45 | Requirements 46 | ------------ 47 | 48 | pylinkvalidator does not require external libraries if executed with python 2.x. 49 | It requires beautifulsoup4 if executed with python 3.x. It has been tested on 50 | python 2.6, python 2.7, and python 3.6. 51 | 52 | For production use, it is strongly recommended to use lxml or html5lib because 53 | the default HTML parser provided by python is not very lenient. 54 | 55 | 56 | Optional Requirements 57 | --------------------- 58 | 59 | These libraries can be installed to enable certain modes in pylinkvalidator: 60 | 61 | lxml 62 | beautifulsoup can use lxml to speed up the parsing of HTML pages. Because 63 | lxml requires C libraries, this is only an optional requirement. 64 | 65 | html5lib 66 | beautifulsoup can use html5lib to process incorrect or strange markup. It is 67 | slower than lxml, but believed to be more lenient. 68 | 69 | gevent 70 | this non-blocking io library enables pylinkvalidator to use green threads 71 | instead of processes or threads. gevent could potentially speed up the 72 | crawling speed on web sites with many small pages. 73 | 74 | cchardet 75 | this library speeds up the detection of document encoding. 76 | 77 | 78 | Usage 79 | ----- 80 | 81 | This is a list of all available options. See the end of the README file for 82 | usage examples. 83 | 84 | :: 85 | 86 | Usage: pylinkvalidate.py [options] URL ... 87 | 88 | Options: 89 | --version Show program's version number and exit 90 | -h, --help Show this help message and exit 91 | -V VERBOSE, --verbose=VERBOSE 92 | Display debugging info 93 | None: --verbose=0 (default) 94 | Quiet: --verbose=1 95 | Info: --verbose=2 96 | 97 | Crawler Options: 98 | These options modify the way the crawler traverses the site. 
99 | 100 | -O, --test-outside Fetch resources from other domains without crawling 101 | them 102 | -H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS 103 | Comma-separated list of additional hosts to crawl 104 | (e.g., example.com,subdomain.another.com) 105 | -i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES 106 | Comma-separated list of host/path prefixes to ignore 107 | (e.g., www.example.com/ignore_this_and_after/) 108 | -b, --ignore-bad-tel-urls 109 | ignore badly formed tel URLs missing the leading + 110 | sign, e.g., tel:1234567890 - only necessary for Python 111 | > 2.6 112 | -u USERNAME, --username=USERNAME 113 | Username to use with basic HTTP authentication 114 | -p PASSWORD, --password=PASSWORD 115 | Password to use with basic HTTP authentication 116 | -M, --multi each argument is considered to be a different site 117 | -D HEADER, --header=HEADER 118 | custom header of the form Header: Value (repeat for 119 | multiple headers) 120 | --url-file-path=URL_FILE_PATH 121 | get starting URLs from a line-separated file 122 | -t TYPES, --types=TYPES 123 | Comma-separated values of tags to look for when 124 | crawling a site. Default (and supported types): 125 | a,img,link,script 126 | -T TIMEOUT, --timeout=TIMEOUT 127 | Seconds to wait before considering that a page timed 128 | out (default = 10) 129 | -C, --strict Does not strip href and src attributes from 130 | whitespaces 131 | -P, --progress Prints crawler progress in the console 132 | -N, --run-once Only crawl the first page (eq. to depth=0) 133 | -d DEPTH, --depth=DEPTH 134 | Maximum crawl depth (default = 1) 135 | -e, --prefer-server-encoding 136 | Prefer server encoding if specified. Else detect 137 | encoding 138 | --check-presence=CONTENT_PRESENCE 139 | Check presence of raw or HTML content on all pages. 140 | e.g., regex:content. Content 141 | can be either regex:pattern or plain content 142 | --check-absence=CONTENT_ABSENCE 143 | Check absence of raw or HTML content on all pages. 144 | e.g., regex:content. Content 145 | can be either regex:pattern or plain content 146 | --check-presence-once=CONTENT_PRESENCE_ONCE 147 | Check presence of raw or HTML content for one page: 148 | path,content, e.g.,: /path,regex:content. Content can be either 150 | regex:pattern or plain content. Path can be either 151 | relative or absolute with domain. 152 | --check-absence-once=CONTENT_ABSENCE_ONCE 153 | Check absence of raw or HTML content for one page: 154 | path,content, e.g.,path,regex:content. Content can be either 156 | regex:pattern or plain content. Path can be either 157 | relative or absolute with domain. 158 | -S, --show-source Show source of links (html) in the report. 159 | --allow-insecure-content 160 | Allow insecure content for HTTPS sites with 161 | certificate errors 162 | 163 | Performance Options: 164 | These options can impact the performance of the crawler. 165 | 166 | -w WORKERS, --workers=WORKERS 167 | Number of workers to spawn (default = 1) 168 | -m MODE, --mode=MODE 169 | Types of workers: thread (default), process, or green 170 | -R PARSER, --parser=PARSER 171 | Types of HTML parse: html.parser (default) or lxml 172 | 173 | Output Options: 174 | These options change the output of the crawler. 175 | 176 | -f FORMAT, --format=FORMAT 177 | Format of the report: plain (default) 178 | -o OUTPUT, --output=OUTPUT 179 | Path of the file where the report will be printed. 180 | -W WHEN, --when=WHEN 181 | When to print the report. 
error (only if a 182 | crawling error occurs) or always (default) 183 | -E REPORT_TYPE, --report-type=REPORT_TYPE 184 | Type of report to print: errors (default, summary and 185 | erroneous links), summary, all (summary and all links) 186 | -c, --console Prints report to the console in addition to other 187 | output options such as file or email. 188 | 189 | Email Options: 190 | These options allows the crawler to send a report by email. 191 | 192 | -a ADDRESS, --address=ADDRESS 193 | Comma-separated list of email addresses used to send a 194 | report 195 | --from=FROM_ADDRESS 196 | Email address to use in the from field of the email 197 | (optional) 198 | -s SMTP, --smtp=SMTP 199 | Host of the smtp server 200 | --port=PORT Port of the smtp server (optional) 201 | --tls Use TLS with the email server. 202 | --subject=SUBJECT Subject of the email (optional) 203 | --smtp-username=SMTP_USERNAME 204 | Username to use with the smtp server (optional) 205 | --smtp-password=SMTP_PASSWORD 206 | Password to use with the smtp server (optional) 207 | 208 | Usage Example 209 | ------------- 210 | 211 | Crawl a site and show progress 212 | ``pylinkvalidate.py --progress http://example.com/`` 213 | 214 | Crawl a site starting from 2 URLs 215 | ``pylinkvalidate.py http://example.com/ http://example2.com/`` 216 | 217 | Crawl a site (example.com) and all pages belonging to another host 218 | ``pylinkvalidate.py -H additionalhost.com http://example.com/`` 219 | 220 | Report status of all links (even successful ones) 221 | ``pylinkvalidate.py --report-type=all http://example.com/`` 222 | 223 | Report status of all links and show the HTML source of these links 224 | ``pylinkvalidate.py --report-type=all --show-source http://example.com/`` 225 | 226 | Only crawl starting URLs and access all linked resources 227 | ``pylinkvalidate.py --run-once http://example.com/`` 228 | 229 | Crawl two levels (one more than run-once) and access all linked resources 230 | ``pylinkvalidate.py --depth=1 http://example.com/`` 231 | 232 | Only access links (a href) and ignore images, stylesheets and scripts 233 | ``pylinkvalidate.py --types=a http://example.com/`` 234 | 235 | Crawl a site with 4 threads (default is one thread) 236 | ``pylinkvalidate.py --workers=4 http://example.com/`` 237 | 238 | Crawl a site with 4 processes (default is one thread) 239 | ``pylinkvalidate.py --mode=process --workers=4 http://example.com/`` 240 | 241 | Crawl a site and use LXML to parse HTML (faster, must be installed) 242 | ``pylinkvalidate.py --parser=LXML http://example.com/`` 243 | 244 | Print debugging info 245 | ``pylinkvalidate.py --verbose=2 http://example.com/`` 246 | 247 | Change User-Agent request header 248 | ``pylinkvalidate.py --header="User-Agent: Mozilla/5.0" http://example.com/`` 249 | 250 | Crawl multiple sites and report results per site 251 | ``pylinkvalidate.py --multi http://example.com/ http://www.example2.net/`` 252 | 253 | Check that all HTML pages have a body tag with a specific class: 254 | ``pylinkvalidate.py --check-presence '<body class="test">' http://example.com/`` 255 | 256 | Check that no HTML pages have a paragraph tag with a pattern: 257 | ``pylinkvalidate.py --check-absence '<p>regex:Hello\s+World</p>' http://example.com/``
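Check that a plain text snippet (an arbitrary example string is used here) is present on every page:
  ``pylinkvalidate.py --check-presence 'Copyright 2015' http://example.com/``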
258 | 259 | Check that robots.txt has an empty Disallow rule: 260 | ``pylinkvalidate.py --check-presence-once '/robots.txt,regex:^Disallow:\s*$' http://example.com/`` 261 | 262 | Allow insecure content for HTTPS sites with certificate errors [SSL: CERTIFICATE_VERIFY_FAILED] 263 | ``pylinkvalidate.py --allow-insecure-content https://self-signed.example.com/`` 264 | 265 | 266 | API Usage 267 | --------- 268 | 269 | To crawl a site from a single URL: 270 | 271 | .. code-block:: python 272 | 273 | from pylinkvalidator.api import crawl 274 | crawled_site = crawl("http://www.example.com/") 275 | number_of_crawled_pages = len(crawled_site.pages) 276 | number_of_errors = len(crawled_site.error_pages) 277 | 278 | 279 | To crawl a site and pass some configuration options (the same supported by the 280 | command line interface): 281 | 282 | 283 | .. code-block:: python 284 | 285 | from pylinkvalidator.api import crawl_with_options 286 | crawled_site = crawl_with_options(["http://www.example.com/"], {"run-once": 287 | True, "workers": 10}) 288 | number_of_crawled_pages = len(crawled_site.pages) 289 | number_of_errors = len(crawled_site.error_pages) 290 | 291 | 292 | FAQ and Troubleshooting 293 | ----------------------- 294 | 295 | I cannot find pylinkvalidate.py on Windows with virtualenv 296 | This is a known problem with virtualenv on Windows. The interpreter is 297 | different than the one used by the virtualenv. Prefix pylinkvalidate.py with the 298 | full path: ``python c:\myvirtualenv\Scripts\pylinkvalidate.py`` 299 | 300 | I see Exception KeyError ... module 'threading' when using --mode=green 301 | This output is generally harmless and is generated by gevent patching the 302 | python thread module. If someone knows how to make it go away, patches are 303 | more than welcome :-) 304 | 305 | 306 | License 307 | ------- 308 | 309 | This software is licensed under the `New BSD License`. See the `LICENSE` file 310 | for the full license text. It includes the beautifulsoup library, which 311 | is licensed under the MIT license. 312 | -------------------------------------------------------------------------------- /pylinkvalidator/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Main pylinkvalidator package 4 | """ 5 | 6 | __version__ = "0.3" 7 | -------------------------------------------------------------------------------- /pylinkvalidator/api.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Contains a simple crawling API to use pylinkvalidator programmatically. 4 | 5 | We will do everything to keep functions in this module backward compatible 6 | across versions. 7 | """ 8 | from __future__ import unicode_literals, absolute_import 9 | 10 | from pylinkvalidator.crawler import configure_logger, execute_from_config 11 | from pylinkvalidator.models import Config 12 | 13 | 14 | def crawl(url): 15 | """Crawls a URL and returns a pylinkvalidator.crawler.Site instance. 16 | 17 | :rtype: A pylinkvalidator.crawler.Site instance 18 | """ 19 | config = Config() 20 | config.parse_api_config([url]) 21 | logger = configure_logger(config) 22 | crawler = execute_from_config(config, logger) 23 | 24 | return crawler.site 25 | 26 | 27 | def crawl_with_options(urls, options_dict=None, logger_builder=None): 28 | """Crawls URLs with provided options and logger.
29 | 30 | :param options_dict: Must contain the long name of the command line 31 | options. (optional) 32 | 33 | :param logger_builder: Function that will be called to instantiate a 34 | logger. (optional) 35 | 36 | :rtype: A pylinkvalidator.crawler.Site instance 37 | """ 38 | 39 | config = Config() 40 | 41 | config.parse_api_config(urls, options_dict) 42 | 43 | if not logger_builder: 44 | logger = configure_logger(config) 45 | else: 46 | logger = logger_builder() 47 | 48 | # TODO In the future, we will pass the logger builder and not the logger 49 | # to enable the ProcessSiteCrawler to instantiate its own custom logger. 50 | crawler = execute_from_config(config, logger) 51 | 52 | return crawler.site 53 | -------------------------------------------------------------------------------- /pylinkvalidator/bin/pylinkvalidate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pylinkvalidator import crawler 4 | 5 | if __name__ == "__main__": 6 | crawler.execute_from_command_line() 7 | -------------------------------------------------------------------------------- /pylinkvalidator/compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # flake8: noqa 3 | """ 4 | Contains the compatibility layer for python 2 & 3 5 | """ 6 | from __future__ import unicode_literals, absolute_import 7 | 8 | import sys 9 | 10 | if sys.version_info[0] < 3: 11 | range = xrange 12 | import urlparse 13 | from urllib import quote 14 | import SimpleHTTPServer 15 | import SocketServer 16 | from urllib2 import HTTPError 17 | import Queue 18 | unicode = unicode 19 | 20 | def get_content_type(m): 21 | return m.gettype() 22 | 23 | def get_charset(m): 24 | return m.getparam("charset") 25 | 26 | def get_safe_str(s): 27 | return s.encode("utf-8") 28 | 29 | from StringIO import StringIO 30 | else: 31 | range = range 32 | import urllib.parse as urlparse 33 | from urllib.parse import quote 34 | import http.server as SimpleHTTPServer 35 | import socketserver as SocketServer 36 | from urllib.error import HTTPError 37 | import queue as Queue 38 | unicode = str 39 | 40 | def get_content_type(m): 41 | return m.get_content_type() 42 | 43 | def get_charset(m): 44 | return m.get_content_charset() 45 | 46 | def get_safe_str(s): 47 | return s 48 | from io import StringIO 49 | 50 | try: 51 | from logging import NullHandler 52 | except ImportError: 53 | from logging import Handler 54 | 55 | class NullHandler(Handler): 56 | def emit(self, record): 57 | pass 58 | 59 | def handle(self, record): 60 | pass 61 | 62 | def createLock(self): 63 | return None 64 | 65 | 66 | def get_url_open(): 67 | # Not automatically imported to allow monkey patching. 68 | if sys.version_info[0] < 3: 69 | from urllib2 import urlopen 70 | else: 71 | from urllib.request import urlopen 72 | return urlopen 73 | 74 | 75 | def get_url_request(): 76 | if sys.version_info[0] < 3: 77 | from urllib2 import Request 78 | else: 79 | from urllib.request import Request 80 | return Request 81 | -------------------------------------------------------------------------------- /pylinkvalidator/crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Contains the crawling logic. 
4 | """ 5 | from __future__ import unicode_literals, absolute_import 6 | 7 | import base64 8 | from collections import defaultdict 9 | import logging 10 | import sys 11 | import time 12 | 13 | from pylinkvalidator.included.bs4 import BeautifulSoup, UnicodeDammit 14 | 15 | import pylinkvalidator.compat as compat 16 | from pylinkvalidator.compat import ( 17 | range, HTTPError, get_url_open, unicode, 18 | get_content_type, get_url_request, get_charset) 19 | from pylinkvalidator.models import ( 20 | Config, WorkerInit, Response, PageCrawl, 21 | ExceptionStr, Link, SitePage, WorkerInput, TYPE_ATTRIBUTES, HTML_MIME_TYPE, 22 | MODE_THREAD, MODE_PROCESS, MODE_GREEN, WHEN_ALWAYS, UTF8Class, 23 | PageStatus, PageSource, PAGE_QUEUED, PAGE_CRAWLED, VERBOSE_QUIET, 24 | VERBOSE_NORMAL, LazyLogParam, PREFIX_ALL) 25 | from pylinkvalidator.reporter import report 26 | from pylinkvalidator.urlutil import ( 27 | get_clean_url_split, get_absolute_url_split, 28 | is_link, is_similar_url_split, is_supported_scheme) 29 | 30 | 31 | WORK_DONE = '__WORK_DONE__' 32 | 33 | 34 | def get_logger(propagate=False): 35 | """Returns a logger.""" 36 | root_logger = logging.getLogger() 37 | 38 | logger = logging.getLogger(__name__) 39 | 40 | handler = logging.StreamHandler() 41 | 42 | formatter = logging.Formatter( 43 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 44 | handler.setFormatter(formatter) 45 | 46 | if root_logger.level != logging.CRITICAL: 47 | logger.addHandler(handler) 48 | logger.propagate = propagate 49 | else: 50 | logger.addHandler(compat.NullHandler()) 51 | 52 | return logger 53 | 54 | 55 | class SiteCrawler(object): 56 | """Main crawler/orchestrator""" 57 | 58 | def __init__(self, config, logger): 59 | self.config = config 60 | self.start_url_splits = list(config.start_url_splits) 61 | self.workers = [] 62 | self.input_queue = self.build_queue(config) 63 | self.output_queue = self.build_queue(config) 64 | self.logger = logger 65 | self.site = Site(self.start_url_splits, config, self.logger) 66 | 67 | def build_logger(self): 68 | return self.logger 69 | 70 | def crawl(self): 71 | worker_init = WorkerInit( 72 | self.config.worker_config, self.input_queue, 73 | self.output_queue, self.build_logger()) 74 | self.workers = self.get_workers(self.config, worker_init) 75 | 76 | queue_size = len(self.start_url_splits) 77 | for start_url_split in self.start_url_splits: 78 | self.input_queue.put( 79 | WorkerInput( 80 | start_url_split, True, 0, start_url_split.netloc, 81 | self.config.content_check), 82 | False) 83 | 84 | self.start_workers(self.workers, self.input_queue, self.output_queue) 85 | 86 | self.start_progress() 87 | 88 | while True: 89 | page_crawl = self.output_queue.get() 90 | queue_size -= 1 91 | new_worker_inputs = self.process_page_crawl(page_crawl) 92 | 93 | # We only process new pages if we did not exceed configured depth 94 | for worker_input in new_worker_inputs: 95 | queue_size += 1 96 | self.input_queue.put(worker_input, False) 97 | 98 | self.progress(page_crawl, len(self.site.pages), queue_size) 99 | 100 | if queue_size <= 0: 101 | self.stop_workers(self.workers, self.input_queue, 102 | self.output_queue) 103 | self.stop_progress() 104 | return self.site 105 | 106 | def start_progress(self): 107 | if self.config.options.progress: 108 | print("Starting crawl...") 109 | 110 | def stop_progress(self): 111 | if self.config.options.progress: 112 | print("Crawling Done...\n") 113 | 114 | def progress(self, page_crawl, done_size, queue_size): 115 | if not self.config.options.progress: 116 | 
return 117 | 118 | total = done_size + queue_size 119 | percent = float(done_size) / float(total) * 100.0 120 | 121 | url = "" 122 | if page_crawl.final_url_split: 123 | url = page_crawl.final_url_split.geturl() 124 | elif page_crawl.original_url_split: 125 | url = page_crawl.original_url_split.geturl() 126 | 127 | status = page_crawl.status 128 | if not status: 129 | status = "error" 130 | 131 | print("{0} - {1} ({2} of {3} - {4:.0f}%)".format( 132 | status, url, done_size, total, percent)) 133 | 134 | def build_queue(self, config): 135 | """Returns an object implementing the Queue interface.""" 136 | raise NotImplementedError() 137 | 138 | def get_workers(self, config, worker_init): 139 | """Returns a sequence of workers of the desired type.""" 140 | raise NotImplementedError() 141 | 142 | def start_workers(self, workers, input_queue, output_queue): 143 | """Start the workers.""" 144 | raise NotImplementedError() 145 | 146 | def stop_workers(self, workers, input_queue, output_queue): 147 | """Stops the workers.""" 148 | for worker in workers: 149 | input_queue.put(WORK_DONE) 150 | 151 | def process_page_crawl(self, page_crawl): 152 | """Returns a sequence of SplitResult to crawl.""" 153 | return self.site.add_crawled_page(page_crawl) 154 | 155 | 156 | class ThreadSiteCrawler(SiteCrawler): 157 | """Site Crawler with thread workers.""" 158 | 159 | def build_queue(self, config): 160 | return compat.Queue.Queue() 161 | 162 | def get_workers(self, config, worker_init): 163 | from threading import Thread 164 | workers = [] 165 | for _ in range(config.worker_size): 166 | workers.append( 167 | Thread(target=crawl_page, kwargs={'worker_init': worker_init})) 168 | 169 | return workers 170 | 171 | def start_workers(self, workers, input_queue, output_queue): 172 | for worker in workers: 173 | worker.start() 174 | 175 | 176 | class ProcessSiteCrawler(SiteCrawler): 177 | """Site Crawler with process workers.""" 178 | 179 | def __init__(self, *args, **kwargs): 180 | import multiprocessing 181 | self.manager = multiprocessing.Manager() 182 | self.ProcessClass = multiprocessing.Process 183 | super(ProcessSiteCrawler, self).__init__(*args, **kwargs) 184 | 185 | def build_logger(self): 186 | """We do not want to share a logger.""" 187 | return None 188 | 189 | def build_queue(self, config): 190 | return self.manager.Queue() 191 | 192 | def get_workers(self, config, worker_init): 193 | workers = [] 194 | for _ in range(config.worker_size): 195 | workers.append(self.ProcessClass( 196 | target=crawl_page, kwargs={'worker_init': worker_init})) 197 | 198 | return workers 199 | 200 | def start_workers(self, workers, input_queue, output_queue): 201 | for worker in workers: 202 | worker.start() 203 | 204 | 205 | class GreenSiteCrawler(SiteCrawler): 206 | """Site Crawler with green thread workers.""" 207 | 208 | def __init__(self, *args, **kwargs): 209 | from gevent import monkey, queue, Greenlet 210 | # TODO thread=false should be used to remove useless exception 211 | # But weird behavior sometimes happen when it is not patched... 
212 | monkey.patch_all() 213 | self.QueueClass = queue.Queue 214 | self.GreenClass = Greenlet 215 | super(GreenSiteCrawler, self).__init__(*args, **kwargs) 216 | 217 | def build_queue(self, config): 218 | return self.QueueClass() 219 | 220 | def get_workers(self, config, worker_init): 221 | workers = [] 222 | for _ in range(config.worker_size): 223 | workers.append(self.GreenClass( 224 | crawl_page, worker_init=worker_init)) 225 | 226 | return workers 227 | 228 | def start_workers(self, workers, input_queue, output_queue): 229 | for worker in workers: 230 | worker.start() 231 | 232 | 233 | class PageCrawler(object): 234 | """Worker that parses a page and extracts links""" 235 | 236 | def __init__(self, worker_init): 237 | self.worker_config = worker_init.worker_config 238 | self.input_queue = worker_init.input_queue 239 | self.output_queue = worker_init.output_queue 240 | self.urlopen = get_url_open() 241 | self.request_class = get_url_request() 242 | self.logger = worker_init.logger 243 | if not self.logger: 244 | # Get a new one! 245 | self.logger = get_logger() 246 | 247 | # We do this here to allow patching by gevent 248 | import socket 249 | self.timeout_exception = socket.timeout 250 | 251 | self.auth_header = None 252 | 253 | if self.worker_config.username and self.worker_config.password: 254 | base64string = unicode( 255 | base64.encodestring( 256 | '{0}:{1}'.format( 257 | self.worker_config.username, 258 | self.worker_config.password) 259 | .encode("utf-8")), "utf-8") 260 | self.auth_header = ("Authorization", 261 | "Basic {0}".format(base64string)) 262 | 263 | def crawl_page_forever(self): 264 | """Starts page crawling loop for this worker.""" 265 | 266 | while True: 267 | worker_input = self.input_queue.get() 268 | 269 | if worker_input == WORK_DONE: 270 | # No more work! Pfew! 271 | return 272 | else: 273 | page_crawl = self._crawl_page(worker_input) 274 | self.output_queue.put(page_crawl) 275 | 276 | def _crawl_page(self, worker_input): 277 | page_crawl = None 278 | erroneous_content = [] 279 | missing_content = [] 280 | url_split_to_crawl = worker_input.url_split 281 | 282 | try: 283 | response = open_url( 284 | self.urlopen, self.request_class, 285 | url_split_to_crawl.geturl(), self.worker_config.timeout, 286 | self.timeout_exception, self.auth_header, 287 | extra_headers=self.worker_config.extra_headers, 288 | logger=self.logger) 289 | 290 | if response.exception: 291 | if response.status: 292 | # This is a http error. Good. 293 | page_crawl = PageCrawl( 294 | original_url_split=url_split_to_crawl, 295 | final_url_split=None, status=response.status, 296 | is_timeout=False, is_redirect=False, links=[], 297 | exception=None, is_html=False, 298 | depth=worker_input.depth, 299 | response_time=response.response_time, 300 | process_time=None, 301 | site_origin=worker_input.site_origin) 302 | elif response.is_timeout: 303 | # This is a timeout. 
No need to wrap the exception 304 | page_crawl = PageCrawl( 305 | original_url_split=url_split_to_crawl, 306 | final_url_split=None, status=None, 307 | is_timeout=True, is_redirect=False, links=[], 308 | exception=None, is_html=False, 309 | depth=worker_input.depth, 310 | response_time=response.response_time, 311 | process_time=0, 312 | site_origin=worker_input.site_origin) 313 | else: 314 | # Something bad happened when opening the url 315 | exception = ExceptionStr( 316 | unicode(type(response.exception)), 317 | unicode(response.exception)) 318 | page_crawl = PageCrawl( 319 | original_url_split=url_split_to_crawl, 320 | final_url_split=None, status=None, 321 | is_timeout=False, is_redirect=False, links=[], 322 | exception=exception, is_html=False, 323 | depth=worker_input.depth, 324 | response_time=response.response_time, 325 | process_time=0, 326 | site_origin=worker_input.site_origin) 327 | else: 328 | final_url_split = get_clean_url_split(response.final_url) 329 | 330 | message = response.content.info() 331 | mime_type = get_content_type(message) 332 | if self.worker_config.prefer_server_encoding: 333 | charset = get_charset(message) 334 | else: 335 | charset = None 336 | links = [] 337 | 338 | is_html = mime_type == HTML_MIME_TYPE 339 | process_time = None 340 | 341 | if is_html and worker_input.should_crawl: 342 | start = time.time() 343 | html_soup = BeautifulSoup( 344 | response.content, self.worker_config.parser, 345 | from_encoding=charset) 346 | links = self.get_links(html_soup, final_url_split) 347 | if self._has_content_to_check(worker_input): 348 | (missing_content, erroneous_content) =\ 349 | self.check_content( 350 | unicode(html_soup), html_soup, 351 | url_split_to_crawl, 352 | final_url_split, worker_input.content_check) 353 | process_time = time.time() - start 354 | else: 355 | self.logger.debug( 356 | "Won't crawl %s. MIME Type: %s. 
Should crawl: %s", 357 | final_url_split, mime_type, 358 | worker_input.should_crawl) 359 | if self._has_content_to_check(worker_input): 360 | text_content = self.get_text_content( 361 | response.content.read(), charset) 362 | (missing_content, erroneous_content) =\ 363 | self.check_content( 364 | text_content, None, url_split_to_crawl, 365 | final_url_split, worker_input.content_check) 366 | 367 | page_crawl = PageCrawl( 368 | original_url_split=url_split_to_crawl, 369 | final_url_split=final_url_split, status=response.status, 370 | is_timeout=False, is_redirect=response.is_redirect, 371 | links=links, exception=None, is_html=is_html, 372 | depth=worker_input.depth, 373 | response_time=response.response_time, 374 | process_time=process_time, 375 | site_origin=worker_input.site_origin, 376 | missing_content=missing_content, 377 | erroneous_content=erroneous_content) 378 | except Exception as exc: 379 | exception = ExceptionStr(unicode(type(exc)), unicode(exc)) 380 | page_crawl = PageCrawl( 381 | original_url_split=url_split_to_crawl, 382 | final_url_split=None, status=None, 383 | is_timeout=False, is_redirect=False, links=[], 384 | exception=exception, is_html=False, 385 | depth=worker_input.depth, 386 | response_time=None, 387 | process_time=None, 388 | site_origin=worker_input.site_origin) 389 | self.logger.exception("Exception occurred while crawling a page.") 390 | 391 | return page_crawl 392 | 393 | def _has_content_to_check(self, worker_input): 394 | return worker_input.content_check and\ 395 | worker_input.content_check.has_something_to_check 396 | 397 | def get_text_content(self, binary_blob, charset): 398 | """Retrieves unicode content from response binary blob. 399 | """ 400 | override_encodings = [] 401 | if charset: 402 | override_encodings.append(charset) 403 | 404 | return UnicodeDammit(binary_blob, override_encodings).unicode_markup 405 | 406 | def check_content( 407 | self, response_content, html_soup, original_url_split, 408 | final_url_split, content_check): 409 | """Ensures that the specified content is present (or absent). 410 | """ 411 | missing_content = [] 412 | erroneous_content = [] 413 | 414 | if html_soup: 415 | for content, found in self.check_html_content_single( 416 | content_check.html_presence, html_soup, original_url_split, 417 | final_url_split): 418 | if not found: 419 | missing_content.append(content) 420 | 421 | if html_soup: 422 | for content, found in self.check_html_content_single( 423 | content_check.html_absence, html_soup, original_url_split, 424 | final_url_split): 425 | if found: 426 | erroneous_content.append(content) 427 | 428 | for content, found in self.check_text_content_single( 429 | content_check.text_presence, response_content, 430 | original_url_split, final_url_split): 431 | if not found: 432 | missing_content.append(content) 433 | 434 | for content, found in self.check_text_content_single( 435 | content_check.text_absence, response_content, 436 | original_url_split, final_url_split): 437 | if found: 438 | erroneous_content.append(content) 439 | 440 | return (missing_content, erroneous_content) 441 | 442 | def check_html_content_single( 443 | self, html_to_check, html_soup, original_url_split, 444 | final_url_split): 445 | """Returns a list of tuple (content, presence) indicating whether an 446 | html tag was present or not in the source. 
447 | """ 448 | content = [] 449 | 450 | for key, html_check_list in html_to_check.items(): 451 | if key == PREFIX_ALL or\ 452 | is_similar_url_split(key, original_url_split) or\ 453 | is_similar_url_split(key, final_url_split): 454 | # we check 455 | for html_check in html_check_list: 456 | kwargs = {} 457 | if html_check.attrs: 458 | kwargs["attrs"] = html_check.attrs 459 | if html_check.content: 460 | # XXX Use text because the included bs4 does not use 461 | # the new string parameter and text is backward 462 | # compatible. 463 | kwargs["text"] = html_check.content 464 | found = html_soup.find( 465 | html_check.tag, **kwargs) is not None 466 | content.append((str(html_check), found)) 467 | 468 | return content 469 | 470 | def check_text_content_single( 471 | self, text_content_to_check, full_text, original_url_split, 472 | final_url_split): 473 | """Returns a list of tuple (content, presence) indicating whether an 474 | html tag was present or not in the source. 475 | """ 476 | content = [] 477 | 478 | for key, text_check_list in text_content_to_check.items(): 479 | if key == PREFIX_ALL or\ 480 | is_similar_url_split(key, original_url_split) or\ 481 | is_similar_url_split(key, final_url_split): 482 | # we check 483 | for text_check in text_check_list: 484 | try: 485 | match = text_check.search(full_text) 486 | content.append((text_check.pattern, match is not None)) 487 | except AttributeError: 488 | found = text_check in full_text 489 | content.append((text_check, found)) 490 | 491 | return content 492 | 493 | def get_links(self, html_soup, original_url_split): 494 | """Gets links for desired types (e.g., a, link, img, script) 495 | 496 | :param html_soup: The page parsed by BeautifulSoup 497 | :param original_url_split: The URL of the page used to resolve relative 498 | links. 499 | :rtype: A sequence of Link objects 500 | """ 501 | 502 | # This is a weird html tag that defines the base URL of a page. 503 | base_url_split = original_url_split 504 | 505 | bases = html_soup.find_all('base') 506 | if bases: 507 | base = bases[0] 508 | if 'href' in base.attrs: 509 | base_url_split = get_clean_url_split(base['href']) 510 | 511 | links = [] 512 | for element_type in self.worker_config.types: 513 | if element_type not in TYPE_ATTRIBUTES: 514 | raise Exception( 515 | "Unknown element type: {0}".format(element_type)) 516 | attribute = TYPE_ATTRIBUTES[element_type] 517 | element_links = html_soup.find_all(element_type) 518 | links.extend(self._get_links( 519 | element_links, attribute, base_url_split, original_url_split)) 520 | return links 521 | 522 | def _get_links(self, elements, attribute, base_url_split, 523 | original_url_split): 524 | links = [] 525 | for element in elements: 526 | if attribute in element.attrs: 527 | url = element[attribute] 528 | 529 | if not self.worker_config.strict_mode: 530 | url = url.strip() 531 | 532 | if not is_link(url): 533 | continue 534 | abs_url_split = get_absolute_url_split(url, base_url_split) 535 | 536 | if not is_supported_scheme( 537 | abs_url_split, self.worker_config.ignore_bad_tel_urls): 538 | continue 539 | 540 | link = Link( 541 | type=unicode(element.name), url_split=abs_url_split, 542 | original_url_split=original_url_split, 543 | source_str=unicode(element)) 544 | links.append(link) 545 | 546 | return links 547 | 548 | 549 | class Site(UTF8Class): 550 | """Contains all the visited and visiting pages of a site. 551 | 552 | This class is NOT thread-safe and should only be accessed by one thread at 553 | a time! 
554 | """ 555 | 556 | def __init__(self, start_url_splits, config, logger=None): 557 | self.start_url_splits = start_url_splits 558 | 559 | self.pages = {} 560 | """Map of url:SitePage""" 561 | 562 | self.multi_pages = defaultdict(dict) 563 | """Map of netloc:map(url:SitePage). Only used in multi sites mode.""" 564 | 565 | self.error_pages = {} 566 | """Map of url:SitePage with is_ok=False""" 567 | 568 | self.multi_error_pages = defaultdict(dict) 569 | """Map of netloc:map(url:SitePage). Only used in multi sites 570 | mode.""" 571 | 572 | self.page_statuses = {} 573 | """Map of url:PageStatus (PAGE_QUEUED, PAGE_CRAWLED)""" 574 | 575 | self.config = config 576 | 577 | self.logger = logger 578 | 579 | for start_url_split in self.start_url_splits: 580 | self.page_statuses[start_url_split] = PageStatus(PAGE_QUEUED, []) 581 | 582 | def collect_multi_sites(self): 583 | """Collects page results and maps them to their respective domain in 584 | multi_pages and multi_error_pages. 585 | """ 586 | for url, page in self.pages.items(): 587 | self.multi_pages[page.site_origin][url] = page 588 | 589 | for url, page in self.error_pages.items(): 590 | self.multi_error_pages[page.site_origin][url] = page 591 | 592 | @property 593 | def is_ok(self): 594 | """Returns True if there is no error page.""" 595 | return len(self.error_pages) == 0 596 | 597 | def add_crawled_page(self, page_crawl): 598 | """Adds a crawled page. Returns a list of url split to crawl""" 599 | if page_crawl.original_url_split not in self.page_statuses: 600 | self.logger.warning("Original URL not seen before!") 601 | return [] 602 | 603 | status = self.page_statuses[page_crawl.original_url_split] 604 | 605 | # Mark it as crawled 606 | self.page_statuses[page_crawl.original_url_split] = PageStatus( 607 | PAGE_CRAWLED, None) 608 | 609 | if page_crawl.original_url_split in self.pages: 610 | self.logger.warning( 611 | "Original URL already crawled! Concurrency issue!") 612 | return [] 613 | 614 | final_url_split = page_crawl.final_url_split 615 | if not final_url_split: 616 | # Happens on 404/500/timeout/error 617 | final_url_split = page_crawl.original_url_split 618 | 619 | if final_url_split in self.pages: 620 | # This means that we already processed this final page. 621 | # It's a redirect. Just add a source 622 | site_page = self.pages[final_url_split] 623 | site_page.add_sources(status.sources) 624 | else: 625 | # We never crawled this page before 626 | is_local = self.config.is_local(final_url_split) 627 | site_page = SitePage( 628 | final_url_split, page_crawl.status, 629 | page_crawl.is_timeout, page_crawl.exception, 630 | page_crawl.is_html, is_local, 631 | response_time=page_crawl.response_time, 632 | process_time=page_crawl.process_time, 633 | site_origin=page_crawl.site_origin, 634 | missing_content=page_crawl.missing_content, 635 | erroneous_content=page_crawl.erroneous_content) 636 | site_page.add_sources(status.sources) 637 | self.pages[final_url_split] = site_page 638 | 639 | if not site_page.is_ok: 640 | self.error_pages[final_url_split] = site_page 641 | 642 | return self.process_links(page_crawl) 643 | 644 | def process_links(self, page_crawl): 645 | links_to_process = [] 646 | 647 | source_url_split = page_crawl.original_url_split 648 | if page_crawl.final_url_split: 649 | source_url_split = page_crawl.final_url_split 650 | 651 | for link in page_crawl.links: 652 | url_split = link.url_split 653 | if not self.config.should_download(url_split): 654 | self.logger.debug( 655 | "Won't download %s. Is local? 
%s", 656 | url_split, 657 | LazyLogParam(lambda: self.config.is_local(url_split))) 658 | continue 659 | 660 | page_status = self.page_statuses.get(url_split, None) 661 | page_source = PageSource(source_url_split, link.source_str) 662 | 663 | if not page_status: 664 | # We never encountered this url before 665 | self.page_statuses[url_split] = PageStatus( 666 | PAGE_QUEUED, [page_source]) 667 | should_crawl = self.config.should_crawl( 668 | url_split, page_crawl.depth) 669 | links_to_process.append(WorkerInput( 670 | url_split, should_crawl, page_crawl.depth + 1, 671 | page_crawl.site_origin, self.config.content_check)) 672 | elif page_status.status == PAGE_CRAWLED: 673 | # Already crawled. Add source 674 | if url_split in self.pages: 675 | self.pages[url_split].add_sources([page_source]) 676 | else: 677 | # TODO the final url is different. need a way to link it... 678 | pass 679 | elif page_status.status == PAGE_QUEUED: 680 | # Already queued for crawling. Add source. 681 | page_status.sources.append(page_source) 682 | 683 | return links_to_process 684 | 685 | def get_average_response_time(self): 686 | """Computes the average response time of pages that returned an HTTP 687 | code (good or bad). Exceptions such as timeout are ignored. 688 | """ 689 | response_time_sum = 0 690 | total = 0 691 | for page in self.pages.values(): 692 | if page.response_time is not None: 693 | response_time_sum += page.response_time 694 | total += 1 695 | 696 | if total > 0: 697 | return float(response_time_sum) / float(total) 698 | else: 699 | return 0 700 | 701 | def get_average_process_time(self): 702 | """Computes the average process (parse) time of pages that returned an HTTP 703 | code (good or bad). Exceptions are ignored. 704 | """ 705 | process_time_sum = 0 706 | total = 0 707 | for page in self.pages.values(): 708 | if page.process_time is not None: 709 | process_time_sum += page.process_time 710 | total += 1 711 | 712 | if total > 0: 713 | return float(process_time_sum) / float(total) 714 | else: 715 | return 0 716 | 717 | def __unicode__(self): 718 | return "Site for {0}".format(self.start_url_splits) 719 | 720 | 721 | def crawl_page(worker_init): 722 | """Safe redirection to the page crawler""" 723 | page_crawler = PageCrawler(worker_init) 724 | page_crawler.crawl_page_forever() 725 | 726 | 727 | def open_url(open_func, request_class, url, timeout, timeout_exception, 728 | auth_header=None, extra_headers=None, logger=None): 729 | """Opens a URL and returns a Response object. 
730 | 731 | All parameters are required to be able to use a patched version of the 732 | Python standard library (i.e., patched by gevent) 733 | 734 | :param open_func: url open function, typicaly urllib2.urlopen 735 | :param request_class: the request class to use 736 | :param url: the url to open 737 | :param timeout: number of seconds to wait before timing out 738 | :param timeout_exception: the exception thrown by open_func if a timeout 739 | occurs 740 | :param auth_header: authentication header 741 | :param extra_headers: dict of {Header: Value} 742 | :param logger: logger used to log exceptions 743 | :rtype: A Response object 744 | """ 745 | try: 746 | request = request_class(url) 747 | 748 | if auth_header: 749 | request.add_header(auth_header[0], auth_header[1]) 750 | 751 | if extra_headers: 752 | for header, value in extra_headers.items(): 753 | request.add_header(header, value) 754 | 755 | start = time.time() 756 | output_value = open_func(request, timeout=timeout) 757 | stop = time.time() 758 | final_url = output_value.geturl() 759 | code = output_value.getcode() 760 | response = Response( 761 | content=output_value, status=code, exception=None, 762 | original_url=url, final_url=final_url, 763 | is_redirect=final_url != url, is_timeout=False, 764 | response_time=stop-start) 765 | except HTTPError as http_error: 766 | stop = time.time() 767 | code = http_error.code 768 | response = Response( 769 | content=None, status=code, exception=http_error, 770 | original_url=url, final_url=None, is_redirect=False, 771 | is_timeout=False, response_time=stop-start) 772 | except timeout_exception as t_exception: 773 | response = Response( 774 | content=None, status=None, exception=t_exception, 775 | original_url=url, final_url=None, is_redirect=False, 776 | is_timeout=True, response_time=None) 777 | except Exception as exc: 778 | if logger: 779 | logger.warning("Exception while opening an URL", exc_info=True) 780 | response = Response( 781 | content=None, status=None, exception=exc, 782 | original_url=url, final_url=None, is_redirect=False, 783 | is_timeout=False, response_time=None) 784 | 785 | return response 786 | 787 | 788 | def execute_from_command_line(): 789 | """Runs the crawler and retrieves the configuration from the command 790 | line. 
791 | """ 792 | try: 793 | start = time.time() 794 | config = Config() 795 | config.parse_cli_config() 796 | 797 | logger = configure_logger(config) 798 | crawler = execute_from_config(config, logger) 799 | 800 | stop = time.time() 801 | 802 | if not crawler.site.is_ok or config.options.when == WHEN_ALWAYS: 803 | report(crawler.site, config, stop - start, logger) 804 | 805 | if not crawler.site.is_ok: 806 | sys.exit(1) 807 | except Exception as e: 808 | print(e) 809 | sys.exit(1) 810 | 811 | 812 | def configure_logger(config): 813 | """Configures a logger based on the configuration.""" 814 | if config.options.verbose == VERBOSE_QUIET: 815 | logging.basicConfig(level=logging.CRITICAL) 816 | elif config.options.verbose == VERBOSE_NORMAL: 817 | logging.basicConfig(level=logging.WARNING) 818 | else: 819 | logging.basicConfig(level=logging.DEBUG) 820 | 821 | logger = get_logger() 822 | 823 | return logger 824 | 825 | 826 | def execute_from_config(config, logger): 827 | """Executes a crawler given a config and logger.""" 828 | if not config.start_urls: 829 | raise Exception("At least one starting URL must be supplied.") 830 | 831 | if config.options.allow_insecure_content: 832 | # Ref: https://www.python.org/dev/peps/pep-0476/#opting-out 833 | import ssl 834 | try: 835 | _create_unverified_https_context = ssl._create_unverified_context 836 | except AttributeError: 837 | # Legacy Python that doesn't verify HTTPS certificates by default 838 | pass 839 | else: 840 | # Handle target environment that doesn't support HTTPS verification 841 | ssl._create_default_https_context = _create_unverified_https_context 842 | 843 | if config.options.mode == MODE_THREAD: 844 | crawler = ThreadSiteCrawler(config, logger) 845 | elif config.options.mode == MODE_PROCESS: 846 | crawler = ProcessSiteCrawler(config, logger) 847 | elif config.options.mode == MODE_GREEN: 848 | crawler = GreenSiteCrawler(config, logger) 849 | 850 | if not crawler: 851 | raise Exception("Invalid crawling mode supplied.") 852 | 853 | crawler.crawl() 854 | 855 | if config.options.multi: 856 | crawler.site.collect_multi_sites() 857 | 858 | return crawler 859 | -------------------------------------------------------------------------------- /pylinkvalidator/included/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartdag/pylinkvalidator/aac5934d88a9c99d0e4f40a8884ad942b6b10ea0/pylinkvalidator/included/__init__.py -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/__init__.py: -------------------------------------------------------------------------------- 1 | """Beautiful Soup 2 | Elixir and Tonic 3 | "The Screen-Scraper's Friend" 4 | http://www.crummy.com/software/BeautifulSoup/ 5 | 6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a 7 | (possibly invalid) document into a tree representation. Beautiful Soup 8 | provides provides methods and Pythonic idioms that make it easy to 9 | navigate, search, and modify the parse tree. 10 | 11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml 12 | and/or html5lib is installed. 
13 | 14 | For more than you ever wanted to know about Beautiful Soup, see the 15 | documentation: 16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 17 | """ 18 | 19 | from __future__ import absolute_import 20 | import sys 21 | 22 | __author__ = "Leonard Richardson (leonardr@segfault.org)" 23 | __version__ = "4.2.1" 24 | __copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" 25 | __license__ = "MIT" 26 | 27 | 28 | use_system_version = False 29 | 30 | try: 31 | # The system-installed version has priority providing it is not an 32 | # earlier version. The embedded bs4 only works for Python 2. 33 | import bs4 34 | if (bs4.__version__.split('.') >= __version__.split('.')) or\ 35 | sys.version_info[0] >= 3: 36 | from bs4 import * 37 | 38 | # Necessary for direct import in pylinkvalidator 39 | UnicodeDammit = bs4.UnicodeDammit 40 | use_system_version = True 41 | # Make sure we copy over the version. See #17071 42 | __version__ = bs4.__version__ 43 | except ImportError: 44 | if sys.version_info[0] >= 3: 45 | raise 46 | 47 | if not use_system_version: 48 | 49 | __all__ = ['BeautifulSoup'] 50 | 51 | import re 52 | import warnings 53 | 54 | from .builder import builder_registry 55 | from .dammit import UnicodeDammit 56 | from .element import ( 57 | CData, 58 | Comment, 59 | DEFAULT_OUTPUT_ENCODING, 60 | Declaration, 61 | Doctype, 62 | NavigableString, 63 | PageElement, 64 | ProcessingInstruction, 65 | ResultSet, 66 | SoupStrainer, 67 | Tag, 68 | ) 69 | 70 | # The very first thing we do is give a useful error if someone is 71 | # running this code under Python 3 without converting it. 72 | syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' 73 | 74 | class BeautifulSoup(Tag): 75 | """ 76 | This class defines the basic interface called by the tree builders. 77 | 78 | These methods will be called by the parser: 79 | reset() 80 | feed(markup) 81 | 82 | The tree builder may call these methods from its feed() implementation: 83 | handle_starttag(name, attrs) # See note about return value 84 | handle_endtag(name) 85 | handle_data(data) # Appends to the current data node 86 | endData(containerClass=NavigableString) # Ends the current data node 87 | 88 | No matter how complicated the underlying parser is, you should be 89 | able to build a tree using 'start tag' events, 'end tag' events, 90 | 'data' events, and "done with data" events. 91 | 92 | If you encounter an empty-element tag (aka a self-closing tag, 93 | like HTML's
tag), call handle_starttag and then 94 | handle_endtag. 95 | """ 96 | ROOT_TAG_NAME = u'[document]' 97 | 98 | # If the end-user gives no indication which tree builder they 99 | # want, look for one with these features. 100 | DEFAULT_BUILDER_FEATURES = ['html', 'fast'] 101 | 102 | # Used when determining whether a text node is all whitespace and 103 | # can be replaced with a single space. A text node that contains 104 | # fancy Unicode spaces (usually non-breaking) should be left 105 | # alone. 106 | STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, } 107 | 108 | def __init__(self, markup="", features=None, builder=None, 109 | parse_only=None, from_encoding=None, **kwargs): 110 | """The Soup object is initialized as the 'root tag', and the 111 | provided markup (which can be a string or a file-like object) 112 | is fed into the underlying parser.""" 113 | 114 | if 'convertEntities' in kwargs: 115 | warnings.warn( 116 | "BS4 does not respect the convertEntities argument to the " 117 | "BeautifulSoup constructor. Entities are always converted " 118 | "to Unicode characters.") 119 | 120 | if 'markupMassage' in kwargs: 121 | del kwargs['markupMassage'] 122 | warnings.warn( 123 | "BS4 does not respect the markupMassage argument to the " 124 | "BeautifulSoup constructor. The tree builder is responsible " 125 | "for any necessary markup massage.") 126 | 127 | if 'smartQuotesTo' in kwargs: 128 | del kwargs['smartQuotesTo'] 129 | warnings.warn( 130 | "BS4 does not respect the smartQuotesTo argument to the " 131 | "BeautifulSoup constructor. Smart quotes are always converted " 132 | "to Unicode characters.") 133 | 134 | if 'selfClosingTags' in kwargs: 135 | del kwargs['selfClosingTags'] 136 | warnings.warn( 137 | "BS4 does not respect the selfClosingTags argument to the " 138 | "BeautifulSoup constructor. The tree builder is responsible " 139 | "for understanding self-closing tags.") 140 | 141 | if 'isHTML' in kwargs: 142 | del kwargs['isHTML'] 143 | warnings.warn( 144 | "BS4 does not respect the isHTML argument to the " 145 | "BeautifulSoup constructor. You can pass in features='html' " 146 | "or features='xml' to get a builder capable of handling " 147 | "one or the other.") 148 | 149 | def deprecated_argument(old_name, new_name): 150 | if old_name in kwargs: 151 | warnings.warn( 152 | 'The "%s" argument to the BeautifulSoup constructor ' 153 | 'has been renamed to "%s."' % (old_name, new_name)) 154 | value = kwargs[old_name] 155 | del kwargs[old_name] 156 | return value 157 | return None 158 | 159 | parse_only = parse_only or deprecated_argument( 160 | "parseOnlyThese", "parse_only") 161 | 162 | from_encoding = from_encoding or deprecated_argument( 163 | "fromEncoding", "from_encoding") 164 | 165 | if len(kwargs) > 0: 166 | arg = kwargs.keys().pop() 167 | raise TypeError( 168 | "__init__() got an unexpected keyword argument '%s'" % arg) 169 | 170 | if builder is None: 171 | if isinstance(features, basestring): 172 | features = [features] 173 | if features is None or len(features) == 0: 174 | features = self.DEFAULT_BUILDER_FEATURES 175 | builder_class = builder_registry.lookup(*features) 176 | if builder_class is None: 177 | raise FeatureNotFound( 178 | "Couldn't find a tree builder with the features you " 179 | "requested: %s. Do you need to install a parser library?" 
180 | % ",".join(features)) 181 | builder = builder_class() 182 | self.builder = builder 183 | self.is_xml = builder.is_xml 184 | self.builder.soup = self 185 | 186 | self.parse_only = parse_only 187 | 188 | self.reset() 189 | 190 | if hasattr(markup, 'read'): # It's a file-type object. 191 | markup = markup.read() 192 | (self.markup, self.original_encoding, self.declared_html_encoding, 193 | self.contains_replacement_characters) = ( 194 | self.builder.prepare_markup(markup, from_encoding)) 195 | 196 | try: 197 | self._feed() 198 | except StopParsing: 199 | pass 200 | 201 | # Clear out the markup and remove the builder's circular 202 | # reference to this object. 203 | self.markup = None 204 | self.builder.soup = None 205 | 206 | def _feed(self): 207 | # Convert the document to Unicode. 208 | self.builder.reset() 209 | 210 | self.builder.feed(self.markup) 211 | # Close out any unfinished strings and close all the open tags. 212 | self.endData() 213 | while self.currentTag.name != self.ROOT_TAG_NAME: 214 | self.popTag() 215 | 216 | def reset(self): 217 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) 218 | self.hidden = 1 219 | self.builder.reset() 220 | self.currentData = [] 221 | self.currentTag = None 222 | self.tagStack = [] 223 | self.pushTag(self) 224 | 225 | def new_tag(self, name, namespace=None, nsprefix=None, **attrs): 226 | """Create a new tag associated with this soup.""" 227 | return Tag(None, self.builder, name, namespace, nsprefix, attrs) 228 | 229 | def new_string(self, s, subclass=NavigableString): 230 | """Create a new NavigableString associated with this soup.""" 231 | navigable = subclass(s) 232 | navigable.setup() 233 | return navigable 234 | 235 | def insert_before(self, successor): 236 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 237 | 238 | def insert_after(self, successor): 239 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().") 240 | 241 | def popTag(self): 242 | tag = self.tagStack.pop() 243 | #print "Pop", tag.name 244 | if self.tagStack: 245 | self.currentTag = self.tagStack[-1] 246 | return self.currentTag 247 | 248 | def pushTag(self, tag): 249 | #print "Push", tag.name 250 | if self.currentTag: 251 | self.currentTag.contents.append(tag) 252 | self.tagStack.append(tag) 253 | self.currentTag = self.tagStack[-1] 254 | 255 | def endData(self, containerClass=NavigableString): 256 | if self.currentData: 257 | currentData = u''.join(self.currentData) 258 | if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and 259 | not set([tag.name for tag in self.tagStack]).intersection( 260 | self.builder.preserve_whitespace_tags)): 261 | if '\n' in currentData: 262 | currentData = '\n' 263 | else: 264 | currentData = ' ' 265 | self.currentData = [] 266 | if self.parse_only and len(self.tagStack) <= 1 and \ 267 | (not self.parse_only.text or \ 268 | not self.parse_only.search(currentData)): 269 | return 270 | o = containerClass(currentData) 271 | self.object_was_parsed(o) 272 | 273 | def object_was_parsed(self, o, parent=None, most_recent_element=None): 274 | """Add an object to the parse tree.""" 275 | parent = parent or self.currentTag 276 | most_recent_element = most_recent_element or self._most_recent_element 277 | o.setup(parent, most_recent_element) 278 | if most_recent_element is not None: 279 | most_recent_element.next_element = o 280 | self._most_recent_element = o 281 | parent.contents.append(o) 282 | 283 | def _popToTag(self, name, nsprefix=None, inclusivePop=True): 284 | """Pops 
the tag stack up to and including the most recent 285 | instance of the given tag. If inclusivePop is false, pops the tag 286 | stack up to but *not* including the most recent instqance of 287 | the given tag.""" 288 | #print "Popping to %s" % name 289 | if name == self.ROOT_TAG_NAME: 290 | return 291 | 292 | numPops = 0 293 | mostRecentTag = None 294 | 295 | for i in range(len(self.tagStack) - 1, 0, -1): 296 | if (name == self.tagStack[i].name 297 | and nsprefix == self.tagStack[i].prefix): 298 | numPops = len(self.tagStack) - i 299 | break 300 | if not inclusivePop: 301 | numPops = numPops - 1 302 | 303 | for i in range(0, numPops): 304 | mostRecentTag = self.popTag() 305 | return mostRecentTag 306 | 307 | def handle_starttag(self, name, namespace, nsprefix, attrs): 308 | """Push a start tag on to the stack. 309 | 310 | If this method returns None, the tag was rejected by the 311 | SoupStrainer. You should proceed as if the tag had not occured 312 | in the document. For instance, if this was a self-closing tag, 313 | don't call handle_endtag. 314 | """ 315 | 316 | # print "Start tag %s: %s" % (name, attrs) 317 | self.endData() 318 | 319 | if (self.parse_only and len(self.tagStack) <= 1 320 | and (self.parse_only.text 321 | or not self.parse_only.search_tag(name, attrs))): 322 | return None 323 | 324 | tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, 325 | self.currentTag, self._most_recent_element) 326 | if tag is None: 327 | return tag 328 | if self._most_recent_element: 329 | self._most_recent_element.next_element = tag 330 | self._most_recent_element = tag 331 | self.pushTag(tag) 332 | return tag 333 | 334 | def handle_endtag(self, name, nsprefix=None): 335 | #print "End tag: " + name 336 | self.endData() 337 | self._popToTag(name, nsprefix) 338 | 339 | def handle_data(self, data): 340 | self.currentData.append(data) 341 | 342 | def decode(self, pretty_print=False, 343 | eventual_encoding=DEFAULT_OUTPUT_ENCODING, 344 | formatter="minimal"): 345 | """Returns a string or Unicode representation of this document. 346 | To get Unicode, pass None for encoding.""" 347 | 348 | if self.is_xml: 349 | # Print the XML declaration 350 | encoding_part = '' 351 | if eventual_encoding != None: 352 | encoding_part = ' encoding="%s"' % eventual_encoding 353 | prefix = u'\n' % encoding_part 354 | else: 355 | prefix = u'' 356 | if not pretty_print: 357 | indent_level = None 358 | else: 359 | indent_level = 0 360 | return prefix + super(BeautifulSoup, self).decode( 361 | indent_level, eventual_encoding, formatter) 362 | 363 | # Alias to make it easier to type import: 'from bs4 import _soup' 364 | _s = BeautifulSoup 365 | _soup = BeautifulSoup 366 | 367 | class BeautifulStoneSoup(BeautifulSoup): 368 | """Deprecated interface to an XML parser.""" 369 | 370 | def __init__(self, *args, **kwargs): 371 | kwargs['features'] = 'xml' 372 | warnings.warn( 373 | 'The BeautifulStoneSoup class is deprecated. Instead of using ' 374 | 'it, pass features="xml" into the BeautifulSoup constructor.') 375 | super(BeautifulStoneSoup, self).__init__(*args, **kwargs) 376 | 377 | 378 | class StopParsing(Exception): 379 | pass 380 | 381 | 382 | class FeatureNotFound(ValueError): 383 | pass 384 | 385 | 386 | #By default, act as an HTML pretty-printer. 
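# Outside of this command-line pretty-printing mode, the class above is used
# programmatically. A minimal sketch (assuming the embedded copy is importable
# as pylinkvalidator.included.bs4) of pulling link targets out of a page:
#
#   from pylinkvalidator.included.bs4 import BeautifulSoup
#   soup = BeautifulSoup("<html><body><a href='/a.html'>a</a></body></html>")
#   hrefs = [anchor.get('href') for anchor in soup.find_all('a')]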
387 | if __name__ == '__main__': 388 | import sys 389 | soup = BeautifulSoup(sys.stdin) 390 | print(soup.prettify()) 391 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | 4 | if sys.version_info[0] < 3: 5 | from collections import defaultdict 6 | import itertools 7 | import sys 8 | from pylinkvalidator.included.bs4.element import ( 9 | CharsetMetaAttributeValue, 10 | ContentMetaAttributeValue, 11 | whitespace_re 12 | ) 13 | 14 | __all__ = [ 15 | 'HTMLTreeBuilder', 16 | 'SAXTreeBuilder', 17 | 'TreeBuilder', 18 | 'TreeBuilderRegistry', 19 | ] 20 | 21 | # Some useful features for a TreeBuilder to have. 22 | FAST = 'fast' 23 | PERMISSIVE = 'permissive' 24 | STRICT = 'strict' 25 | XML = 'xml' 26 | HTML = 'html' 27 | HTML_5 = 'html5' 28 | 29 | 30 | class TreeBuilderRegistry(object): 31 | 32 | def __init__(self): 33 | self.builders_for_feature = defaultdict(list) 34 | self.builders = [] 35 | 36 | def register(self, treebuilder_class): 37 | """Register a treebuilder based on its advertised features.""" 38 | for feature in treebuilder_class.features: 39 | self.builders_for_feature[feature].insert(0, treebuilder_class) 40 | self.builders.insert(0, treebuilder_class) 41 | 42 | def lookup(self, *features): 43 | if len(self.builders) == 0: 44 | # There are no builders at all. 45 | return None 46 | 47 | if len(features) == 0: 48 | # They didn't ask for any features. Give them the most 49 | # recently registered builder. 50 | return self.builders[0] 51 | 52 | # Go down the list of features in order, and eliminate any builders 53 | # that don't match every feature. 54 | features = list(features) 55 | features.reverse() 56 | candidates = None 57 | candidate_set = None 58 | while len(features) > 0: 59 | feature = features.pop() 60 | we_have_the_feature = self.builders_for_feature.get(feature, []) 61 | if len(we_have_the_feature) > 0: 62 | if candidates is None: 63 | candidates = we_have_the_feature 64 | candidate_set = set(candidates) 65 | else: 66 | # Eliminate any candidates that don't have this feature. 67 | candidate_set = candidate_set.intersection( 68 | set(we_have_the_feature)) 69 | 70 | # The only valid candidates are the ones in candidate_set. 71 | # Go through the original list of candidates and pick the first one 72 | # that's in candidate_set. 73 | if candidate_set is None: 74 | return None 75 | for candidate in candidates: 76 | if candidate in candidate_set: 77 | return candidate 78 | return None 79 | 80 | # The BeautifulSoup class will take feature lists from developers and use them 81 | # to look up builders in this registry. 82 | builder_registry = TreeBuilderRegistry() 83 | 84 | class TreeBuilder(object): 85 | """Turn a document into a Beautiful Soup object tree.""" 86 | 87 | features = [] 88 | 89 | is_xml = False 90 | preserve_whitespace_tags = set() 91 | empty_element_tags = None # A tag will be considered an empty-element 92 | # tag when and only when it has no contents. 93 | 94 | # A value for these tag/attribute combinations is a space- or 95 | # comma-separated list of CDATA, rather than a single CDATA. 96 | cdata_list_attributes = {} 97 | 98 | 99 | def __init__(self): 100 | self.soup = None 101 | 102 | def reset(self): 103 | pass 104 | 105 | def can_be_empty_element(self, tag_name): 106 | """Might a tag with this name be an empty-element tag? 
107 | 108 | The final markup may or may not actually present this tag as 109 | self-closing. 110 | 111 | For instance: an HTMLBuilder does not consider a

<p> tag to be 112 | an empty-element tag (it's not in 113 | HTMLBuilder.empty_element_tags). This means an empty <p> tag 114 | will be presented as "<p></p>", not "<p />
". 115 | 116 | The default implementation has no opinion about which tags are 117 | empty-element tags, so a tag will be presented as an 118 | empty-element tag if and only if it has no contents. 119 | "" will become "", and "bar" will 120 | be left alone. 121 | """ 122 | if self.empty_element_tags is None: 123 | return True 124 | return tag_name in self.empty_element_tags 125 | 126 | def feed(self, markup): 127 | raise NotImplementedError() 128 | 129 | def prepare_markup(self, markup, user_specified_encoding=None, 130 | document_declared_encoding=None): 131 | return markup, None, None, False 132 | 133 | def test_fragment_to_document(self, fragment): 134 | """Wrap an HTML fragment to make it look like a document. 135 | 136 | Different parsers do this differently. For instance, lxml 137 | introduces an empty tag, and html5lib 138 | doesn't. Abstracting this away lets us write simple tests 139 | which run HTML fragments through the parser and compare the 140 | results against other HTML fragments. 141 | 142 | This method should not be used outside of tests. 143 | """ 144 | return fragment 145 | 146 | def set_up_substitutions(self, tag): 147 | return False 148 | 149 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 150 | """Replaces class="foo bar" with class=["foo", "bar"] 151 | 152 | Modifies its input in place. 153 | """ 154 | if self.cdata_list_attributes: 155 | universal = self.cdata_list_attributes.get('*', []) 156 | tag_specific = self.cdata_list_attributes.get( 157 | tag_name.lower(), []) 158 | for cdata_list_attr in itertools.chain(universal, tag_specific): 159 | if cdata_list_attr in attrs: 160 | # Basically, we have a "class" attribute whose 161 | # value is a whitespace-separated list of CSS 162 | # classes. Split it into a list. 163 | value = attrs[cdata_list_attr] 164 | if isinstance(value, basestring): 165 | values = whitespace_re.split(value) 166 | else: 167 | # html5lib sometimes calls setAttributes twice 168 | # for the same tag when rearranging the parse 169 | # tree. On the second call the attribute value 170 | # here is already a list. If this happens, 171 | # leave the value alone rather than trying to 172 | # split it again. 173 | values = value 174 | attrs[cdata_list_attr] = values 175 | return attrs 176 | 177 | class SAXTreeBuilder(TreeBuilder): 178 | """A Beautiful Soup treebuilder that listens for SAX events.""" 179 | 180 | def feed(self, markup): 181 | raise NotImplementedError() 182 | 183 | def close(self): 184 | pass 185 | 186 | def startElement(self, name, attrs): 187 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 188 | #print "Start %s, %r" % (name, attrs) 189 | self.soup.handle_starttag(name, attrs) 190 | 191 | def endElement(self, name): 192 | #print "End %s" % name 193 | self.soup.handle_endtag(name) 194 | 195 | def startElementNS(self, nsTuple, nodeName, attrs): 196 | # Throw away (ns, nodeName) for now. 197 | self.startElement(nodeName, attrs) 198 | 199 | def endElementNS(self, nsTuple, nodeName): 200 | # Throw away (ns, nodeName) for now. 201 | self.endElement(nodeName) 202 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 203 | 204 | def startPrefixMapping(self, prefix, nodeValue): 205 | # Ignore the prefix for now. 206 | pass 207 | 208 | def endPrefixMapping(self, prefix): 209 | # Ignore the prefix for now. 
210 | # handler.endPrefixMapping(prefix) 211 | pass 212 | 213 | def characters(self, content): 214 | self.soup.handle_data(content) 215 | 216 | def startDocument(self): 217 | pass 218 | 219 | def endDocument(self): 220 | pass 221 | 222 | 223 | class HTMLTreeBuilder(TreeBuilder): 224 | """This TreeBuilder knows facts about HTML. 225 | 226 | Such as which tags are empty-element tags. 227 | """ 228 | 229 | preserve_whitespace_tags = set(['pre', 'textarea']) 230 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 231 | 'spacer', 'link', 'frame', 'base']) 232 | 233 | # The HTML standard defines these attributes as containing a 234 | # space-separated list of values, not a single value. That is, 235 | # class="foo bar" means that the 'class' attribute has two values, 236 | # 'foo' and 'bar', not the single value 'foo bar'. When we 237 | # encounter one of these attributes, we will parse its value into 238 | # a list of values if possible. Upon output, the list will be 239 | # converted back into a string. 240 | cdata_list_attributes = { 241 | "*" : ['class', 'accesskey', 'dropzone'], 242 | "a" : ['rel', 'rev'], 243 | "link" : ['rel', 'rev'], 244 | "td" : ["headers"], 245 | "th" : ["headers"], 246 | "td" : ["headers"], 247 | "form" : ["accept-charset"], 248 | "object" : ["archive"], 249 | 250 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 251 | "area" : ["rel"], 252 | "icon" : ["sizes"], 253 | "iframe" : ["sandbox"], 254 | "output" : ["for"], 255 | } 256 | 257 | def set_up_substitutions(self, tag): 258 | # We are only interested in tags 259 | if tag.name != 'meta': 260 | return False 261 | 262 | http_equiv = tag.get('http-equiv') 263 | content = tag.get('content') 264 | charset = tag.get('charset') 265 | 266 | # We are interested in tags that say what encoding the 267 | # document was originally in. This means HTML 5-style 268 | # tags that provide the "charset" attribute. It also means 269 | # HTML 4-style tags that provide the "content" 270 | # attribute and have "http-equiv" set to "content-type". 271 | # 272 | # In both cases we will replace the value of the appropriate 273 | # attribute with a standin object that can take on any 274 | # encoding. 275 | meta_encoding = None 276 | if charset is not None: 277 | # HTML 5 style: 278 | # 279 | meta_encoding = charset 280 | tag['charset'] = CharsetMetaAttributeValue(charset) 281 | 282 | elif (content is not None and http_equiv is not None 283 | and http_equiv.lower() == 'content-type'): 284 | # HTML 4 style: 285 | # 286 | tag['content'] = ContentMetaAttributeValue(content) 287 | 288 | return (meta_encoding is not None) 289 | 290 | def register_treebuilders_from(module): 291 | """Copy TreeBuilders from the given module into this module.""" 292 | # I'm fairly sure this is not the best way to do this. 293 | this_module = sys.modules['pylinkvalidator.included.bs4.builder'] 294 | for name in module.__all__: 295 | obj = getattr(module, name) 296 | 297 | if issubclass(obj, TreeBuilder): 298 | setattr(this_module, name, obj) 299 | this_module.__all__.append(name) 300 | # Register the builder while we're at it. 301 | this_module.builder_registry.register(obj) 302 | 303 | # Builders are registered in reverse order of priority, so that custom 304 | # builder registrations will take precedence. In general, we want lxml 305 | # to take precedence over html5lib, because it's faster. And we only 306 | # want to use HTMLParser as a last result. 307 | from . 
import _htmlparser 308 | register_treebuilders_from(_htmlparser) 309 | try: 310 | from . import _html5lib 311 | register_treebuilders_from(_html5lib) 312 | except ImportError: 313 | # They don't have html5lib installed. 314 | pass 315 | try: 316 | from . import _lxml 317 | register_treebuilders_from(_lxml) 318 | except ImportError: 319 | # They don't have lxml installed. 320 | pass 321 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/builder/_html5lib.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | 4 | if sys.version_info[0] < 3: 5 | __all__ = [ 6 | 'HTML5TreeBuilder', 7 | ] 8 | 9 | import warnings 10 | from pylinkvalidator.included.bs4.builder import ( 11 | PERMISSIVE, 12 | HTML, 13 | HTML_5, 14 | HTMLTreeBuilder, 15 | ) 16 | from pylinkvalidator.included.bs4.element import NamespacedAttribute 17 | import html5lib 18 | from html5lib.constants import namespaces 19 | from pylinkvalidator.included.bs4.element import ( 20 | Comment, 21 | Doctype, 22 | NavigableString, 23 | Tag, 24 | ) 25 | 26 | class HTML5TreeBuilder(HTMLTreeBuilder): 27 | """Use html5lib to build a tree.""" 28 | 29 | features = ['html5lib', PERMISSIVE, HTML_5, HTML] 30 | 31 | def prepare_markup(self, markup, user_specified_encoding): 32 | # Store the user-specified encoding for use later on. 33 | self.user_specified_encoding = user_specified_encoding 34 | return markup, None, None, False 35 | 36 | # These methods are defined by Beautiful Soup. 37 | def feed(self, markup): 38 | if self.soup.parse_only is not None: 39 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 40 | parser = html5lib.HTMLParser(tree=self.create_treebuilder) 41 | doc = parser.parse(markup, encoding=self.user_specified_encoding) 42 | 43 | # Set the character encoding detected by the tokenizer. 44 | if isinstance(markup, unicode): 45 | # We need to special-case this because html5lib sets 46 | # charEncoding to UTF-8 if it gets Unicode input. 
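# (For Unicode input we therefore report no original encoding at all; for byte
# input, the encoding the html5lib tokenizer settled on is recorded below.)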
47 | doc.original_encoding = None 48 | else: 49 | doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 50 | 51 | def create_treebuilder(self, namespaceHTMLElements): 52 | self.underlying_builder = TreeBuilderForHtml5lib( 53 | self.soup, namespaceHTMLElements) 54 | return self.underlying_builder 55 | 56 | def test_fragment_to_document(self, fragment): 57 | """See `TreeBuilder`.""" 58 | return u'%s' % fragment 59 | 60 | 61 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): 62 | 63 | def __init__(self, soup, namespaceHTMLElements): 64 | self.soup = soup 65 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 66 | 67 | def documentClass(self): 68 | self.soup.reset() 69 | return Element(self.soup, self.soup, None) 70 | 71 | def insertDoctype(self, token): 72 | name = token["name"] 73 | publicId = token["publicId"] 74 | systemId = token["systemId"] 75 | 76 | doctype = Doctype.for_name_and_ids(name, publicId, systemId) 77 | self.soup.object_was_parsed(doctype) 78 | 79 | def elementClass(self, name, namespace): 80 | tag = self.soup.new_tag(name, namespace) 81 | return Element(tag, self.soup, namespace) 82 | 83 | def commentClass(self, data): 84 | return TextNode(Comment(data), self.soup) 85 | 86 | def fragmentClass(self): 87 | self.soup = BeautifulSoup("") 88 | self.soup.name = "[document_fragment]" 89 | return Element(self.soup, self.soup, None) 90 | 91 | def appendChild(self, node): 92 | # XXX This code is not covered by the BS4 tests. 93 | self.soup.append(node.element) 94 | 95 | def getDocument(self): 96 | return self.soup 97 | 98 | def getFragment(self): 99 | return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element 100 | 101 | class AttrList(object): 102 | def __init__(self, element): 103 | self.element = element 104 | self.attrs = dict(self.element.attrs) 105 | def __iter__(self): 106 | return list(self.attrs.items()).__iter__() 107 | def __setitem__(self, name, value): 108 | "set attr", name, value 109 | self.element[name] = value 110 | def items(self): 111 | return list(self.attrs.items()) 112 | def keys(self): 113 | return list(self.attrs.keys()) 114 | def __len__(self): 115 | return len(self.attrs) 116 | def __getitem__(self, name): 117 | return self.attrs[name] 118 | def __contains__(self, name): 119 | return name in list(self.attrs.keys()) 120 | 121 | 122 | class Element(html5lib.treebuilders._base.Node): 123 | def __init__(self, element, soup, namespace): 124 | html5lib.treebuilders._base.Node.__init__(self, element.name) 125 | self.element = element 126 | self.soup = soup 127 | self.namespace = namespace 128 | 129 | def appendChild(self, node): 130 | if (node.element.__class__ == NavigableString and self.element.contents 131 | and self.element.contents[-1].__class__ == NavigableString): 132 | # Concatenate new text onto old text node 133 | # XXX This has O(n^2) performance, for input like 134 | # "aaa..." 
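# Merge the incoming string into the trailing NavigableString with
# replace_with(), and remember the merged node as the most recent element so
# later insertions are linked after it.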
135 | old_element = self.element.contents[-1] 136 | new_element = self.soup.new_string(old_element + node.element) 137 | old_element.replace_with(new_element) 138 | self.soup._most_recent_element = new_element 139 | else: 140 | self.soup.object_was_parsed(node.element, parent=self.element) 141 | 142 | def getAttributes(self): 143 | return AttrList(self.element) 144 | 145 | def setAttributes(self, attributes): 146 | if attributes is not None and len(attributes) > 0: 147 | 148 | converted_attributes = [] 149 | for name, value in list(attributes.items()): 150 | if isinstance(name, tuple): 151 | new_name = NamespacedAttribute(*name) 152 | del attributes[name] 153 | attributes[new_name] = value 154 | 155 | self.soup.builder._replace_cdata_list_attribute_values( 156 | self.name, attributes) 157 | for name, value in attributes.items(): 158 | self.element[name] = value 159 | 160 | # The attributes may contain variables that need substitution. 161 | # Call set_up_substitutions manually. 162 | # 163 | # The Tag constructor called this method when the Tag was created, 164 | # but we just set/changed the attributes, so call it again. 165 | self.soup.builder.set_up_substitutions(self.element) 166 | attributes = property(getAttributes, setAttributes) 167 | 168 | def insertText(self, data, insertBefore=None): 169 | text = TextNode(self.soup.new_string(data), self.soup) 170 | if insertBefore: 171 | self.insertBefore(text, insertBefore) 172 | else: 173 | self.appendChild(text) 174 | 175 | def insertBefore(self, node, refNode): 176 | index = self.element.index(refNode.element) 177 | if (node.element.__class__ == NavigableString and self.element.contents 178 | and self.element.contents[index-1].__class__ == NavigableString): 179 | # (See comments in appendChild) 180 | old_node = self.element.contents[index-1] 181 | new_str = self.soup.new_string(old_node + node.element) 182 | old_node.replace_with(new_str) 183 | else: 184 | self.element.insert(index, node.element) 185 | node.parent = self 186 | 187 | def removeChild(self, node): 188 | node.element.extract() 189 | 190 | def reparentChildren(self, newParent): 191 | while self.element.contents: 192 | child = self.element.contents[0] 193 | child.extract() 194 | if isinstance(child, Tag): 195 | newParent.appendChild( 196 | Element(child, self.soup, namespaces["html"])) 197 | else: 198 | newParent.appendChild( 199 | TextNode(child, self.soup)) 200 | 201 | def cloneNode(self): 202 | tag = self.soup.new_tag(self.element.name, self.namespace) 203 | node = Element(tag, self.soup, self.namespace) 204 | for key,value in self.attributes: 205 | node.attributes[key] = value 206 | return node 207 | 208 | def hasContent(self): 209 | return self.element.contents 210 | 211 | def getNameTuple(self): 212 | if self.namespace == None: 213 | return namespaces["html"], self.name 214 | else: 215 | return self.namespace, self.name 216 | 217 | nameTuple = property(getNameTuple) 218 | 219 | class TextNode(Element): 220 | def __init__(self, element, soup): 221 | html5lib.treebuilders._base.Node.__init__(self, None) 222 | self.element = element 223 | self.soup = soup 224 | 225 | def cloneNode(self): 226 | raise NotImplementedError 227 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/builder/_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | import sys 3 | 4 | if sys.version_info[0] < 3: 5 | 6 | 
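# (As in the other builder modules, the body below only runs on Python 2; on
# Python 3 the wrapper package falls back to the system-installed bs4, so this
# embedded copy is effectively skipped.)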
__all__ = [ 7 | 'HTMLParserTreeBuilder', 8 | ] 9 | 10 | from HTMLParser import ( 11 | HTMLParser, 12 | HTMLParseError, 13 | ) 14 | import sys 15 | import warnings 16 | 17 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 18 | # argument, which we'd like to set to False. Unfortunately, 19 | # http://bugs.python.org/issue13273 makes strict=True a better bet 20 | # before Python 3.2.3. 21 | # 22 | # At the end of this file, we monkeypatch HTMLParser so that 23 | # strict=True works well on Python 3.2.2. 24 | major, minor, release = sys.version_info[:3] 25 | CONSTRUCTOR_TAKES_STRICT = ( 26 | major > 3 27 | or (major == 3 and minor > 2) 28 | or (major == 3 and minor == 2 and release >= 3)) 29 | 30 | from pylinkvalidator.included.bs4.element import ( 31 | CData, 32 | Comment, 33 | Declaration, 34 | Doctype, 35 | ProcessingInstruction, 36 | ) 37 | from pylinkvalidator.included.bs4.dammit import EntitySubstitution, UnicodeDammit 38 | 39 | from pylinkvalidator.included.bs4.builder import ( 40 | HTML, 41 | HTMLTreeBuilder, 42 | STRICT, 43 | ) 44 | 45 | 46 | HTMLPARSER = 'html.parser' 47 | 48 | class BeautifulSoupHTMLParser(HTMLParser): 49 | def handle_starttag(self, name, attrs): 50 | # XXX namespace 51 | self.soup.handle_starttag(name, None, None, dict(attrs)) 52 | 53 | def handle_endtag(self, name): 54 | self.soup.handle_endtag(name) 55 | 56 | def handle_data(self, data): 57 | self.soup.handle_data(data) 58 | 59 | def handle_charref(self, name): 60 | # XXX workaround for a bug in HTMLParser. Remove this once 61 | # it's fixed. 62 | if name.startswith('x'): 63 | real_name = int(name.lstrip('x'), 16) 64 | elif name.startswith('X'): 65 | real_name = int(name.lstrip('X'), 16) 66 | else: 67 | real_name = int(name) 68 | 69 | try: 70 | data = unichr(real_name) 71 | except (ValueError, OverflowError), e: 72 | data = u"\N{REPLACEMENT CHARACTER}" 73 | 74 | self.handle_data(data) 75 | 76 | def handle_entityref(self, name): 77 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 78 | if character is not None: 79 | data = character 80 | else: 81 | data = "&%s;" % name 82 | self.handle_data(data) 83 | 84 | def handle_comment(self, data): 85 | self.soup.endData() 86 | self.soup.handle_data(data) 87 | self.soup.endData(Comment) 88 | 89 | def handle_decl(self, data): 90 | self.soup.endData() 91 | if data.startswith("DOCTYPE "): 92 | data = data[len("DOCTYPE "):] 93 | elif data == 'DOCTYPE': 94 | # i.e. "" 95 | data = '' 96 | self.soup.handle_data(data) 97 | self.soup.endData(Doctype) 98 | 99 | def unknown_decl(self, data): 100 | if data.upper().startswith('CDATA['): 101 | cls = CData 102 | data = data[len('CDATA['):] 103 | else: 104 | cls = Declaration 105 | self.soup.endData() 106 | self.soup.handle_data(data) 107 | self.soup.endData(cls) 108 | 109 | def handle_pi(self, data): 110 | self.soup.endData() 111 | if data.endswith("?") and data.lower().startswith("xml"): 112 | # "An XHTML processing instruction using the trailing '?' 113 | # will cause the '?' to be included in data." - HTMLParser 114 | # docs. 115 | # 116 | # Strip the question mark so we don't end up with two 117 | # question marks. 
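# For example, "<?xml version='1.0'?>" reaches handle_pi() as
# "xml version='1.0'?", so dropping the trailing "?" leaves clean
# ProcessingInstruction text.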
118 | data = data[:-1] 119 | self.soup.handle_data(data) 120 | self.soup.endData(ProcessingInstruction) 121 | 122 | 123 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 124 | 125 | is_xml = False 126 | features = [HTML, STRICT, HTMLPARSER] 127 | 128 | def __init__(self, *args, **kwargs): 129 | if CONSTRUCTOR_TAKES_STRICT: 130 | kwargs['strict'] = False 131 | self.parser_args = (args, kwargs) 132 | 133 | def prepare_markup(self, markup, user_specified_encoding=None, 134 | document_declared_encoding=None): 135 | """ 136 | :return: A 4-tuple (markup, original encoding, encoding 137 | declared within markup, whether any characters had to be 138 | replaced with REPLACEMENT CHARACTER). 139 | """ 140 | if isinstance(markup, unicode): 141 | return markup, None, None, False 142 | 143 | try_encodings = [user_specified_encoding, document_declared_encoding] 144 | dammit = UnicodeDammit(markup, try_encodings, is_html=True) 145 | return (dammit.markup, dammit.original_encoding, 146 | dammit.declared_html_encoding, 147 | dammit.contains_replacement_characters) 148 | 149 | def feed(self, markup): 150 | args, kwargs = self.parser_args 151 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 152 | parser.soup = self.soup 153 | try: 154 | parser.feed(markup) 155 | except HTMLParseError, e: 156 | warnings.warn(RuntimeWarning( 157 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 158 | raise e 159 | 160 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 161 | # 3.2.3 code. This ensures they don't treat markup like
<p></p>
as a 162 | # string. 163 | # 164 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 165 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 166 | import re 167 | attrfind_tolerant = re.compile( 168 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 169 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 170 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 171 | 172 | locatestarttagend = re.compile(r""" 173 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 174 | (?:\s+ # whitespace before attribute name 175 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 176 | (?:\s*=\s* # value indicator 177 | (?:'[^']*' # LITA-enclosed value 178 | |\"[^\"]*\" # LIT-enclosed value 179 | |[^'\">\s]+ # bare value 180 | ) 181 | )? 182 | ) 183 | )* 184 | \s* # trailing whitespace 185 | """, re.VERBOSE) 186 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 187 | 188 | from html.parser import tagfind, attrfind 189 | 190 | def parse_starttag(self, i): 191 | self.__starttag_text = None 192 | endpos = self.check_for_whole_start_tag(i) 193 | if endpos < 0: 194 | return endpos 195 | rawdata = self.rawdata 196 | self.__starttag_text = rawdata[i:endpos] 197 | 198 | # Now parse the data between i+1 and j into a tag and attrs 199 | attrs = [] 200 | match = tagfind.match(rawdata, i+1) 201 | assert match, 'unexpected call to parse_starttag()' 202 | k = match.end() 203 | self.lasttag = tag = rawdata[i+1:k].lower() 204 | while k < endpos: 205 | if self.strict: 206 | m = attrfind.match(rawdata, k) 207 | else: 208 | m = attrfind_tolerant.match(rawdata, k) 209 | if not m: 210 | break 211 | attrname, rest, attrvalue = m.group(1, 2, 3) 212 | if not rest: 213 | attrvalue = None 214 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 215 | attrvalue[:1] == '"' == attrvalue[-1:]: 216 | attrvalue = attrvalue[1:-1] 217 | if attrvalue: 218 | attrvalue = self.unescape(attrvalue) 219 | attrs.append((attrname.lower(), attrvalue)) 220 | k = m.end() 221 | 222 | end = rawdata[k:endpos].strip() 223 | if end not in (">", "/>"): 224 | lineno, offset = self.getpos() 225 | if "\n" in self.__starttag_text: 226 | lineno = lineno + self.__starttag_text.count("\n") 227 | offset = len(self.__starttag_text) \ 228 | - self.__starttag_text.rfind("\n") 229 | else: 230 | offset = offset + len(self.__starttag_text) 231 | if self.strict: 232 | self.error("junk characters in start tag: %r" 233 | % (rawdata[k:endpos][:20],)) 234 | self.handle_data(rawdata[i:endpos]) 235 | return endpos 236 | if end.endswith('/>'): 237 | # XHTML-style empty tag: 238 | self.handle_startendtag(tag, attrs) 239 | else: 240 | self.handle_starttag(tag, attrs) 241 | if tag in self.CDATA_CONTENT_ELEMENTS: 242 | self.set_cdata_mode(tag) 243 | return endpos 244 | 245 | def set_cdata_mode(self, elem): 246 | self.cdata_elem = elem.lower() 247 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 248 | 249 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 250 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 251 | 252 | CONSTRUCTOR_TAKES_STRICT = True 253 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/builder/_lxml.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | 4 | if sys.version_info[0] < 3: 5 | __all__ = [ 6 | 'LXMLTreeBuilderForXML', 7 | 'LXMLTreeBuilder', 8 | ] 9 | 10 | from io import BytesIO 11 | from StringIO import StringIO 12 | import collections 13 | from lxml import etree 14 | from 
pylinkvalidator.included.bs4.element import Comment, Doctype, NamespacedAttribute 15 | from pylinkvalidator.included.bs4.builder import ( 16 | FAST, 17 | HTML, 18 | HTMLTreeBuilder, 19 | PERMISSIVE, 20 | TreeBuilder, 21 | XML) 22 | from pylinkvalidator.included.bs4.dammit import UnicodeDammit 23 | 24 | LXML = 'lxml' 25 | 26 | class LXMLTreeBuilderForXML(TreeBuilder): 27 | DEFAULT_PARSER_CLASS = etree.XMLParser 28 | 29 | is_xml = True 30 | 31 | # Well, it's permissive by XML parser standards. 32 | features = [LXML, XML, FAST, PERMISSIVE] 33 | 34 | CHUNK_SIZE = 512 35 | 36 | # This namespace mapping is specified in the XML Namespace 37 | # standard. 38 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 39 | 40 | @property 41 | def default_parser(self): 42 | # This can either return a parser object or a class, which 43 | # will be instantiated with default arguments. 44 | return etree.XMLParser(target=self, strip_cdata=False, recover=True) 45 | 46 | def __init__(self, parser=None, empty_element_tags=None): 47 | if empty_element_tags is not None: 48 | self.empty_element_tags = set(empty_element_tags) 49 | if parser is None: 50 | # Use the default parser. 51 | parser = self.default_parser 52 | if isinstance(parser, collections.Callable): 53 | # Instantiate the parser with default arguments 54 | parser = parser(target=self, strip_cdata=False) 55 | self.parser = parser 56 | self.soup = None 57 | self.nsmaps = [self.DEFAULT_NSMAPS] 58 | 59 | def _getNsTag(self, tag): 60 | # Split the namespace URL out of a fully-qualified lxml tag 61 | # name. Copied from lxml's src/lxml/sax.py. 62 | if tag[0] == '{': 63 | return tuple(tag[1:].split('}', 1)) 64 | else: 65 | return (None, tag) 66 | 67 | def prepare_markup(self, markup, user_specified_encoding=None, 68 | document_declared_encoding=None): 69 | """ 70 | :return: A 3-tuple (markup, original encoding, encoding 71 | declared within markup). 72 | """ 73 | if isinstance(markup, unicode): 74 | return markup, None, None, False 75 | 76 | try_encodings = [user_specified_encoding, document_declared_encoding] 77 | dammit = UnicodeDammit(markup, try_encodings, is_html=True) 78 | return (dammit.markup, dammit.original_encoding, 79 | dammit.declared_html_encoding, 80 | dammit.contains_replacement_characters) 81 | 82 | def feed(self, markup): 83 | if isinstance(markup, bytes): 84 | markup = BytesIO(markup) 85 | elif isinstance(markup, unicode): 86 | markup = StringIO(markup) 87 | # Call feed() at least once, even if the markup is empty, 88 | # or the parser won't be initialized. 89 | data = markup.read(self.CHUNK_SIZE) 90 | self.parser.feed(data) 91 | while data != '': 92 | # Now call feed() on the rest of the data, chunk by chunk. 93 | data = markup.read(self.CHUNK_SIZE) 94 | if data != '': 95 | self.parser.feed(data) 96 | self.parser.close() 97 | 98 | def close(self): 99 | self.nsmaps = [self.DEFAULT_NSMAPS] 100 | 101 | def start(self, name, attrs, nsmap={}): 102 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 103 | attrs = dict(attrs) 104 | nsprefix = None 105 | # Invert each namespace map as it comes in. 106 | if len(self.nsmaps) > 1: 107 | # There are no new namespaces for this tag, but 108 | # non-default namespaces are in play, so we need a 109 | # separate tag stack to know when they end. 110 | self.nsmaps.append(None) 111 | elif len(nsmap) > 0: 112 | # A new namespace mapping has come into play. 
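# For example, an incoming nsmap of {'og': 'http://ogp.me/ns#'} is inverted to
# {'http://ogp.me/ns#': 'og'} so that a prefix can later be looked up by
# namespace URI in _prefix_for_namespace().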
113 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 114 | self.nsmaps.append(inverted_nsmap) 115 | # Also treat the namespace mapping as a set of attributes on the 116 | # tag, so we can recreate it later. 117 | attrs = attrs.copy() 118 | for prefix, namespace in nsmap.items(): 119 | attribute = NamespacedAttribute( 120 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 121 | attrs[attribute] = namespace 122 | 123 | # Namespaces are in play. Find any attributes that came in 124 | # from lxml with namespaces attached to their names, and 125 | # turn then into NamespacedAttribute objects. 126 | new_attrs = {} 127 | for attr, value in attrs.items(): 128 | namespace, attr = self._getNsTag(attr) 129 | if namespace is None: 130 | new_attrs[attr] = value 131 | else: 132 | nsprefix = self._prefix_for_namespace(namespace) 133 | attr = NamespacedAttribute(nsprefix, attr, namespace) 134 | new_attrs[attr] = value 135 | attrs = new_attrs 136 | 137 | namespace, name = self._getNsTag(name) 138 | nsprefix = self._prefix_for_namespace(namespace) 139 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 140 | 141 | def _prefix_for_namespace(self, namespace): 142 | """Find the currently active prefix for the given namespace.""" 143 | if namespace is None: 144 | return None 145 | for inverted_nsmap in reversed(self.nsmaps): 146 | if inverted_nsmap is not None and namespace in inverted_nsmap: 147 | return inverted_nsmap[namespace] 148 | return None 149 | 150 | def end(self, name): 151 | self.soup.endData() 152 | completed_tag = self.soup.tagStack[-1] 153 | namespace, name = self._getNsTag(name) 154 | nsprefix = None 155 | if namespace is not None: 156 | for inverted_nsmap in reversed(self.nsmaps): 157 | if inverted_nsmap is not None and namespace in inverted_nsmap: 158 | nsprefix = inverted_nsmap[namespace] 159 | break 160 | self.soup.handle_endtag(name, nsprefix) 161 | if len(self.nsmaps) > 1: 162 | # This tag, or one of its parents, introduced a namespace 163 | # mapping, so pop it off the stack. 164 | self.nsmaps.pop() 165 | 166 | def pi(self, target, data): 167 | pass 168 | 169 | def data(self, content): 170 | self.soup.handle_data(content) 171 | 172 | def doctype(self, name, pubid, system): 173 | self.soup.endData() 174 | doctype = Doctype.for_name_and_ids(name, pubid, system) 175 | self.soup.object_was_parsed(doctype) 176 | 177 | def comment(self, content): 178 | "Handle comments as Comment objects." 179 | self.soup.endData() 180 | self.soup.handle_data(content) 181 | self.soup.endData(Comment) 182 | 183 | def test_fragment_to_document(self, fragment): 184 | """See `TreeBuilder`.""" 185 | return u'\n%s' % fragment 186 | 187 | 188 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 189 | 190 | features = [LXML, HTML, FAST, PERMISSIVE] 191 | is_xml = False 192 | 193 | @property 194 | def default_parser(self): 195 | return etree.HTMLParser 196 | 197 | def feed(self, markup): 198 | self.parser.feed(markup) 199 | self.parser.close() 200 | 201 | def test_fragment_to_document(self, fragment): 202 | """See `TreeBuilder`.""" 203 | return u'%s' % fragment 204 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/dammit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Beautiful Soup bonus library: Unicode, Dammit 3 | 4 | This class forces XML data into a standard format (usually to UTF-8 or 5 | Unicode). 
It is heavily based on code from Mark Pilgrim's Universal 6 | Feed Parser. It does not rewrite the XML or HTML to reflect a new 7 | encoding; that's the tree builder's job. 8 | """ 9 | 10 | import sys 11 | 12 | if sys.version_info[0] < 3: 13 | import codecs 14 | from htmlentitydefs import codepoint2name 15 | import re 16 | import logging 17 | 18 | # Import a library to autodetect character encodings. 19 | chardet_type = None 20 | try: 21 | # First try the fast C implementation. 22 | # PyPI package: cchardet 23 | import cchardet 24 | def chardet_dammit(s): 25 | return cchardet.detect(s)['encoding'] 26 | except ImportError: 27 | try: 28 | # Fall back to the pure Python implementation 29 | # Debian package: python-chardet 30 | # PyPI package: chardet 31 | import chardet 32 | def chardet_dammit(s): 33 | return chardet.detect(s)['encoding'] 34 | #import chardet.constants 35 | #chardet.constants._debug = 1 36 | except ImportError: 37 | # No chardet available. 38 | def chardet_dammit(s): 39 | return None 40 | 41 | # Available from http://cjkpython.i18n.org/. 42 | try: 43 | import iconv_codec 44 | except ImportError: 45 | pass 46 | 47 | xml_encoding_re = re.compile( 48 | '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) 49 | html_meta_re = re.compile( 50 | '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) 51 | 52 | class EntitySubstitution(object): 53 | 54 | """Substitute XML or HTML entities for the corresponding characters.""" 55 | 56 | def _populate_class_variables(): 57 | lookup = {} 58 | reverse_lookup = {} 59 | characters_for_re = [] 60 | for codepoint, name in list(codepoint2name.items()): 61 | character = unichr(codepoint) 62 | if codepoint != 34: 63 | # There's no point in turning the quotation mark into 64 | # ", unless it happens within an attribute value, which 65 | # is handled elsewhere. 66 | characters_for_re.append(character) 67 | lookup[character] = name 68 | # But we do want to turn " into the quotation mark. 69 | reverse_lookup[name] = character 70 | re_definition = "[%s]" % "".join(characters_for_re) 71 | return lookup, reverse_lookup, re.compile(re_definition) 72 | (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, 73 | CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() 74 | 75 | CHARACTER_TO_XML_ENTITY = { 76 | "'": "apos", 77 | '"': "quot", 78 | "&": "amp", 79 | "<": "lt", 80 | ">": "gt", 81 | } 82 | 83 | BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" 84 | "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 85 | ")") 86 | 87 | AMPERSAND_OR_BRACKET = re.compile("([<>&])") 88 | 89 | @classmethod 90 | def _substitute_html_entity(cls, matchobj): 91 | entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) 92 | return "&%s;" % entity 93 | 94 | @classmethod 95 | def _substitute_xml_entity(cls, matchobj): 96 | """Used with a regular expression to substitute the 97 | appropriate XML entity for an XML special character.""" 98 | entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] 99 | return "&%s;" % entity 100 | 101 | @classmethod 102 | def quoted_attribute_value(self, value): 103 | """Make a value into a quoted XML attribute, possibly escaping it. 104 | 105 | Most strings will be quoted using double quotes. 106 | 107 | Bob's Bar -> "Bob's Bar" 108 | 109 | If a string contains double quotes, it will be quoted using 110 | single quotes. 111 | 112 | Welcome to "my bar" -> 'Welcome to "my bar"' 113 | 114 | If a string contains both single and double quotes, the 115 | double quotes will be escaped, and the string will be quoted 116 | using double quotes. 
117 | 118 | Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" 119 | """ 120 | quote_with = '"' 121 | if '"' in value: 122 | if "'" in value: 123 | # The string contains both single and double 124 | # quotes. Turn the double quotes into 125 | # entities. We quote the double quotes rather than 126 | # the single quotes because the entity name is 127 | # """ whether this is HTML or XML. If we 128 | # quoted the single quotes, we'd have to decide 129 | # between ' and &squot;. 130 | replace_with = """ 131 | value = value.replace('"', replace_with) 132 | else: 133 | # There are double quotes but no single quotes. 134 | # We can use single quotes to quote the attribute. 135 | quote_with = "'" 136 | return quote_with + value + quote_with 137 | 138 | @classmethod 139 | def substitute_xml(cls, value, make_quoted_attribute=False): 140 | """Substitute XML entities for special XML characters. 141 | 142 | :param value: A string to be substituted. The less-than sign 143 | will become <, the greater-than sign will become >, 144 | and any ampersands will become &. If you want ampersands 145 | that appear to be part of an entity definition to be left 146 | alone, use substitute_xml_containing_entities() instead. 147 | 148 | :param make_quoted_attribute: If True, then the string will be 149 | quoted, as befits an attribute value. 150 | """ 151 | # Escape angle brackets and ampersands. 152 | value = cls.AMPERSAND_OR_BRACKET.sub( 153 | cls._substitute_xml_entity, value) 154 | 155 | if make_quoted_attribute: 156 | value = cls.quoted_attribute_value(value) 157 | return value 158 | 159 | @classmethod 160 | def substitute_xml_containing_entities( 161 | cls, value, make_quoted_attribute=False): 162 | """Substitute XML entities for special XML characters. 163 | 164 | :param value: A string to be substituted. The less-than sign will 165 | become <, the greater-than sign will become >, and any 166 | ampersands that are not part of an entity defition will 167 | become &. 168 | 169 | :param make_quoted_attribute: If True, then the string will be 170 | quoted, as befits an attribute value. 171 | """ 172 | # Escape angle brackets, and ampersands that aren't part of 173 | # entities. 174 | value = cls.BARE_AMPERSAND_OR_BRACKET.sub( 175 | cls._substitute_xml_entity, value) 176 | 177 | if make_quoted_attribute: 178 | value = cls.quoted_attribute_value(value) 179 | return value 180 | 181 | 182 | @classmethod 183 | def substitute_html(cls, s): 184 | """Replace certain Unicode characters with named HTML entities. 185 | 186 | This differs from data.encode(encoding, 'xmlcharrefreplace') 187 | in that the goal is to make the result more readable (to those 188 | with ASCII displays) rather than to recover from 189 | errors. There's absolutely nothing wrong with a UTF-8 string 190 | containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that 191 | character with "é" will make it more readable to some 192 | people. 193 | """ 194 | return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( 195 | cls._substitute_html_entity, s) 196 | 197 | 198 | class UnicodeDammit: 199 | """A class for detecting the encoding of a *ML document and 200 | converting it to a Unicode string. If the source encoding is 201 | windows-1252, can replace MS smart quotes with their HTML or XML 202 | equivalents.""" 203 | 204 | # This dictionary maps commonly seen values for "charset" in HTML 205 | # meta tags to the corresponding Python codec names. It only covers 206 | # values that aren't in Python's aliases and can't be determined 207 | # by the heuristics in find_codec. 
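# For example, a page whose meta tag declares charset="macintosh" is decoded
# with Python's "mac-roman" codec.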
208 | CHARSET_ALIASES = {"macintosh": "mac-roman", 209 | "x-sjis": "shift-jis"} 210 | 211 | ENCODINGS_WITH_SMART_QUOTES = [ 212 | "windows-1252", 213 | "iso-8859-1", 214 | "iso-8859-2", 215 | ] 216 | 217 | def __init__(self, markup, override_encodings=[], 218 | smart_quotes_to=None, is_html=False): 219 | self.declared_html_encoding = None 220 | self.smart_quotes_to = smart_quotes_to 221 | self.tried_encodings = [] 222 | self.contains_replacement_characters = False 223 | 224 | if markup == '' or isinstance(markup, unicode): 225 | self.markup = markup 226 | self.unicode_markup = unicode(markup) 227 | self.original_encoding = None 228 | return 229 | 230 | new_markup, document_encoding, sniffed_encoding = \ 231 | self._detectEncoding(markup, is_html) 232 | self.markup = new_markup 233 | 234 | u = None 235 | if new_markup != markup: 236 | # _detectEncoding modified the markup, then converted it to 237 | # Unicode and then to UTF-8. So convert it from UTF-8. 238 | u = self._convert_from("utf8") 239 | self.original_encoding = sniffed_encoding 240 | 241 | if not u: 242 | for proposed_encoding in ( 243 | override_encodings + [document_encoding, sniffed_encoding]): 244 | if proposed_encoding is not None: 245 | u = self._convert_from(proposed_encoding) 246 | if u: 247 | break 248 | 249 | # If no luck and we have auto-detection library, try that: 250 | if not u and not isinstance(self.markup, unicode): 251 | u = self._convert_from(chardet_dammit(self.markup)) 252 | 253 | # As a last resort, try utf-8 and windows-1252: 254 | if not u: 255 | for proposed_encoding in ("utf-8", "windows-1252"): 256 | u = self._convert_from(proposed_encoding) 257 | if u: 258 | break 259 | 260 | # As an absolute last resort, try the encodings again with 261 | # character replacement. 262 | if not u: 263 | for proposed_encoding in ( 264 | override_encodings + [ 265 | document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): 266 | if proposed_encoding != "ascii": 267 | u = self._convert_from(proposed_encoding, "replace") 268 | if u is not None: 269 | logging.warning( 270 | "Some characters could not be decoded, and were " 271 | "replaced with REPLACEMENT CHARACTER.") 272 | self.contains_replacement_characters = True 273 | break 274 | 275 | # We could at this point force it to ASCII, but that would 276 | # destroy so much data that I think giving up is better 277 | self.unicode_markup = u 278 | if not u: 279 | self.original_encoding = None 280 | 281 | def _sub_ms_char(self, match): 282 | """Changes a MS smart quote character to an XML or HTML 283 | entity, or an ASCII character.""" 284 | orig = match.group(1) 285 | if self.smart_quotes_to == 'ascii': 286 | sub = self.MS_CHARS_TO_ASCII.get(orig).encode() 287 | else: 288 | sub = self.MS_CHARS.get(orig) 289 | if type(sub) == tuple: 290 | if self.smart_quotes_to == 'xml': 291 | sub = '&#x'.encode() + sub[1].encode() + ';'.encode() 292 | else: 293 | sub = '&'.encode() + sub[0].encode() + ';'.encode() 294 | else: 295 | sub = sub.encode() 296 | return sub 297 | 298 | def _convert_from(self, proposed, errors="strict"): 299 | proposed = self.find_codec(proposed) 300 | if not proposed or (proposed, errors) in self.tried_encodings: 301 | return None 302 | self.tried_encodings.append((proposed, errors)) 303 | markup = self.markup 304 | # Convert smart quotes to HTML if coming from an encoding 305 | # that might have them. 
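# (This only happens when the caller passed smart_quotes_to; it defaults to
# None, so bytes in the 0x80-0x9F range are normally handed to the codec
# untouched.)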
306 | if (self.smart_quotes_to is not None 307 | and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): 308 | smart_quotes_re = b"([\x80-\x9f])" 309 | smart_quotes_compiled = re.compile(smart_quotes_re) 310 | markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) 311 | 312 | try: 313 | #print "Trying to convert document to %s (errors=%s)" % ( 314 | # proposed, errors) 315 | u = self._to_unicode(markup, proposed, errors) 316 | self.markup = u 317 | self.original_encoding = proposed 318 | except Exception as e: 319 | #print "That didn't work!" 320 | #print e 321 | return None 322 | #print "Correct encoding: %s" % proposed 323 | return self.markup 324 | 325 | def _to_unicode(self, data, encoding, errors="strict"): 326 | '''Given a string and its encoding, decodes the string into Unicode. 327 | %encoding is a string recognized by encodings.aliases''' 328 | 329 | # strip Byte Order Mark (if present) 330 | if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ 331 | and (data[2:4] != '\x00\x00'): 332 | encoding = 'utf-16be' 333 | data = data[2:] 334 | elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ 335 | and (data[2:4] != '\x00\x00'): 336 | encoding = 'utf-16le' 337 | data = data[2:] 338 | elif data[:3] == '\xef\xbb\xbf': 339 | encoding = 'utf-8' 340 | data = data[3:] 341 | elif data[:4] == '\x00\x00\xfe\xff': 342 | encoding = 'utf-32be' 343 | data = data[4:] 344 | elif data[:4] == '\xff\xfe\x00\x00': 345 | encoding = 'utf-32le' 346 | data = data[4:] 347 | newdata = unicode(data, encoding, errors) 348 | return newdata 349 | 350 | def _detectEncoding(self, xml_data, is_html=False): 351 | """Given a document, tries to detect its XML encoding.""" 352 | xml_encoding = sniffed_xml_encoding = None 353 | try: 354 | if xml_data[:4] == b'\x4c\x6f\xa7\x94': 355 | # EBCDIC 356 | xml_data = self._ebcdic_to_ascii(xml_data) 357 | elif xml_data[:4] == b'\x00\x3c\x00\x3f': 358 | # UTF-16BE 359 | sniffed_xml_encoding = 'utf-16be' 360 | xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') 361 | elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ 362 | and (xml_data[2:4] != b'\x00\x00'): 363 | # UTF-16BE with BOM 364 | sniffed_xml_encoding = 'utf-16be' 365 | xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') 366 | elif xml_data[:4] == b'\x3c\x00\x3f\x00': 367 | # UTF-16LE 368 | sniffed_xml_encoding = 'utf-16le' 369 | xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') 370 | elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ 371 | (xml_data[2:4] != b'\x00\x00'): 372 | # UTF-16LE with BOM 373 | sniffed_xml_encoding = 'utf-16le' 374 | xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') 375 | elif xml_data[:4] == b'\x00\x00\x00\x3c': 376 | # UTF-32BE 377 | sniffed_xml_encoding = 'utf-32be' 378 | xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') 379 | elif xml_data[:4] == b'\x3c\x00\x00\x00': 380 | # UTF-32LE 381 | sniffed_xml_encoding = 'utf-32le' 382 | xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') 383 | elif xml_data[:4] == b'\x00\x00\xfe\xff': 384 | # UTF-32BE with BOM 385 | sniffed_xml_encoding = 'utf-32be' 386 | xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') 387 | elif xml_data[:4] == b'\xff\xfe\x00\x00': 388 | # UTF-32LE with BOM 389 | sniffed_xml_encoding = 'utf-32le' 390 | xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') 391 | elif xml_data[:3] == b'\xef\xbb\xbf': 392 | # UTF-8 with BOM 393 | sniffed_xml_encoding = 'utf-8' 394 | xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') 395 | else: 396 | 
sniffed_xml_encoding = 'ascii' 397 | pass 398 | except: 399 | xml_encoding_match = None 400 | xml_encoding_match = xml_encoding_re.match(xml_data) 401 | if not xml_encoding_match and is_html: 402 | xml_encoding_match = html_meta_re.search(xml_data) 403 | if xml_encoding_match is not None: 404 | xml_encoding = xml_encoding_match.groups()[0].decode( 405 | 'ascii').lower() 406 | if is_html: 407 | self.declared_html_encoding = xml_encoding 408 | if sniffed_xml_encoding and \ 409 | (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 410 | 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 411 | 'utf-16', 'utf-32', 'utf_16', 'utf_32', 412 | 'utf16', 'u16')): 413 | xml_encoding = sniffed_xml_encoding 414 | return xml_data, xml_encoding, sniffed_xml_encoding 415 | 416 | def find_codec(self, charset): 417 | return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ 418 | or (charset and self._codec(charset.replace("-", ""))) \ 419 | or (charset and self._codec(charset.replace("-", "_"))) \ 420 | or charset 421 | 422 | def _codec(self, charset): 423 | if not charset: 424 | return charset 425 | codec = None 426 | try: 427 | codecs.lookup(charset) 428 | codec = charset 429 | except (LookupError, ValueError): 430 | pass 431 | return codec 432 | 433 | EBCDIC_TO_ASCII_MAP = None 434 | 435 | def _ebcdic_to_ascii(self, s): 436 | c = self.__class__ 437 | if not c.EBCDIC_TO_ASCII_MAP: 438 | emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, 439 | 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, 440 | 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, 441 | 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, 442 | 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, 443 | 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, 444 | 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, 445 | 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, 446 | 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, 447 | 201,202,106,107,108,109,110,111,112,113,114,203,204,205, 448 | 206,207,208,209,126,115,116,117,118,119,120,121,122,210, 449 | 211,212,213,214,215,216,217,218,219,220,221,222,223,224, 450 | 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, 451 | 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, 452 | 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, 453 | 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, 454 | 250,251,252,253,254,255) 455 | import string 456 | c.EBCDIC_TO_ASCII_MAP = string.maketrans( 457 | ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) 458 | return s.translate(c.EBCDIC_TO_ASCII_MAP) 459 | 460 | # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. 
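# For example, the byte b'\x93' maps to ('ldquo', '201C'): with
# smart_quotes_to='html' it is rewritten as "&ldquo;", and with
# smart_quotes_to='xml' as "&#x201C;".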
461 | MS_CHARS = {b'\x80': ('euro', '20AC'), 462 | b'\x81': ' ', 463 | b'\x82': ('sbquo', '201A'), 464 | b'\x83': ('fnof', '192'), 465 | b'\x84': ('bdquo', '201E'), 466 | b'\x85': ('hellip', '2026'), 467 | b'\x86': ('dagger', '2020'), 468 | b'\x87': ('Dagger', '2021'), 469 | b'\x88': ('circ', '2C6'), 470 | b'\x89': ('permil', '2030'), 471 | b'\x8A': ('Scaron', '160'), 472 | b'\x8B': ('lsaquo', '2039'), 473 | b'\x8C': ('OElig', '152'), 474 | b'\x8D': '?', 475 | b'\x8E': ('#x17D', '17D'), 476 | b'\x8F': '?', 477 | b'\x90': '?', 478 | b'\x91': ('lsquo', '2018'), 479 | b'\x92': ('rsquo', '2019'), 480 | b'\x93': ('ldquo', '201C'), 481 | b'\x94': ('rdquo', '201D'), 482 | b'\x95': ('bull', '2022'), 483 | b'\x96': ('ndash', '2013'), 484 | b'\x97': ('mdash', '2014'), 485 | b'\x98': ('tilde', '2DC'), 486 | b'\x99': ('trade', '2122'), 487 | b'\x9a': ('scaron', '161'), 488 | b'\x9b': ('rsaquo', '203A'), 489 | b'\x9c': ('oelig', '153'), 490 | b'\x9d': '?', 491 | b'\x9e': ('#x17E', '17E'), 492 | b'\x9f': ('Yuml', ''),} 493 | 494 | # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains 495 | # horrors like stripping diacritical marks to turn á into a, but also 496 | # contains non-horrors like turning “ into ". 497 | MS_CHARS_TO_ASCII = { 498 | b'\x80' : 'EUR', 499 | b'\x81' : ' ', 500 | b'\x82' : ',', 501 | b'\x83' : 'f', 502 | b'\x84' : ',,', 503 | b'\x85' : '...', 504 | b'\x86' : '+', 505 | b'\x87' : '++', 506 | b'\x88' : '^', 507 | b'\x89' : '%', 508 | b'\x8a' : 'S', 509 | b'\x8b' : '<', 510 | b'\x8c' : 'OE', 511 | b'\x8d' : '?', 512 | b'\x8e' : 'Z', 513 | b'\x8f' : '?', 514 | b'\x90' : '?', 515 | b'\x91' : "'", 516 | b'\x92' : "'", 517 | b'\x93' : '"', 518 | b'\x94' : '"', 519 | b'\x95' : '*', 520 | b'\x96' : '-', 521 | b'\x97' : '--', 522 | b'\x98' : '~', 523 | b'\x99' : '(TM)', 524 | b'\x9a' : 's', 525 | b'\x9b' : '>', 526 | b'\x9c' : 'oe', 527 | b'\x9d' : '?', 528 | b'\x9e' : 'z', 529 | b'\x9f' : 'Y', 530 | b'\xa0' : ' ', 531 | b'\xa1' : '!', 532 | b'\xa2' : 'c', 533 | b'\xa3' : 'GBP', 534 | b'\xa4' : '$', #This approximation is especially parochial--this is the 535 | #generic currency symbol. 
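# The remaining entries cover the printable ISO-Latin-1 range: currency and
# punctuation signs get rough ASCII stand-ins, and accented letters are
# reduced to their unaccented base letters.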
536 | b'\xa5' : 'YEN', 537 | b'\xa6' : '|', 538 | b'\xa7' : 'S', 539 | b'\xa8' : '..', 540 | b'\xa9' : '', 541 | b'\xaa' : '(th)', 542 | b'\xab' : '<<', 543 | b'\xac' : '!', 544 | b'\xad' : ' ', 545 | b'\xae' : '(R)', 546 | b'\xaf' : '-', 547 | b'\xb0' : 'o', 548 | b'\xb1' : '+-', 549 | b'\xb2' : '2', 550 | b'\xb3' : '3', 551 | b'\xb4' : ("'", 'acute'), 552 | b'\xb5' : 'u', 553 | b'\xb6' : 'P', 554 | b'\xb7' : '*', 555 | b'\xb8' : ',', 556 | b'\xb9' : '1', 557 | b'\xba' : '(th)', 558 | b'\xbb' : '>>', 559 | b'\xbc' : '1/4', 560 | b'\xbd' : '1/2', 561 | b'\xbe' : '3/4', 562 | b'\xbf' : '?', 563 | b'\xc0' : 'A', 564 | b'\xc1' : 'A', 565 | b'\xc2' : 'A', 566 | b'\xc3' : 'A', 567 | b'\xc4' : 'A', 568 | b'\xc5' : 'A', 569 | b'\xc6' : 'AE', 570 | b'\xc7' : 'C', 571 | b'\xc8' : 'E', 572 | b'\xc9' : 'E', 573 | b'\xca' : 'E', 574 | b'\xcb' : 'E', 575 | b'\xcc' : 'I', 576 | b'\xcd' : 'I', 577 | b'\xce' : 'I', 578 | b'\xcf' : 'I', 579 | b'\xd0' : 'D', 580 | b'\xd1' : 'N', 581 | b'\xd2' : 'O', 582 | b'\xd3' : 'O', 583 | b'\xd4' : 'O', 584 | b'\xd5' : 'O', 585 | b'\xd6' : 'O', 586 | b'\xd7' : '*', 587 | b'\xd8' : 'O', 588 | b'\xd9' : 'U', 589 | b'\xda' : 'U', 590 | b'\xdb' : 'U', 591 | b'\xdc' : 'U', 592 | b'\xdd' : 'Y', 593 | b'\xde' : 'b', 594 | b'\xdf' : 'B', 595 | b'\xe0' : 'a', 596 | b'\xe1' : 'a', 597 | b'\xe2' : 'a', 598 | b'\xe3' : 'a', 599 | b'\xe4' : 'a', 600 | b'\xe5' : 'a', 601 | b'\xe6' : 'ae', 602 | b'\xe7' : 'c', 603 | b'\xe8' : 'e', 604 | b'\xe9' : 'e', 605 | b'\xea' : 'e', 606 | b'\xeb' : 'e', 607 | b'\xec' : 'i', 608 | b'\xed' : 'i', 609 | b'\xee' : 'i', 610 | b'\xef' : 'i', 611 | b'\xf0' : 'o', 612 | b'\xf1' : 'n', 613 | b'\xf2' : 'o', 614 | b'\xf3' : 'o', 615 | b'\xf4' : 'o', 616 | b'\xf5' : 'o', 617 | b'\xf6' : 'o', 618 | b'\xf7' : '/', 619 | b'\xf8' : 'o', 620 | b'\xf9' : 'u', 621 | b'\xfa' : 'u', 622 | b'\xfb' : 'u', 623 | b'\xfc' : 'u', 624 | b'\xfd' : 'y', 625 | b'\xfe' : 'b', 626 | b'\xff' : 'y', 627 | } 628 | 629 | # A map used when removing rogue Windows-1252/ISO-8859-1 630 | # characters in otherwise UTF-8 documents. 631 | # 632 | # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in 633 | # Windows-1252. 
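# Keys are raw byte values (ints); values are the UTF-8 encodings of the
# corresponding characters, so detwingle() below can splice a replacement
# directly into an otherwise valid UTF-8 byte string.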
634 | WINDOWS_1252_TO_UTF8 = { 635 | 0x80 : b'\xe2\x82\xac', # € 636 | 0x82 : b'\xe2\x80\x9a', # ‚ 637 | 0x83 : b'\xc6\x92', # ƒ 638 | 0x84 : b'\xe2\x80\x9e', # „ 639 | 0x85 : b'\xe2\x80\xa6', # … 640 | 0x86 : b'\xe2\x80\xa0', # † 641 | 0x87 : b'\xe2\x80\xa1', # ‡ 642 | 0x88 : b'\xcb\x86', # ˆ 643 | 0x89 : b'\xe2\x80\xb0', # ‰ 644 | 0x8a : b'\xc5\xa0', # Š 645 | 0x8b : b'\xe2\x80\xb9', # ‹ 646 | 0x8c : b'\xc5\x92', # Œ 647 | 0x8e : b'\xc5\xbd', # Ž 648 | 0x91 : b'\xe2\x80\x98', # ‘ 649 | 0x92 : b'\xe2\x80\x99', # ’ 650 | 0x93 : b'\xe2\x80\x9c', # “ 651 | 0x94 : b'\xe2\x80\x9d', # ” 652 | 0x95 : b'\xe2\x80\xa2', # • 653 | 0x96 : b'\xe2\x80\x93', # – 654 | 0x97 : b'\xe2\x80\x94', # — 655 | 0x98 : b'\xcb\x9c', # ˜ 656 | 0x99 : b'\xe2\x84\xa2', # ™ 657 | 0x9a : b'\xc5\xa1', # š 658 | 0x9b : b'\xe2\x80\xba', # › 659 | 0x9c : b'\xc5\x93', # œ 660 | 0x9e : b'\xc5\xbe', # ž 661 | 0x9f : b'\xc5\xb8', # Ÿ 662 | 0xa0 : b'\xc2\xa0', #   663 | 0xa1 : b'\xc2\xa1', # ¡ 664 | 0xa2 : b'\xc2\xa2', # ¢ 665 | 0xa3 : b'\xc2\xa3', # £ 666 | 0xa4 : b'\xc2\xa4', # ¤ 667 | 0xa5 : b'\xc2\xa5', # ¥ 668 | 0xa6 : b'\xc2\xa6', # ¦ 669 | 0xa7 : b'\xc2\xa7', # § 670 | 0xa8 : b'\xc2\xa8', # ¨ 671 | 0xa9 : b'\xc2\xa9', # © 672 | 0xaa : b'\xc2\xaa', # ª 673 | 0xab : b'\xc2\xab', # « 674 | 0xac : b'\xc2\xac', # ¬ 675 | 0xad : b'\xc2\xad', # ­ 676 | 0xae : b'\xc2\xae', # ® 677 | 0xaf : b'\xc2\xaf', # ¯ 678 | 0xb0 : b'\xc2\xb0', # ° 679 | 0xb1 : b'\xc2\xb1', # ± 680 | 0xb2 : b'\xc2\xb2', # ² 681 | 0xb3 : b'\xc2\xb3', # ³ 682 | 0xb4 : b'\xc2\xb4', # ´ 683 | 0xb5 : b'\xc2\xb5', # µ 684 | 0xb6 : b'\xc2\xb6', # ¶ 685 | 0xb7 : b'\xc2\xb7', # · 686 | 0xb8 : b'\xc2\xb8', # ¸ 687 | 0xb9 : b'\xc2\xb9', # ¹ 688 | 0xba : b'\xc2\xba', # º 689 | 0xbb : b'\xc2\xbb', # » 690 | 0xbc : b'\xc2\xbc', # ¼ 691 | 0xbd : b'\xc2\xbd', # ½ 692 | 0xbe : b'\xc2\xbe', # ¾ 693 | 0xbf : b'\xc2\xbf', # ¿ 694 | 0xc0 : b'\xc3\x80', # À 695 | 0xc1 : b'\xc3\x81', # Á 696 | 0xc2 : b'\xc3\x82', #  697 | 0xc3 : b'\xc3\x83', # à 698 | 0xc4 : b'\xc3\x84', # Ä 699 | 0xc5 : b'\xc3\x85', # Å 700 | 0xc6 : b'\xc3\x86', # Æ 701 | 0xc7 : b'\xc3\x87', # Ç 702 | 0xc8 : b'\xc3\x88', # È 703 | 0xc9 : b'\xc3\x89', # É 704 | 0xca : b'\xc3\x8a', # Ê 705 | 0xcb : b'\xc3\x8b', # Ë 706 | 0xcc : b'\xc3\x8c', # Ì 707 | 0xcd : b'\xc3\x8d', # Í 708 | 0xce : b'\xc3\x8e', # Î 709 | 0xcf : b'\xc3\x8f', # Ï 710 | 0xd0 : b'\xc3\x90', # Ð 711 | 0xd1 : b'\xc3\x91', # Ñ 712 | 0xd2 : b'\xc3\x92', # Ò 713 | 0xd3 : b'\xc3\x93', # Ó 714 | 0xd4 : b'\xc3\x94', # Ô 715 | 0xd5 : b'\xc3\x95', # Õ 716 | 0xd6 : b'\xc3\x96', # Ö 717 | 0xd7 : b'\xc3\x97', # × 718 | 0xd8 : b'\xc3\x98', # Ø 719 | 0xd9 : b'\xc3\x99', # Ù 720 | 0xda : b'\xc3\x9a', # Ú 721 | 0xdb : b'\xc3\x9b', # Û 722 | 0xdc : b'\xc3\x9c', # Ü 723 | 0xdd : b'\xc3\x9d', # Ý 724 | 0xde : b'\xc3\x9e', # Þ 725 | 0xdf : b'\xc3\x9f', # ß 726 | 0xe0 : b'\xc3\xa0', # à 727 | 0xe1 : b'\xa1', # á 728 | 0xe2 : b'\xc3\xa2', # â 729 | 0xe3 : b'\xc3\xa3', # ã 730 | 0xe4 : b'\xc3\xa4', # ä 731 | 0xe5 : b'\xc3\xa5', # å 732 | 0xe6 : b'\xc3\xa6', # æ 733 | 0xe7 : b'\xc3\xa7', # ç 734 | 0xe8 : b'\xc3\xa8', # è 735 | 0xe9 : b'\xc3\xa9', # é 736 | 0xea : b'\xc3\xaa', # ê 737 | 0xeb : b'\xc3\xab', # ë 738 | 0xec : b'\xc3\xac', # ì 739 | 0xed : b'\xc3\xad', # í 740 | 0xee : b'\xc3\xae', # î 741 | 0xef : b'\xc3\xaf', # ï 742 | 0xf0 : b'\xc3\xb0', # ð 743 | 0xf1 : b'\xc3\xb1', # ñ 744 | 0xf2 : b'\xc3\xb2', # ò 745 | 0xf3 : b'\xc3\xb3', # ó 746 | 0xf4 : b'\xc3\xb4', # ô 747 | 0xf5 : b'\xc3\xb5', # õ 748 | 0xf6 : b'\xc3\xb6', # ö 749 | 0xf7 : b'\xc3\xb7', # ÷ 750 | 0xf8 
: b'\xc3\xb8', # ø 751 | 0xf9 : b'\xc3\xb9', # ù 752 | 0xfa : b'\xc3\xba', # ú 753 | 0xfb : b'\xc3\xbb', # û 754 | 0xfc : b'\xc3\xbc', # ü 755 | 0xfd : b'\xc3\xbd', # ý 756 | 0xfe : b'\xc3\xbe', # þ 757 | } 758 | 759 | MULTIBYTE_MARKERS_AND_SIZES = [ 760 | (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF 761 | (0xe0, 0xef, 3), # 3-byte characters start with E0-EF 762 | (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 763 | ] 764 | 765 | FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] 766 | LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] 767 | 768 | @classmethod 769 | def detwingle(cls, in_bytes, main_encoding="utf8", 770 | embedded_encoding="windows-1252"): 771 | """Fix characters from one encoding embedded in some other encoding. 772 | 773 | Currently the only situation supported is Windows-1252 (or its 774 | subset ISO-8859-1), embedded in UTF-8. 775 | 776 | The input must be a bytestring. If you've already converted 777 | the document to Unicode, you're too late. 778 | 779 | The output is a bytestring in which `embedded_encoding` 780 | characters have been converted to their `main_encoding` 781 | equivalents. 782 | """ 783 | if embedded_encoding.replace('_', '-').lower() not in ( 784 | 'windows-1252', 'windows_1252'): 785 | raise NotImplementedError( 786 | "Windows-1252 and ISO-8859-1 are the only currently supported " 787 | "embedded encodings.") 788 | 789 | if main_encoding.lower() not in ('utf8', 'utf-8'): 790 | raise NotImplementedError( 791 | "UTF-8 is the only currently supported main encoding.") 792 | 793 | byte_chunks = [] 794 | 795 | chunk_start = 0 796 | pos = 0 797 | while pos < len(in_bytes): 798 | byte = in_bytes[pos] 799 | if not isinstance(byte, int): 800 | # Python 2.x 801 | byte = ord(byte) 802 | if (byte >= cls.FIRST_MULTIBYTE_MARKER 803 | and byte <= cls.LAST_MULTIBYTE_MARKER): 804 | # This is the start of a UTF-8 multibyte character. Skip 805 | # to the end. 806 | for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: 807 | if byte >= start and byte <= end: 808 | pos += size 809 | break 810 | elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: 811 | # We found a Windows-1252 character! 812 | # Save the string up to this point as a chunk. 813 | byte_chunks.append(in_bytes[chunk_start:pos]) 814 | 815 | # Now translate the Windows-1252 character into UTF-8 816 | # and add it as another, one-byte chunk. 817 | byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) 818 | pos += 1 819 | chunk_start = pos 820 | else: 821 | # Go on to the next character. 822 | pos += 1 823 | if chunk_start == 0: 824 | # The string is unchanged. 825 | return in_bytes 826 | else: 827 | # Store the final chunk. 
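# (everything after the last Windows-1252 byte that was replaced).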
828 | byte_chunks.append(in_bytes[chunk_start:]) 829 | return b''.join(byte_chunks) 830 | 831 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/diagnose.py: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | 3 | import sys 4 | 5 | if sys.version_info[0] < 3: 6 | 7 | from StringIO import StringIO 8 | from HTMLParser import HTMLParser 9 | from pylinkvalidator.included.bs4 import BeautifulSoup, __version__ 10 | from pylinkvalidator.included.bs4.builder import builder_registry 11 | import os 12 | import random 13 | import time 14 | import traceback 15 | import sys 16 | import cProfile 17 | 18 | def diagnose(data): 19 | """Diagnostic suite for isolating common problems.""" 20 | print "Diagnostic running on Beautiful Soup %s" % __version__ 21 | print "Python version %s" % sys.version 22 | 23 | basic_parsers = ["html.parser", "html5lib", "lxml"] 24 | for name in basic_parsers: 25 | for builder in builder_registry.builders: 26 | if name in builder.features: 27 | break 28 | else: 29 | basic_parsers.remove(name) 30 | print ( 31 | "I noticed that %s is not installed. Installing it may help." % 32 | name) 33 | 34 | if 'lxml' in basic_parsers: 35 | basic_parsers.append(["lxml", "xml"]) 36 | from lxml import etree 37 | print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) 38 | 39 | if 'html5lib' in basic_parsers: 40 | import html5lib 41 | print "Found html5lib version %s" % html5lib.__version__ 42 | 43 | if hasattr(data, 'read'): 44 | data = data.read() 45 | elif os.path.exists(data): 46 | print '"%s" looks like a filename. Reading data from the file.' % data 47 | data = open(data).read() 48 | elif data.startswith("http:") or data.startswith("https:"): 49 | print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data 50 | print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." 51 | return 52 | print 53 | 54 | for parser in basic_parsers: 55 | print "Trying to parse your markup with %s" % parser 56 | success = False 57 | try: 58 | soup = BeautifulSoup(data, parser) 59 | success = True 60 | except Exception, e: 61 | print "%s could not parse the markup." % parser 62 | traceback.print_exc() 63 | if success: 64 | print "Here's what %s did with the markup:" % parser 65 | print soup.prettify() 66 | 67 | print "-" * 80 68 | 69 | def lxml_trace(data, html=True): 70 | """Print out the lxml events that occur during parsing. 71 | 72 | This lets you see how lxml parses a document when no Beautiful 73 | Soup code is running. 
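    For example, lxml_trace("<p>Hello</p>") prints one line per parse event
    (by default lxml reports an 'end' event for each element), showing the
    event name, the element tag, and the element text.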
74 | """ 75 | from lxml import etree 76 | for event, element in etree.iterparse(StringIO(data), html=html): 77 | print("%s, %4s, %s" % (event, element.tag, element.text)) 78 | 79 | class AnnouncingParser(HTMLParser): 80 | """Announces HTMLParser parse events, without doing anything else.""" 81 | 82 | def _p(self, s): 83 | print(s) 84 | 85 | def handle_starttag(self, name, attrs): 86 | self._p("%s START" % name) 87 | 88 | def handle_endtag(self, name): 89 | self._p("%s END" % name) 90 | 91 | def handle_data(self, data): 92 | self._p("%s DATA" % data) 93 | 94 | def handle_charref(self, name): 95 | self._p("%s CHARREF" % name) 96 | 97 | def handle_entityref(self, name): 98 | self._p("%s ENTITYREF" % name) 99 | 100 | def handle_comment(self, data): 101 | self._p("%s COMMENT" % data) 102 | 103 | def handle_decl(self, data): 104 | self._p("%s DECL" % data) 105 | 106 | def unknown_decl(self, data): 107 | self._p("%s UNKNOWN-DECL" % data) 108 | 109 | def handle_pi(self, data): 110 | self._p("%s PI" % data) 111 | 112 | def htmlparser_trace(data): 113 | """Print out the HTMLParser events that occur during parsing. 114 | 115 | This lets you see how HTMLParser parses a document when no 116 | Beautiful Soup code is running. 117 | """ 118 | parser = AnnouncingParser() 119 | parser.feed(data) 120 | 121 | _vowels = "aeiou" 122 | _consonants = "bcdfghjklmnpqrstvwxyz" 123 | 124 | def rword(length=5): 125 | "Generate a random word-like string." 126 | s = '' 127 | for i in range(length): 128 | if i % 2 == 0: 129 | t = _consonants 130 | else: 131 | t = _vowels 132 | s += random.choice(t) 133 | return s 134 | 135 | def rsentence(length=4): 136 | "Generate a random sentence-like string." 137 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 138 | 139 | def rdoc(num_elements=1000): 140 | """Randomly generate an invalid HTML document.""" 141 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 142 | elements = [] 143 | for i in range(num_elements): 144 | choice = random.randint(0,3) 145 | if choice == 0: 146 | # New tag. 147 | tag_name = random.choice(tag_names) 148 | elements.append("<%s>" % tag_name) 149 | elif choice == 1: 150 | elements.append(rsentence(random.randint(1,4))) 151 | elif choice == 2: 152 | # Close a tag. 153 | tag_name = random.choice(tag_names) 154 | elements.append("" % tag_name) 155 | return "" + "\n".join(elements) + "" 156 | 157 | def benchmark_parsers(num_elements=100000): 158 | """Very basic head-to-head performance benchmark.""" 159 | print "Comparative parser benchmark on Beautiful Soup %s" % __version__ 160 | data = rdoc(num_elements) 161 | print "Generated a large invalid HTML document (%d bytes)." % len(data) 162 | 163 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 164 | success = False 165 | try: 166 | a = time.time() 167 | soup = BeautifulSoup(data, parser) 168 | b = time.time() 169 | success = True 170 | except Exception, e: 171 | print "%s could not parse the markup." % parser 172 | traceback.print_exc() 173 | if success: 174 | print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) 175 | 176 | from lxml import etree 177 | a = time.time() 178 | etree.HTML(data) 179 | b = time.time() 180 | print "Raw lxml parsed the markup in %.2fs." 
% (b-a) 181 | 182 | if __name__ == '__main__': 183 | diagnose(sys.stdin.read()) 184 | -------------------------------------------------------------------------------- /pylinkvalidator/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Contains the crawling models. We use namedtuple for most models (easier to 4 | pickle, lower footprint, indicates that it is immutable) and we use classes for 5 | objects with mutable states and helper methods. 6 | 7 | Classes with crawling logic are declared in the crawler module. 8 | """ 9 | from __future__ import unicode_literals, absolute_import 10 | 11 | from collections import namedtuple, Mapping, defaultdict 12 | from optparse import OptionParser, OptionGroup 13 | import re 14 | 15 | from pylinkvalidator.included.bs4 import BeautifulSoup 16 | from pylinkvalidator.compat import get_safe_str 17 | from pylinkvalidator.urlutil import ( 18 | get_clean_url_split, get_absolute_url_split) 19 | 20 | 21 | PREFIX_ALL = "*" 22 | 23 | REGEX_CONTENT = "regex:" 24 | 25 | 26 | def namedtuple_with_defaults(typename, field_names, default_values=[]): 27 | """Creates a namedtuple with default values so they don't have to be 28 | provided for each argument. 29 | """ 30 | T = namedtuple(typename, field_names) 31 | 32 | # Set None everywhere 33 | T.__new__.__defaults__ = (None,) * len(T._fields) 34 | 35 | # Set provided default values 36 | if isinstance(default_values, Mapping): 37 | prototype = T(**default_values) 38 | else: 39 | prototype = T(*default_values) 40 | T.__new__.__defaults__ = tuple(prototype) 41 | 42 | # Return new type 43 | return T 44 | 45 | 46 | DEFAULT_TYPES = ['a', 'img', 'script', 'link'] 47 | 48 | 49 | TYPE_ATTRIBUTES = { 50 | 'a': 'href', 51 | 'img': 'src', 52 | 'script': 'src', 53 | 'link': 'href', 54 | } 55 | 56 | 57 | DEFAULT_TIMEOUT = 10 58 | 59 | 60 | MODE_THREAD = "thread" 61 | MODE_PROCESS = "process" 62 | MODE_GREEN = "green" 63 | 64 | 65 | DEFAULT_WORKERS = { 66 | MODE_THREAD: 1, 67 | MODE_PROCESS: 1, 68 | MODE_GREEN: 1000, 69 | } 70 | 71 | 72 | PARSER_STDLIB = "html.parser" 73 | PARSER_LXML = "lxml" 74 | PARSER_HTML5 = "html5lib" 75 | 76 | # TODO Add support for gumbo. Will require some refactoring of the parsing 77 | # logic. 78 | # PARSER_GUMBO = "gumbo" 79 | 80 | 81 | FORMAT_PLAIN = "plain" 82 | FORMAT_HTML = "html" 83 | FORMAT_JSON = "json" 84 | 85 | 86 | WHEN_ALWAYS = "always" 87 | WHEN_ON_ERROR = "error" 88 | 89 | 90 | REPORT_TYPE_ERRORS = "errors" 91 | REPORT_TYPE_SUMMARY = "summary" 92 | REPORT_TYPE_ALL = "all" 93 | 94 | 95 | VERBOSE_QUIET = "0" 96 | VERBOSE_NORMAL = "1" 97 | VERBOSE_INFO = "2" 98 | 99 | 100 | HTML_MIME_TYPE = "text/html" 101 | 102 | 103 | PAGE_QUEUED = '__PAGE_QUEUED__' 104 | PAGE_CRAWLED = '__PAGE_CRAWLED__' 105 | 106 | # Note: we use namedtuple to exchange data with workers because they are 107 | # immutable and easy to pickle (as opposed to a class). 
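# As a small illustration (hypothetical names, not part of the crawler), a
# type built with namedtuple_with_defaults() can be instantiated with only
# the fields that matter for a given message; the rest fall back to their
# defaults:
#
#     >>> Point = namedtuple_with_defaults("Point", ["x", "y"], {"y": 0})
#     >>> Point(x=1)
#     Point(x=1, y=0)
#     >>> Point()
#     Point(x=None, y=0)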
108 | 109 | WorkerInit = namedtuple_with_defaults( 110 | "WorkerInit", 111 | ["worker_config", "input_queue", "output_queue", "logger"]) 112 | 113 | 114 | WorkerConfig = namedtuple_with_defaults( 115 | "WorkerConfig", 116 | ["username", "password", "types", "timeout", "parser", "strict_mode", 117 | "prefer_server_encoding", "extra_headers", "ignore_bad_tel_urls", 118 | "allow_insecure_content"]) 119 | 120 | 121 | WorkerInput = namedtuple_with_defaults( 122 | "WorkerInput", 123 | ["url_split", "should_crawl", "depth", "site_origin", "content_check"]) 124 | 125 | 126 | Response = namedtuple_with_defaults( 127 | "Response", ["content", "status", "exception", "original_url", 128 | "final_url", "is_redirect", "is_timeout", "response_time"]) 129 | 130 | 131 | ExceptionStr = namedtuple_with_defaults( 132 | "ExceptionStr", ["type_name", "message"]) 133 | 134 | 135 | Link = namedtuple_with_defaults( 136 | "Link", 137 | ["type", "url_split", "original_url_split", "source_str"]) 138 | 139 | 140 | PageCrawl = namedtuple_with_defaults( 141 | "PageCrawl", ["original_url_split", "final_url_split", 142 | "status", "is_timeout", "is_redirect", "links", 143 | "exception", "is_html", "depth", "response_time", 144 | "process_time", "site_origin", "missing_content", 145 | "erroneous_content"]) 146 | 147 | 148 | PageStatus = namedtuple_with_defaults( 149 | "PageStatus", ["status", "sources"]) 150 | 151 | 152 | PageSource = namedtuple_with_defaults( 153 | "PageSource", ["origin", "origin_str"]) 154 | 155 | 156 | ContentCheck = namedtuple_with_defaults( 157 | "ContentCheck", 158 | ["html_presence", "html_absence", "text_presence", "text_absence", 159 | "has_something_to_check"]) 160 | 161 | HTMLCheck = namedtuple_with_defaults( 162 | "HTMLCheck", ["tag", "attrs", "content"]) 163 | 164 | 165 | class UTF8Class(object): 166 | """Handles unicode string from __unicode__() in: __str__() and __repr__() 167 | """ 168 | def __str__(self): 169 | return get_safe_str(self.__unicode__()) 170 | 171 | def __repr__(self): 172 | return get_safe_str(self.__unicode__()) 173 | 174 | 175 | class LazyLogParam(object): 176 | """Lazy Log Parameter that is only evaluated if the logging statement 177 | is printed""" 178 | 179 | def __init__(self, func): 180 | self.func = func 181 | 182 | def __str__(self): 183 | return str(self.func()) 184 | 185 | 186 | class Config(UTF8Class): 187 | """Contains all the configuration options.""" 188 | 189 | def __init__(self): 190 | # Design note: we only use attributes when options need to be 191 | # transformed. Otherwise, we use options. 192 | self.parser = self._build_parser() 193 | self.options = None 194 | self.start_urls = [] 195 | self.start_url_splits = [] 196 | self.worker_config = None 197 | 198 | self.accepted_hosts = [] 199 | """Set of accepted hosts. Dictionary of accepted hosts if in multi 200 | mode: key: start url host, value: set of accepted hosts.""" 201 | 202 | self.ignored_prefixes = [] 203 | self.worker_size = 0 204 | self.content_check = None 205 | 206 | def should_crawl(self, url_split, depth): 207 | """Returns True if url split is local AND depth is acceptable""" 208 | return (self.options.depth < 0 or depth < self.options.depth) and\ 209 | self.is_local(url_split) 210 | 211 | def is_local(self, url_split, site_origin=None): 212 | """Returns true if url split is in the accepted hosts. 
site_origin must 213 | be provided if multi sites mode is enabled.""" 214 | 215 | if self.options.multi and site_origin: 216 | accepted_hosts = self.accepted_hosts[site_origin] 217 | else: 218 | accepted_hosts = self.accepted_hosts 219 | 220 | return url_split.netloc in accepted_hosts 221 | 222 | def should_download(self, url_split): 223 | """Returns True if the url does not start with an ignored prefix and if 224 | it is local or outside links are allowed.""" 225 | local = self.is_local(url_split) 226 | 227 | if not self.options.test_outside and not local: 228 | return False 229 | 230 | url = url_split.geturl() 231 | 232 | for ignored_prefix in self.ignored_prefixes: 233 | if url.startswith(ignored_prefix): 234 | return False 235 | 236 | return True 237 | 238 | def parse_cli_config(self): 239 | """Builds the options and args based on the command line options.""" 240 | (self.options, self.start_urls) = self.parser.parse_args() 241 | self._parse_config() 242 | 243 | def parse_api_config(self, start_urls, options_dict=None): 244 | """Builds the options and args based on passed parameters.""" 245 | # TODO Add options 246 | options = self._get_options(options_dict) 247 | (self.options, self.start_urls) = self.parser.parse_args( 248 | options + start_urls) 249 | self._parse_config() 250 | 251 | def _get_options(self, options_dict): 252 | if not options_dict: 253 | options_dict = {} 254 | options = [] 255 | for key, value in options_dict.items(): 256 | if isinstance(value, bool) and value: 257 | options.append("--{0}".format(key)) 258 | else: 259 | options.append("--{0}={1}".format(key, value)) 260 | return options 261 | 262 | def _parse_config(self): 263 | if self.options.url_file_path: 264 | self.start_urls = self._read_start_urls(self.options.url_file_path) 265 | self._process_start_urls() 266 | 267 | self.worker_config = self._build_worker_config(self.options) 268 | self.accepted_hosts = self._build_accepted_hosts( 269 | self.options, self.start_urls) 270 | 271 | if self.options.ignored_prefixes: 272 | self.ignored_prefixes = self.options.ignored_prefixes.split(',') 273 | 274 | if self.options.workers: 275 | self.worker_size = self.options.workers 276 | else: 277 | self.worker_size = DEFAULT_WORKERS[self.options.mode] 278 | 279 | if self.options.run_once: 280 | self.options.depth = 0 281 | 282 | self.content_check = self._compute_content_check(self.options) 283 | 284 | self._add_content_check_urls(self.start_url_splits, self.content_check) 285 | 286 | def _read_start_urls(self, url_file_path): 287 | urls = [] 288 | with open(url_file_path, "r") as url_file: 289 | urls = [url for url in url_file.read().split() if url] 290 | return urls 291 | 292 | def _process_start_urls(self): 293 | for start_url in self.start_urls: 294 | self.start_url_splits.append(get_clean_url_split(start_url)) 295 | 296 | def _build_worker_config(self, options): 297 | types = options.types.split(',') 298 | for element_type in types: 299 | if element_type not in DEFAULT_TYPES: 300 | raise ValueError("This type is not supported: {0}" 301 | .format(element_type)) 302 | 303 | headers = {} 304 | if options.headers: 305 | for item in options.headers: 306 | split = item.split(":") 307 | if len(split) == 2: 308 | headers[split[0]] = split[1] 309 | 310 | return WorkerConfig( 311 | options.username, options.password, types, options.timeout, 312 | options.parser, options.strict_mode, 313 | options.prefer_server_encoding, headers, 314 | options.ignore_bad_tel_urls) 315 | 316 | def _build_accepted_hosts(self, options, start_urls): 
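        """Returns the set of accepted hosts (single-site mode) or, in
        --multi mode, a dict mapping each start URL host to its set of
        accepted hosts."""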
317 | if options.multi: 318 | return self._build_multi_hosts(options, start_urls) 319 | else: 320 | return self._build_single_hosts(options, start_urls) 321 | 322 | def _build_multi_hosts(self, options, start_urls): 323 | hosts = {} 324 | 325 | extra_hosts = set() 326 | if options.accepted_hosts: 327 | for url in options.accepted_hosts.split(','): 328 | split_result = get_clean_url_split(url) 329 | extra_hosts.add(split_result.netloc) 330 | 331 | for start_url in start_urls: 332 | split_result = get_clean_url_split(start_url) 333 | host = split_result.netloc 334 | hosts[host] = extra_hosts.union(host) 335 | 336 | return hosts 337 | 338 | def _build_single_hosts(self, options, start_urls): 339 | hosts = set() 340 | urls = [] 341 | 342 | if options.accepted_hosts: 343 | urls = options.accepted_hosts.split(',') 344 | urls = urls + start_urls 345 | 346 | for url in urls: 347 | split_result = get_clean_url_split(url) 348 | hosts.add(split_result.netloc) 349 | 350 | return hosts 351 | 352 | def _compute_content_check(self, options): 353 | html_presence = defaultdict(list) 354 | html_absence = defaultdict(list) 355 | raw_presence = defaultdict(list) 356 | raw_absence = defaultdict(list) 357 | self._compute_single_content_check( 358 | options.content_presence, html_presence, 359 | raw_presence, PREFIX_ALL) 360 | self._compute_single_content_check( 361 | options.content_absence, html_absence, 362 | raw_absence, PREFIX_ALL) 363 | self._compute_single_content_check( 364 | options.content_presence_once, html_presence, 365 | raw_presence) 366 | self._compute_single_content_check( 367 | options.content_absence_once, html_absence, 368 | raw_absence) 369 | 370 | has_something_to_check = bool( 371 | html_presence or html_absence or raw_presence or raw_absence) 372 | 373 | return ContentCheck( 374 | html_presence, html_absence, raw_presence, raw_absence, 375 | has_something_to_check) 376 | 377 | def _add_content_check_urls(self, start_urls, content_check): 378 | self._add_urls_from_single_content_check( 379 | start_urls, content_check.html_presence) 380 | self._add_urls_from_single_content_check( 381 | start_urls, content_check.html_absence) 382 | self._add_urls_from_single_content_check( 383 | start_urls, content_check.text_presence) 384 | self._add_urls_from_single_content_check( 385 | start_urls, content_check.text_absence) 386 | 387 | def _add_urls_from_single_content_check( 388 | self, start_urls, single_content_check): 389 | for key in single_content_check.keys(): 390 | if key == PREFIX_ALL: 391 | continue 392 | if key.netloc and key not in start_urls: 393 | start_urls.append(key) 394 | else: 395 | for url_split in start_urls: 396 | new_url = get_absolute_url_split( 397 | key.geturl(), url_split) 398 | if new_url not in start_urls: 399 | start_urls.append(new_url) 400 | 401 | def _compute_single_content_check( 402 | self, content_list, html_dict, raw_dict, prefix=None): 403 | if not content_list: 404 | # Catch None 405 | return 406 | 407 | for content in content_list: 408 | temp_prefix, content = self._get_prefix_content(content, prefix) 409 | content = content.strip() 410 | if content.startswith("<"): 411 | # html.parser because we do not want to automatically create 412 | # surrounding tags 413 | soup = BeautifulSoup(content, "html.parser") 414 | children = list(soup.children) 415 | if children: 416 | child = children[0] 417 | string = child.string 418 | if child.string and child.string.startswith(REGEX_CONTENT): 419 | string = re.compile(child.string[len(REGEX_CONTENT):], 420 | re.MULTILINE) 421 | 
html_check = HTMLCheck( 422 | child.name, child.attrs, string) 423 | html_dict[temp_prefix].append(html_check) 424 | else: 425 | if content and content.startswith(REGEX_CONTENT): 426 | content = re.compile(content[len(REGEX_CONTENT):], 427 | re.MULTILINE) 428 | raw_dict[temp_prefix].append(content) 429 | 430 | def _get_prefix_content(self, content, prefix=None): 431 | if not prefix: 432 | index = content.find(",") 433 | prefix = get_clean_url_split(content[:index]) 434 | content = content[index+1:] 435 | 436 | return (prefix, content) 437 | 438 | def _build_parser(self): 439 | # avoid circular references 440 | import pylinkvalidator 441 | version = pylinkvalidator.__version__ 442 | 443 | parser = OptionParser( 444 | usage="%prog [options] URL ...", 445 | version="%prog {0}".format(version)) 446 | 447 | parser.add_option( 448 | "-V", "--verbose", dest="verbose", action="store", 449 | default=VERBOSE_QUIET, choices=[VERBOSE_QUIET, VERBOSE_NORMAL, 450 | VERBOSE_INFO]) 451 | 452 | crawler_group = OptionGroup( 453 | parser, "Crawler Options", 454 | "These options modify the way the crawler traverses the site.") 455 | crawler_group.add_option( 456 | "-O", "--test-outside", dest="test_outside", 457 | action="store_true", default=False, 458 | help="fetch resources from other domains without crawling them") 459 | crawler_group.add_option( 460 | "-H", "--accepted-hosts", 461 | dest="accepted_hosts", action="store", default=None, 462 | help="comma-separated list of additional hosts to crawl (e.g., " 463 | "example.com,subdomain.another.com)") 464 | crawler_group.add_option( 465 | "-i", "--ignore", dest="ignored_prefixes", 466 | action="store", default=None, 467 | help="comma-separated list of host/path prefixes to ignore " 468 | "(e.g., www.example.com/ignore_this_and_after/)") 469 | crawler_group.add_option( 470 | "-b", "--ignore-bad-tel-urls", dest="ignore_bad_tel_urls", 471 | action="store_true", default=False, 472 | help="ignore badly formed tel URLs missing the leading + sign, " 473 | "e.g., tel:1234567890 - only necessary for Python > 2.6") 474 | crawler_group.add_option( 475 | "-u", "--username", dest="username", 476 | action="store", default=None, 477 | help="username to use with basic HTTP authentication") 478 | crawler_group.add_option( 479 | "-p", "--password", dest="password", 480 | action="store", default=None, 481 | help="password to use with basic HTTP authentication") 482 | crawler_group.add_option( 483 | "-M", "--multi", dest="multi", 484 | action="store_true", default=False, 485 | help="each argument is considered to be a different site") 486 | crawler_group.add_option( 487 | "-D", "--header", 488 | dest="headers", action="append", metavar="HEADER", 489 | help="custom header of the form Header: Value " 490 | "(repeat for multiple headers)") 491 | crawler_group.add_option( 492 | "--url-file-path", dest="url_file_path", 493 | action="store", default=None, 494 | help="get starting URLs from a line-separated file") 495 | # crawler_group.add_option("-U", "--unique", dest="unique", 496 | # action="store_true", default=False) 497 | crawler_group.add_option( 498 | "-t", "--types", dest="types", action="store", 499 | default=",".join(DEFAULT_TYPES), 500 | help="Comma-separated values of tags to look for when crawling" 501 | "a site. 
Default (and supported types): a,img,link,script") 502 | crawler_group.add_option( 503 | "-T", "--timeout", dest="timeout", 504 | type="int", action="store", default=DEFAULT_TIMEOUT, 505 | help="Seconds to wait before considering that a page timed out") 506 | crawler_group.add_option( 507 | "-C", "--strict", dest="strict_mode", 508 | action="store_true", default=False, 509 | help="Does not strip href and src attributes from whitespaces") 510 | crawler_group.add_option( 511 | "-P", "--progress", dest="progress", 512 | action="store_true", default=False, 513 | help="Prints crawler progress in the console") 514 | crawler_group.add_option( 515 | "-N", "--run-once", dest="run_once", 516 | action="store_true", default=False, 517 | help="Only crawl the first page (eq. to depth=0).") 518 | crawler_group.add_option( 519 | "-d", "--depth", dest="depth", 520 | type="int", action="store", default=-1, 521 | help="Maximum crawl depth") 522 | crawler_group.add_option( 523 | "-e", "--prefer-server-encoding", dest="prefer_server_encoding", 524 | action="store_true", default=False, 525 | help="Prefer server encoding if specified. Else detect encoding") 526 | crawler_group.add_option( 527 | "--check-presence", dest="content_presence", 528 | action="append", 529 | help="Check presence of raw or HTML content on all pages. e.g., " 530 | "regex:content. " 531 | "Content can be either regex:pattern or plain content") 532 | crawler_group.add_option( 533 | "--check-absence", dest="content_absence", 534 | action="append", 535 | help="Check absence of raw or HTML content on all pages. e.g., " 536 | "regex:content. " 537 | "Content can be either regex:pattern or plain content") 538 | crawler_group.add_option( 539 | "--check-presence-once", dest="content_presence_once", 540 | action="append", 541 | help="Check presence of raw or HTML content for one page: " 542 | "path,content, e.g.,: " 543 | "/path,regex:content. " 544 | "Content can be either regex:pattern or plain content. " 545 | "Path can be either relative or absolute with domain.") 546 | crawler_group.add_option( 547 | "--check-absence-once", dest="content_absence_once", 548 | action="append", 549 | help="Check absence of raw or HTML content for one page: " 550 | "path,content, e.g.," 551 | "path,regex:content. " 552 | "Content can be either regex:pattern or plain content. " 553 | "Path can be either relative or absolute with domain.") 554 | crawler_group.add_option( 555 | "--allow-insecure-content", dest="allow_insecure_content", 556 | action="store_true", default=False, 557 | help="Allow insecure content for HTTPS sites with certificate errors") 558 | 559 | # TODO Add follow redirect option. 
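        # A hypothetical invocation exercising a few of the options defined
        # above (show progress, crawl two levels deep, follow only <a> and
        # <img> tags, and check that every crawled page contains a non-empty
        # <title>):
        #
        #   pylinkvalidate.py -P -d 2 -t a,img \
        #       --check-presence "<title>regex:.+</title>" http://www.example.com/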
560 | 561 | parser.add_option_group(crawler_group) 562 | 563 | perf_group = OptionGroup( 564 | parser, "Performance Options", 565 | "These options can impact the performance of the crawler.") 566 | 567 | perf_group.add_option( 568 | "-w", "--workers", dest="workers", action="store", 569 | default=None, type="int", 570 | help="Number of workers to spawn") 571 | perf_group.add_option( 572 | "-m", "--mode", dest="mode", action="store", 573 | help="Types of workers: thread (default), process, or green", 574 | default=MODE_THREAD, choices=[MODE_THREAD, MODE_PROCESS, 575 | MODE_GREEN]) 576 | perf_group.add_option( 577 | "-R", "--parser", dest="parser", action="store", 578 | help="Types of HTML parse: html.parser (default), lxml, html5lib", 579 | default=PARSER_STDLIB, choices=[PARSER_STDLIB, PARSER_LXML, 580 | PARSER_HTML5]) 581 | 582 | parser.add_option_group(perf_group) 583 | 584 | output_group = OptionGroup( 585 | parser, "Output Options", 586 | "These options change the output of the crawler.") 587 | 588 | output_group.add_option( 589 | "-f", "--format", dest="format", action="store", 590 | default=FORMAT_PLAIN, choices=[FORMAT_PLAIN], 591 | help="Format of the report: plain") 592 | output_group.add_option( 593 | "-o", "--output", dest="output", action="store", 594 | default=None, 595 | help="Path of the file where the report will be printed.") 596 | output_group.add_option( 597 | "-W", "--when", dest="when", action="store", 598 | default=WHEN_ALWAYS, choices=[WHEN_ALWAYS, WHEN_ON_ERROR], 599 | help="When to print the report. error (only if a " 600 | "crawling error occurs) or always (default)") 601 | output_group.add_option( 602 | "-E", "--report-type", dest="report_type", 603 | help="Type of report to print: errors (default, summary and " 604 | "erroneous links), summary, all (summary and all links)", 605 | action="store", default=REPORT_TYPE_ERRORS, 606 | choices=[REPORT_TYPE_ERRORS, REPORT_TYPE_SUMMARY, REPORT_TYPE_ALL]) 607 | output_group.add_option( 608 | "-c", "--console", dest="console", 609 | action="store_true", default=False, 610 | help="Prints report to the console in addition to other output" 611 | " options such as file or email.") 612 | crawler_group.add_option( 613 | "-S", "--show-source", dest="show_source", 614 | action="store_true", default=False, 615 | help="Show source of links (html) in the report.") 616 | 617 | parser.add_option_group(output_group) 618 | 619 | email_group = OptionGroup( 620 | parser, "Email Options", 621 | "These options allows the crawler to send a report by email.") 622 | 623 | email_group.add_option( 624 | "-a", "--address", dest="address", action="store", 625 | default=None, 626 | help="Comma-separated list of email addresses used to send a " 627 | "report") 628 | email_group.add_option( 629 | "--from", dest="from_address", action="store", 630 | default=None, 631 | help="Email address to use in the from field of the email " 632 | "(optional)") 633 | email_group.add_option( 634 | "-s", "--smtp", dest="smtp", action="store", 635 | default=None, 636 | help="Host of the smtp server") 637 | email_group.add_option( 638 | "--port", dest="port", action="store", 639 | default=25, type="int", 640 | help="Port of the smtp server (optional)") 641 | email_group.add_option( 642 | "--tls", dest="tls", action="store_true", 643 | default=False, 644 | help="Use TLS with the email server.") 645 | email_group.add_option( 646 | "--subject", dest="subject", action="store", 647 | default=None, 648 | help="Subject of the email (optional)") 649 | email_group.add_option( 
650 | "--smtp-username", dest="smtp_username", 651 | action="store", default=None, 652 | help="Username to use with the smtp server (optional)") 653 | email_group.add_option( 654 | "--smtp-password", dest="smtp_password", 655 | action="store", default=None, 656 | help="Password to use with the smtp server (optional)") 657 | 658 | parser.add_option_group(email_group) 659 | 660 | return parser 661 | 662 | def __unicode__(self): 663 | return "Configuration - Start URLs: {0} - Options: {1}".format( 664 | self.start_urls, self.options) 665 | 666 | 667 | class SitePage(UTF8Class): 668 | """Contains the crawling result for a page. 669 | 670 | This is a class because we need to keep track of the various sources 671 | linking to this page and it must be modified as the crawl progresses. 672 | """ 673 | 674 | def __init__(self, url_split, status=200, is_timeout=False, exception=None, 675 | is_html=True, is_local=True, response_time=None, 676 | process_time=None, site_origin=None, missing_content=None, 677 | erroneous_content=None): 678 | self.url_split = url_split 679 | 680 | self.original_source = None 681 | self.sources = [] 682 | 683 | self.type = type 684 | self.status = status 685 | self.is_timeout = is_timeout 686 | self.exception = exception 687 | self.is_html = is_html 688 | self.is_local = is_local 689 | self.is_ok = status and status < 400 and not missing_content and\ 690 | not erroneous_content 691 | self.response_time = response_time 692 | self.process_time = process_time 693 | self.site_origin = site_origin 694 | 695 | if missing_content: 696 | self.missing_content = missing_content 697 | else: 698 | self.missing_content = [] 699 | 700 | if erroneous_content: 701 | self.erroneous_content = erroneous_content 702 | else: 703 | self.erroneous_content = [] 704 | 705 | def add_sources(self, page_sources): 706 | self.sources.extend(page_sources) 707 | 708 | def get_status_message(self): 709 | if self.status: 710 | if self.status < 400: 711 | return self._compute_ok_status(self.status) 712 | elif self.status == 404: 713 | return "not found (404)" 714 | else: 715 | return "error (status={0})".format(self.status) 716 | elif self.is_timeout: 717 | return "error (timeout)" 718 | elif self.exception: 719 | return "error ({0}): {1}".format( 720 | self.exception.type_name, self.exception.message) 721 | else: 722 | return "error" 723 | 724 | def _compute_ok_status(self, status_code): 725 | if self.missing_content and not self.erroneous_content: 726 | return "error ({0}) missing content".format(status_code) 727 | elif self.erroneous_content and not self.missing_content: 728 | return "error ({0}) erroneous content".format(status_code) 729 | elif self.erroneous_content and self.missing_content: 730 | return "error ({0}) missing and erroneous content".format( 731 | status_code) 732 | else: 733 | return "ok ({0})".format(self.status) 734 | 735 | def get_content_messages(self): 736 | """Gets missing and erroneous content 737 | """ 738 | messages = [ 739 | "missing content: {0}".format(content) for content in 740 | self.missing_content] + [ 741 | "erroneous content: {0}".format(content) for content in 742 | self.erroneous_content] 743 | 744 | return messages 745 | 746 | def __unicode__(self): 747 | return "Resource {0} - {1}".format( 748 | self.url_split.geturl(), self.status) 749 | -------------------------------------------------------------------------------- /pylinkvalidator/reporter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the reporting 
functions 3 | """ 4 | from __future__ import unicode_literals, absolute_import, print_function 5 | 6 | import codecs 7 | import re 8 | import smtplib 9 | import sys 10 | 11 | from email.mime.text import MIMEText 12 | 13 | from pylinkvalidator.compat import StringIO 14 | from pylinkvalidator.models import ( 15 | REPORT_TYPE_ERRORS, REPORT_TYPE_ALL, FORMAT_PLAIN) 16 | 17 | 18 | PLAIN_TEXT = "text/plain" 19 | HTML = "text/html" 20 | 21 | WHITESPACES = re.compile(r"\s+") 22 | 23 | 24 | EMAIL_HEADER = "from: {0}\r\nsubject: {1}\r\nto: {2}\r\nmime-version: 1.0\r\n"\ 25 | "content-type: {3}\r\n\r\n{4}" 26 | 27 | 28 | def close_quietly(a_file): 29 | """Closes a file and does not report an error.""" 30 | try: 31 | if a_file: 32 | a_file.close() 33 | except Exception: 34 | pass 35 | 36 | 37 | def report(site, config, total_time, logger=None): 38 | """Prints reports to console, file, and email.""" 39 | output_files = [] 40 | output_file = None 41 | email_file = None 42 | 43 | if config.options.output: 44 | output_file = codecs.open(config.options.output, "w", "utf-8") 45 | output_files.append(output_file) 46 | 47 | if config.options.smtp: 48 | email_file = StringIO() 49 | output_files.append(email_file) 50 | 51 | if config.options.console or not output_files: 52 | output_files.append(sys.stdout) 53 | 54 | try: 55 | if config.options.format == FORMAT_PLAIN: 56 | _write_plain_text_report(site, config, output_files, total_time) 57 | except Exception: 58 | if logger: 59 | logger.exception("An exception occurred while writing the report") 60 | 61 | if output_file: 62 | close_quietly(output_file) 63 | 64 | if email_file: 65 | send_email(email_file, site, config) 66 | 67 | 68 | def _write_plain_text_report(site, config, output_files, total_time): 69 | if config.options.multi: 70 | _write_plain_text_report_multi(site, config, output_files, total_time) 71 | else: 72 | _write_plain_text_report_single(site, config, output_files, total_time) 73 | 74 | 75 | def _write_plain_text_report_multi(site, config, output_files, total_time): 76 | total_urls = len(site.pages) 77 | total_errors = len(site.error_pages) 78 | 79 | if not site.is_ok: 80 | global_status = "ERROR" 81 | error_summary = "with {0} error(s) ".format(total_errors) 82 | else: 83 | global_status = "SUCCESS" 84 | error_summary = "" 85 | 86 | try: 87 | avg_response_time = site.get_average_response_time() 88 | avg_process_time = site.get_average_process_time() 89 | 90 | oprint( 91 | "{0} Crawled {1} urls {2}from {4} sites in {3:.2f} seconds" 92 | .format( 93 | global_status, total_urls, error_summary, total_time, 94 | len(site.start_url_splits)), 95 | files=output_files) 96 | 97 | oprint(" average response time: {0:.2f} seconds".format( 98 | avg_response_time), files=output_files) 99 | 100 | oprint(" average process time: {0:.2f} seconds".format( 101 | avg_process_time), files=output_files) 102 | 103 | pages = {} 104 | 105 | if config.options.report_type == REPORT_TYPE_ERRORS: 106 | pages = site.multi_error_pages 107 | elif config.options.report_type == REPORT_TYPE_ALL: 108 | pages = site.multi_pages 109 | 110 | for domain, pages_dict in pages.items(): 111 | if pages_dict: 112 | oprint( 113 | "\n\n Start Domain: {0}".format(domain), 114 | files=output_files) 115 | 116 | _print_details(pages_dict.values(), output_files, config, 4) 117 | except Exception: 118 | from traceback import print_exc 119 | print_exc() 120 | 121 | 122 | def _write_plain_text_report_single(site, config, output_files, total_time): 123 | start_urls = ",".join((start_url_split.geturl() 
for start_url_split in 124 | site.start_url_splits)) 125 | 126 | total_urls = len(site.pages) 127 | total_errors = len(site.error_pages) 128 | 129 | if not site.is_ok: 130 | global_status = "ERROR" 131 | error_summary = "with {0} error(s) ".format(total_errors) 132 | else: 133 | global_status = "SUCCESS" 134 | error_summary = "" 135 | 136 | try: 137 | avg_response_time = site.get_average_response_time() 138 | avg_process_time = site.get_average_process_time() 139 | 140 | oprint("{0} Crawled {1} urls {2}in {3:.2f} seconds".format( 141 | global_status, total_urls, error_summary, total_time), 142 | files=output_files) 143 | 144 | oprint(" average response time: {0:.2f} seconds".format( 145 | avg_response_time), files=output_files) 146 | 147 | oprint(" average process time: {0:.2f} seconds".format( 148 | avg_process_time), files=output_files) 149 | 150 | except Exception: 151 | from traceback import print_exc 152 | print_exc() 153 | 154 | pages = {} 155 | 156 | if config.options.report_type == REPORT_TYPE_ERRORS: 157 | pages = site.error_pages 158 | elif config.options.report_type == REPORT_TYPE_ALL: 159 | pages = site.pages 160 | 161 | if pages: 162 | oprint("\n Start URL(s): {0}".format(start_urls), files=output_files) 163 | _print_details(pages.values(), output_files, config) 164 | 165 | 166 | def _print_details(page_iterator, output_files, config, indent=2): 167 | initial_indent = " " * indent 168 | for page in page_iterator: 169 | oprint("\n{2}{0}: {1}".format( 170 | page.get_status_message(), page.url_split.geturl(), 171 | initial_indent), 172 | files=output_files) 173 | for content_message in page.get_content_messages(): 174 | oprint("{1} {0}".format(content_message, initial_indent), 175 | files=output_files) 176 | for source in page.sources: 177 | oprint("{1} from {0}".format( 178 | source.origin.geturl(), initial_indent), files=output_files) 179 | if config.options.show_source: 180 | oprint("{1} {0}".format( 181 | truncate(source.origin_str), initial_indent), 182 | files=output_files) 183 | 184 | 185 | def oprint(message, files): 186 | """Prints to a sequence of files.""" 187 | for file in files: 188 | print(message, file=file) 189 | 190 | 191 | def truncate(value, size=72): 192 | """Truncates a string if its length is higher than size.""" 193 | value = value.replace("\n", " ").replace("\r", "").replace("\t", " ") 194 | value = value.strip() 195 | value = WHITESPACES.sub(" ", value) 196 | 197 | if len(value) > size: 198 | value = "{0}...".format(value[:size-3]) 199 | 200 | return value 201 | 202 | 203 | def send_email(email_file, site, config): 204 | options = config.options 205 | if options.subject: 206 | subject = options.subject 207 | else: 208 | if site.is_ok: 209 | subject = "SUCCESS - {0}".format(site.start_url_splits[0].geturl()) 210 | else: 211 | subject = "ERROR - {0}".format(site.start_url_splits[0].geturl()) 212 | 213 | if options.from_address: 214 | from_address = options.from_address 215 | else: 216 | from_address = "pylinkvalidator@localhost" 217 | 218 | if not options.address: 219 | print("Email address must be specified when using smtp.") 220 | sys.exit(1) 221 | 222 | addresses = options.address.split(",") 223 | 224 | msg = MIMEText(email_file.getvalue(), 'plain', "UTF-8") 225 | 226 | msg['From'] = from_address 227 | msg['To'] = ", ".join(addresses) 228 | msg['Subject'] = subject 229 | 230 | smtpserver = smtplib.SMTP(options.smtp, options.port) 231 | 232 | if options.tls: 233 | smtpserver.ehlo() 234 | smtpserver.starttls() 235 | smtpserver.ehlo 236 | 237 | if 
options.smtp_username and options.smtp_password: 238 | smtpserver.login(options.smtp_username, options.smtp_password) 239 | 240 | smtpserver.sendmail(from_address, addresses, msg.as_string()) 241 | 242 | smtpserver.quit() 243 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/a.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello World

4 | 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/alone.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test URL 7 | Mail me 8 | Call me 9 | 10 | 11 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/badtel.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Go to next page 4 |

Go to good tel link 5 |

Go to bad tel link 6 |

7 | 8 | 9 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/c.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/d.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/0.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 1 4 | 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/0b.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0 4 | 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 3 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 4 | 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/root.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0 4 | 0b 5 | 6 | 7 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/f.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Nothing 4 | Nothing 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test hash 7 | Test name 8 | Test A 9 | Test B 10 | Test C 11 | Test D 12 | Test External 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Crawl-delay: 1 3 | Disallow: 4 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/sub/b.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | E 4 | F 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/sub/e.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/sub/small_image.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartdag/pylinkvalidator/aac5934d88a9c99d0e4f40a8884ad942b6b10ea0/pylinkvalidator/testfiles/sub/small_image.gif 
-------------------------------------------------------------------------------- /pylinkvalidator/testfiles/sub/style.css: -------------------------------------------------------------------------------- 1 | a { 2 | color: #00ff00; 3 | } -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/sub/test.js: -------------------------------------------------------------------------------- 1 | document.write('Hello World'); -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/à.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello World

4 | 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/é.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Go to next page 4 |

5 | 6 | 7 | -------------------------------------------------------------------------------- /pylinkvalidator/tests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Unit and integration tests for pylinkvalidator 4 | """ 5 | from __future__ import unicode_literals, absolute_import 6 | 7 | import os 8 | import logging 9 | import sys 10 | from tempfile import mkstemp 11 | import time 12 | import threading 13 | import unittest 14 | 15 | from pylinkvalidator import api 16 | import pylinkvalidator.compat as compat 17 | from pylinkvalidator.compat import ( 18 | SocketServer, SimpleHTTPServer, get_url_open, get_url_request) 19 | from pylinkvalidator.crawler import ( 20 | open_url, PageCrawler, WORK_DONE, ThreadSiteCrawler, ProcessSiteCrawler, 21 | get_logger) 22 | from pylinkvalidator.models import ( 23 | Config, WorkerInit, WorkerConfig, WorkerInput, PARSER_STDLIB) 24 | from pylinkvalidator.urlutil import get_clean_url_split, get_absolute_url_split 25 | 26 | 27 | TEST_FILES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 28 | 'testfiles') 29 | 30 | # Quiet all logging 31 | logging.basicConfig(level=logging.CRITICAL) 32 | 33 | 34 | # UTILITY CLASSES AND FUNCTIONS ### 35 | 36 | class ThreadedTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer): 37 | pass 38 | 39 | 40 | def start_http_server(): 41 | """Starts a simple http server for the test files""" 42 | # For the http handler 43 | os.chdir(TEST_FILES_DIR) 44 | handler = SimpleHTTPServer.SimpleHTTPRequestHandler 45 | handler.extensions_map['.html'] = 'text/html; charset=UTF-8' 46 | httpd = ThreadedTCPServer(("localhost", 0), handler) 47 | ip, port = httpd.server_address 48 | 49 | httpd_thread = threading.Thread(target=httpd.serve_forever) 50 | httpd_thread.setDaemon(True) 51 | httpd_thread.start() 52 | 53 | return (ip, port, httpd, httpd_thread) 54 | 55 | 56 | def has_multiprocessing(): 57 | has_multi = False 58 | 59 | try: 60 | import multiprocessing # noqa 61 | has_multi = True 62 | except Exception: 63 | pass 64 | 65 | return has_multi 66 | 67 | 68 | def has_gevent(): 69 | has_gevent = False 70 | 71 | try: 72 | import gevent # noqa 73 | has_gevent = True 74 | except Exception: 75 | pass 76 | 77 | return has_gevent 78 | 79 | 80 | # UNIT AND INTEGRATION TESTS ### 81 | 82 | 83 | class ConfigTest(unittest.TestCase): 84 | 85 | def setUp(self): 86 | self.argv = sys.argv 87 | 88 | def tearDown(self): 89 | sys.argv = self.argv 90 | 91 | def test_accepted_hosts(self): 92 | sys.argv = ['pylinkvalidator', 'http://www.example.com/'] 93 | config = Config() 94 | config.parse_cli_config() 95 | self.assertTrue('www.example.com' in config.accepted_hosts) 96 | 97 | sys.argv = ['pylinkvalidator', '-H', 'www.example.com', 98 | 'http://example.com', 'foo.com', 'http://www.example.com/', 99 | 'baz.com'] 100 | config = Config() 101 | config.parse_cli_config() 102 | 103 | self.assertTrue('www.example.com' in config.accepted_hosts) 104 | self.assertTrue('example.com' in config.accepted_hosts) 105 | self.assertTrue('foo.com' in config.accepted_hosts) 106 | self.assertTrue('baz.com' in config.accepted_hosts) 107 | 108 | 109 | class URLUtilTest(unittest.TestCase): 110 | 111 | def test_clean_url_split(self): 112 | self.assertEqual( 113 | "http://www.example.com", 114 | get_clean_url_split("www.example.com").geturl()) 115 | self.assertEqual( 116 | "http://www.example.com", 117 | get_clean_url_split("//www.example.com").geturl()) 118 | self.assertEqual( 119 | 
"http://www.example.com", 120 | get_clean_url_split("http://www.example.com").geturl()) 121 | 122 | self.assertEqual( 123 | "http://www.example.com/", 124 | get_clean_url_split("www.example.com/").geturl()) 125 | self.assertEqual( 126 | "http://www.example.com/", 127 | get_clean_url_split("//www.example.com/").geturl()) 128 | self.assertEqual( 129 | "http://www.example.com/", 130 | get_clean_url_split("http://www.example.com/").geturl()) 131 | 132 | def test_get_absolute_url(self): 133 | base_url_split = get_clean_url_split( 134 | "https://www.example.com/hello/index.html") 135 | self.assertEqual( 136 | "https://www.example2.com/test.js", 137 | get_absolute_url_split( 138 | "//www.example2.com/test.js", base_url_split).geturl()) 139 | self.assertEqual( 140 | "https://www.example.com/hello2/test.html", 141 | get_absolute_url_split( 142 | "/hello2/test.html", base_url_split).geturl()) 143 | self.assertEqual( 144 | "https://www.example.com/hello/test.html", 145 | get_absolute_url_split("test.html", base_url_split).geturl()) 146 | self.assertEqual( 147 | "https://www.example.com/test.html", 148 | get_absolute_url_split("../test.html", base_url_split).geturl()) 149 | 150 | 151 | class CrawlerTest(unittest.TestCase): 152 | 153 | @classmethod 154 | def setUpClass(cls): 155 | (cls.ip, cls.port, cls.httpd, cls.httpd_thread) = start_http_server() 156 | 157 | # FIXME replace by thread synchronization on start 158 | time.sleep(0.2) 159 | 160 | @classmethod 161 | def tearDownClass(cls): 162 | cls.httpd.shutdown() 163 | 164 | def setUp(self): 165 | # We must do this because Python 2.6 does not have setUpClass 166 | # This will only be executed if setUpClass is ignored. 167 | # It will not be shutdown properly though, but this does not prevent 168 | # the unit test to run properly 169 | if not hasattr(self, 'port'): 170 | (self.ip, self.port, self.httpd, self.httpd_thread) =\ 171 | start_http_server() 172 | # FIXME replace by thread synchronization on start 173 | time.sleep(0.2) 174 | self.argv = sys.argv 175 | 176 | # Need to override root logger level (reset by something) 177 | logger = logging.getLogger() 178 | logger.setLevel(logging.CRITICAL) 179 | 180 | def tearDown(self): 181 | sys.argv = self.argv 182 | 183 | def get_url(self, test_url): 184 | return "http://{0}:{1}{2}".format(self.ip, self.port, test_url) 185 | 186 | def get_page_crawler(self, url): 187 | url = self.get_url(url) 188 | url_split = get_clean_url_split(url) 189 | input_queue = compat.Queue.Queue() 190 | output_queue = compat.Queue.Queue() 191 | 192 | worker_config = WorkerConfig( 193 | username=None, password=None, types=['a', 'img', 'link', 'script'], 194 | timeout=5, parser=PARSER_STDLIB, 195 | strict_mode=False, prefer_server_encoding=False, 196 | extra_headers=[]) 197 | 198 | worker_init = WorkerInit( 199 | worker_config=worker_config, 200 | input_queue=input_queue, output_queue=output_queue, 201 | logger=get_logger()) 202 | 203 | page_crawler = PageCrawler(worker_init) 204 | 205 | return page_crawler, url_split 206 | 207 | def test_404(self): 208 | urlopen = get_url_open() 209 | import socket 210 | url = self.get_url("/does_not_exist.html") 211 | response = open_url( 212 | urlopen, get_url_request(), url, 5, socket.timeout) 213 | 214 | self.assertEqual(404, response.status) 215 | self.assertTrue(response.exception is not None) 216 | 217 | def test_200(self): 218 | urlopen = get_url_open() 219 | import socket 220 | url = self.get_url("/index.html") 221 | response = open_url(urlopen, get_url_request(), url, 5, socket.timeout) 
222 | 223 | self.assertEqual(200, response.status) 224 | self.assertTrue(response.exception is None) 225 | 226 | def test_301(self): 227 | urlopen = get_url_open() 228 | import socket 229 | url = self.get_url("/sub") 230 | response = open_url(urlopen, get_url_request(), url, 5, socket.timeout) 231 | 232 | self.assertEqual(200, response.status) 233 | self.assertTrue(response.is_redirect) 234 | 235 | def test_crawl_page(self): 236 | page_crawler, url_split = self.get_page_crawler("/index.html") 237 | page_crawl = page_crawler._crawl_page( 238 | WorkerInput(url_split, True, 0, url_split.netloc)) 239 | 240 | self.assertEqual(200, page_crawl.status) 241 | self.assertTrue(page_crawl.is_html) 242 | self.assertFalse(page_crawl.is_timeout) 243 | self.assertFalse(page_crawl.is_redirect) 244 | self.assertTrue(page_crawl.exception is None) 245 | 246 | a_links = [link for link in page_crawl.links if link.type == 'a'] 247 | img_links = [link for link in page_crawl.links if link.type == 'img'] 248 | script_links = [link for link in page_crawl.links 249 | if link.type == 'script'] 250 | link_links = [link for link in page_crawl.links if link.type == 'link'] 251 | 252 | self.assertEqual(5, len(a_links)) 253 | self.assertEqual(1, len(img_links)) 254 | self.assertEqual(1, len(script_links)) 255 | self.assertEqual(1, len(link_links)) 256 | 257 | def test_crawl_resource(self): 258 | page_crawler, url_split = self.get_page_crawler("/sub/small_image.gif") 259 | page_crawl = page_crawler._crawl_page( 260 | WorkerInput(url_split, True, 0, url_split.netloc)) 261 | 262 | self.assertEqual(200, page_crawl.status) 263 | self.assertFalse(page_crawl.links) 264 | self.assertFalse(page_crawl.is_html) 265 | self.assertFalse(page_crawl.is_timeout) 266 | self.assertFalse(page_crawl.is_redirect) 267 | self.assertTrue(page_crawl.exception is None) 268 | 269 | def test_base_url(self): 270 | page_crawler, url_split = self.get_page_crawler("/alone.html") 271 | page_crawl = page_crawler._crawl_page( 272 | WorkerInput(url_split, True, 0, url_split.netloc)) 273 | 274 | self.assertEqual(1, len(page_crawl.links)) 275 | self.assertEqual( 276 | 'http://www.example.com/test.html', 277 | page_crawl.links[0].url_split.geturl()) 278 | 279 | def test_crawl_404(self): 280 | page_crawler, url_split = self.get_page_crawler( 281 | "/sub/small_image_bad.gif") 282 | page_crawl = page_crawler._crawl_page( 283 | WorkerInput(url_split, True, 0, url_split.netloc)) 284 | 285 | self.assertEqual(404, page_crawl.status) 286 | self.assertFalse(page_crawl.links) 287 | self.assertFalse(page_crawl.is_html) 288 | self.assertFalse(page_crawl.is_timeout) 289 | self.assertFalse(page_crawl.is_redirect) 290 | 291 | def test_page_crawler(self): 292 | page_crawler, url_split = self.get_page_crawler("/index.html") 293 | input_queue = page_crawler.input_queue 294 | output_queue = page_crawler.output_queue 295 | 296 | input_queue.put(WorkerInput(url_split, True, 0, url_split.netloc)) 297 | input_queue.put(WORK_DONE) 298 | page_crawler.crawl_page_forever() 299 | 300 | page_crawl = output_queue.get() 301 | 302 | self.assertEqual(200, page_crawl.status) 303 | self.assertTrue(len(page_crawl.links) > 0) 304 | 305 | def _run_crawler_plain( 306 | self, crawler_class, other_options=None, url="/index.html"): 307 | url = self.get_url(url) 308 | sys.argv = ['pylinkvalidator', "-m", "process", url] 309 | if not other_options: 310 | other_options = [] 311 | sys.argv.extend(other_options) 312 | config = Config() 313 | config.parse_cli_config() 314 | 315 | crawler = 
crawler_class(config, get_logger()) 316 | crawler.crawl() 317 | 318 | if config.options.multi: 319 | crawler.site.collect_multi_sites() 320 | 321 | return crawler.site 322 | 323 | def test_site_thread_crawler_plain(self): 324 | site = self._run_crawler_plain(ThreadSiteCrawler) 325 | self.assertEqual(11, len(site.pages)) 326 | self.assertEqual(1, len(site.error_pages)) 327 | 328 | def test_site_process_crawler_plain(self): 329 | if not has_multiprocessing(): 330 | return 331 | site = self._run_crawler_plain(ProcessSiteCrawler) 332 | self.assertEqual(11, len(site.pages)) 333 | self.assertEqual(1, len(site.error_pages)) 334 | 335 | def test_run_once(self): 336 | site = self._run_crawler_plain(ThreadSiteCrawler, ["--run-once"]) 337 | 338 | # 8 pages linked on the index 339 | self.assertEqual(8, len(site.pages)) 340 | self.assertEqual(0, len(site.error_pages)) 341 | 342 | def test_multi_sites(self): 343 | site = self._run_crawler_plain(ThreadSiteCrawler, ["--multi"]) 344 | self.assertEqual(11, len(site.pages)) 345 | self.assertEqual(1, len(site.error_pages)) 346 | 347 | multi_pages_for_site = list(site.multi_pages.values())[0] 348 | multi_error_pages_for_site = list(site.multi_error_pages.values())[0] 349 | self.assertEqual(11, len(multi_pages_for_site)) 350 | self.assertEqual(1, len(multi_error_pages_for_site)) 351 | 352 | def test_content_check(self): 353 | site = self._run_crawler_plain( 354 | ThreadSiteCrawler, 355 | [ 356 | "--check-absence", "tata12345", 357 | "--check-absence", "BOOM", 358 | "--check-presence", "", 359 | ]) 360 | self.assertEqual(11, len(site.pages)) 361 | self.assertEqual(1, len(site.error_pages)) 362 | 363 | site = self._run_crawler_plain( 364 | ThreadSiteCrawler, 365 | [ 366 | "--check-presence-once", 367 | "/a.html,
Hello World", 368 | "--check-presence-once", 369 | "/robots.txt,regex:^Disallow:\s*$", 370 | ]) 371 | self.assertEqual(12, len(site.pages)) 372 | self.assertEqual(1, len(site.error_pages)) 373 | 374 | site = self._run_crawler_plain( 375 | ThreadSiteCrawler, 376 | ["--check-presence-once", 377 | "/a.html,regex:Hello"]) 378 | self.assertEqual(11, len(site.pages)) 379 | self.assertEqual(1, len(site.error_pages)) 380 | 381 | site = self._run_crawler_plain( 382 | ThreadSiteCrawler, 383 | ["--check-absence-once", 384 | "/a.html,regex:Hello"]) 385 | self.assertEqual(11, len(site.pages)) 386 | self.assertEqual(2, len(site.error_pages)) 387 | 388 | def test_url_file_path(self): 389 | (_, temp_file_path) = mkstemp() 390 | url = self.get_url("/index.html") 391 | url2 = self.get_url("/robots.txt") 392 | with open(temp_file_path, "w") as temp_file: 393 | temp_file.write(url + "\n") 394 | temp_file.write(url2 + "\n") 395 | 396 | sys.argv = [ 397 | "pylinkvalidator", "-m", "process", "--url-file-path", 398 | temp_file_path] 399 | config = Config() 400 | config.parse_cli_config() 401 | 402 | crawler = ThreadSiteCrawler(config, get_logger()) 403 | crawler.crawl() 404 | 405 | site = crawler.site 406 | self.assertEqual(12, len(site.pages)) 407 | self.assertEqual(1, len(site.error_pages)) 408 | os.unlink(temp_file_path) 409 | 410 | def test_depth_0(self): 411 | site = self._run_crawler_plain( 412 | ThreadSiteCrawler, ["--depth", "0"], "/depth/root.html") 413 | # 3 pages linked on the root (root, 0, 0b) 414 | self.assertEqual(3, len(site.pages)) 415 | self.assertEqual(0, len(site.error_pages)) 416 | 417 | site = self._run_crawler_plain( 418 | ThreadSiteCrawler, ["--run-once"], "/depth/root.html") 419 | # Same as depth = 0 420 | self.assertEqual(3, len(site.pages)) 421 | self.assertEqual(0, len(site.error_pages)) 422 | 423 | site = self._run_crawler_plain( 424 | ThreadSiteCrawler, ["--depth", "1"], "/depth/root.html") 425 | # 4 pages linked on the root (root, 0, 0b, 1) 426 | self.assertEqual(4, len(site.pages)) 427 | self.assertEqual(0, len(site.error_pages)) 428 | 429 | site = self._run_crawler_plain( 430 | ThreadSiteCrawler, ["--depth", "10"], "/depth/root.html") 431 | # All 7 reachable pages are crawled, including one broken link 432 | self.assertEqual(7, len(site.pages)) 433 | self.assertEqual(1, len(site.error_pages)) 434 | 435 | def test_strict_mode(self): 436 | site = self._run_crawler_plain(ThreadSiteCrawler, ["--strict"]) 437 | 438 | # The placehold.it link is interpreted as a relative url, 439 | # so there are 12 "good" urls and 1 bad. 440 | self.assertEqual(12, len(site.pages)) 441 | 442 | # Python 3 returns an error here because of a change in urllib. 443 | # In general, strict mode should be false (the default), 444 | # which avoids these differences between Python versions. 445 | self.assertTrue(len(site.error_pages) >= 1) 446 | 447 | def test_site_gevent_crawler_plain(self): 448 | if not has_gevent(): 449 | return 450 | # TODO test gevent.
Cannot use threaded simple http server :-( 451 | self.assertTrue(True) 452 | 453 | def test_api(self): 454 | url = self.get_url("/index.html") 455 | 456 | site = api.crawl(url) 457 | self.assertEqual(11, len(site.pages)) 458 | self.assertEqual(1, len(site.error_pages)) 459 | 460 | def test_api_with_options(self): 461 | url = self.get_url("/index.html") 462 | 463 | site = api.crawl_with_options([url], {"run-once": True, "workers": 2}) 464 | self.assertEqual(8, len(site.pages)) 465 | self.assertEqual(0, len(site.error_pages)) 466 | 467 | def test_api_with_options_2(self): 468 | site = self._run_crawler_plain( 469 | ThreadSiteCrawler, 470 | ["--prefer-server-encoding", "--header", "\"XKey: XValue\"", 471 | "--header", "\"XKey2: XValue2\"", "--run-once"], "/index.html") 472 | self.assertEqual(8, len(site.pages)) 473 | self.assertEqual(0, len(site.error_pages)) 474 | 475 | def test_unicode(self): 476 | site = self._run_crawler_plain( 477 | ThreadSiteCrawler, ["--prefer-server-encoding"], "/é.html") 478 | # 2 pages: é.html plus the single page it links to 479 | self.assertEqual(2, len(site.pages)) 480 | self.assertEqual(0, len(site.error_pages)) 481 | 482 | def test_bad_tel_link(self): 483 | site = self._run_crawler_plain( 484 | ThreadSiteCrawler, ["--ignore-bad-tel-urls"], "/badtel.html") 485 | # root + one page linked. bad tel link and tel link are ignored. 486 | self.assertEqual(2, len(site.pages)) 487 | self.assertEqual(0, len(site.error_pages)) 488 | 489 | if sys.version_info[:2] > (2, 6): 490 | site = self._run_crawler_plain( 491 | ThreadSiteCrawler, [], "/badtel.html") 492 | # root + one page + one bad tel link. One correct tel link ignored 493 | self.assertEqual(3, len(site.pages)) 494 | self.assertEqual(1, len(site.error_pages)) 495 | -------------------------------------------------------------------------------- /pylinkvalidator/urlutil.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Contains URL parsing, cleaning, and encoding utilities. 4 | """ 5 | from __future__ import unicode_literals, absolute_import 6 | 7 | import re 8 | 9 | from pylinkvalidator.compat import urlparse, quote 10 | 11 | 12 | SCHEME_HTTP = "http" 13 | SCHEME_HTTPS = "https" 14 | SUPPORTED_SCHEMES = (SCHEME_HTTP, SCHEME_HTTPS) 15 | 16 | 17 | NOT_LINK = [ 18 | 'data', 19 | '#', 20 | ] 21 | 22 | 23 | def is_link(url): 24 | """Return True if the url is not base64 data or a local ref (#)""" 25 | for prefix in NOT_LINK: 26 | if url.startswith(prefix): 27 | return False 28 | return True 29 | 30 | 31 | def get_clean_url_split(url): 32 | """Returns a clean SplitResult with a scheme and a valid path 33 | 34 | :param url: The url to clean 35 | :rtype: A urlparse.SplitResult 36 | """ 37 | if not url: 38 | raise ValueError('The URL must not be empty') 39 | split_result = urlparse.urlsplit(url) 40 | 41 | if not split_result.scheme: 42 | if split_result.netloc: 43 | url = SCHEME_HTTP + ":" + url 44 | else: 45 | url = SCHEME_HTTP + "://" + url 46 | split_result = urlparse.urlsplit(url) 47 | 48 | split_result = convert_iri_to_uri(split_result) 49 | 50 | return split_result 51 | 52 | 53 | def convert_iri_to_uri(url_split): 54 | """Attempts to convert a potential IRI to a URI. 55 | 56 | IRIs may contain non-ascii characters. 57 | """ 58 | new_parts = [] 59 | for i, part in enumerate(url_split): 60 | if i == 1: 61 | # domain name 62 | new_parts.append(part.encode('idna').decode('ascii')) 63 | else: 64 | # other parts such as path or query string.
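# Illustration (added note, not in the original source): a path such as "/é.html"
# contains the UTF-8 byte pair 0xC3 0xA9 for é, so url_encode_non_ascii below
# rewrites it as "/%C3%A9.html"; a non-ascii host name is instead converted by
# the IDNA branch above.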
65 | new_parts.append(url_encode_non_ascii(part)) 66 | return urlparse.SplitResult(*new_parts) 67 | 68 | 69 | def url_encode_non_ascii(url_part): 70 | """For each byte in url_part, if the byte is outside the ascii range, quote 71 | the byte. UTF-8 characters that take more than one byte are correctly 72 | converted by this technique. 73 | 74 | We do not quote the whole url part because it might contain already quoted 75 | characters, which would then be double-quoted. 76 | 77 | The url part is encoded to utf-8 and then decoded back to ascii, which 78 | might not always work if there is mixed or bad encoding. 79 | """ 80 | return re.sub( 81 | b'[\x80-\xFF]', 82 | lambda match: quote(match.group(0)).encode("utf-8"), 83 | url_part.encode("utf-8")).decode("ascii") 84 | 85 | 86 | def get_absolute_url_split(url, base_url_split): 87 | """Returns a SplitResult containing the new URL. 88 | 89 | :param url: The url (relative or absolute). 90 | :param base_url_split: The SplitResult of the base URL. 91 | :rtype: A SplitResult 92 | """ 93 | new_url = urlparse.urljoin(base_url_split.geturl(), url) 94 | 95 | return get_clean_url_split(new_url) 96 | 97 | 98 | def is_similar_url_split(url_split_1, url_split_2): 99 | """Returns True if the two url splits share 100 | the same path and netloc. 101 | 102 | Also returns True if one of the url splits does not have a netloc and both 103 | share the same path. 104 | """ 105 | if not url_split_1.netloc or not url_split_2.netloc: 106 | return url_split_1.path == url_split_2.path 107 | else: 108 | return url_split_1.path == url_split_2.path and\ 109 | url_split_1.netloc == url_split_2.netloc 110 | 111 | 112 | def is_bad_tel_url_split(url_split): 113 | """Returns True if the URL is using a badly formed tel scheme 114 | that is not detected by Python urlparse. 115 | """ 116 | return url_split.netloc.startswith("tel:") or\ 117 | url_split.path.startswith("/tel:") 118 | 119 | 120 | def is_supported_scheme(url_split, ignore_bad_tel_urls=False): 121 | """Returns True if the URL has a supported scheme and can be crawled. 122 | """ 123 | if url_split.scheme not in SUPPORTED_SCHEMES: 124 | return False 125 | elif ignore_bad_tel_urls and is_bad_tel_url_split(url_split): 126 | # issue #16 127 | return False 128 | return True 129 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.2.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | try: 4 | from setuptools import setup 5 | except ImportError: 6 | from distutils.core import setup 7 | 8 | import sys 9 | 10 | version = __import__('pylinkvalidator').__version__ 11 | 12 | if sys.version_info[0] >= 3: 13 | requires = ['beautifulsoup4>=4.2.0'] 14 | else: 15 | requires = [] 16 | 17 | setup( 18 | name='pylinkvalidator', 19 | version=version, 20 | description='Simple crawler that detects link errors such as 404 and 500.', 21 | long_description=''' 22 | pylinkvalidator is a simple crawler that traverses a web site and reports 23 | errors (e.g., 500 and 404 errors) encountered. The crawler can try to download 24 | resources like images.
25 | ''', 26 | author='Barthelemy Dagenais', 27 | author_email='barthelemy@infobart.com', 28 | license='BSD License', 29 | url='https://github.com/bartdag/pylinkvalidator', 30 | packages=['pylinkvalidator', 'pylinkvalidator.included', 31 | 'pylinkvalidator.included.bs4', 32 | 'pylinkvalidator.included.bs4.builder'], 33 | scripts=['pylinkvalidator/bin/pylinkvalidate.py'], 34 | classifiers=[ 35 | 'Environment :: Console', 36 | 'Intended Audience :: Developers', 37 | 'License :: OSI Approved :: BSD License', 38 | 'Operating System :: OS Independent', 39 | 'Programming Language :: Python', 40 | 'Programming Language :: Python :: 2', 41 | 'Programming Language :: Python :: 2.6', 42 | 'Programming Language :: Python :: 2.7', 43 | 'Programming Language :: Python :: 3', 44 | 'Programming Language :: Python :: 3.3', 45 | 'Programming Language :: Python :: 3.4', 46 | 'Programming Language :: Python :: 3.5', 47 | 'Programming Language :: Python :: 3.6', 48 | 'Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking', 49 | 'Topic :: Utilities', 50 | ], 51 | install_requires=requires, 52 | ) 53 | --------------------------------------------------------------------------------
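
A minimal sketch of driving pylinkvalidator programmatically, based only on the api calls exercised in tests.py above (api.crawl, api.crawl_with_options, and the pages and error_pages attributes of the returned site object). The example.com URL is a placeholder, and the option keys mirror the long CLI flags, as in test_api_with_options:

    from pylinkvalidator import api

    # Crawl the site with the default options, as in test_api.
    site = api.crawl("http://www.example.com/")
    print("pages crawled: {0}".format(len(site.pages)))
    print("pages in error: {0}".format(len(site.error_pages)))

    # Check only the start page and the resources it links to, with two
    # workers, as in test_api_with_options.
    site = api.crawl_with_options(
        ["http://www.example.com/"], {"run-once": True, "workers": 2})
    if site.error_pages:
        print("broken links found: {0}".format(len(site.error_pages)))

The run-once option corresponds to the --run-once flag used throughout the test suite: the crawler checks the links of the start page but does not follow them any further.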