├── .gitignore ├── .travis.yml ├── AUTHORS.txt ├── CHANGELOG.rst ├── LICENSE.txt ├── README.rst ├── pylinkvalidator ├── __init__.py ├── api.py ├── bin │ └── pylinkvalidate.py ├── compat.py ├── crawler.py ├── included │ ├── __init__.py │ └── bs4 │ │ ├── __init__.py │ │ ├── builder │ │ ├── __init__.py │ │ ├── _html5lib.py │ │ ├── _htmlparser.py │ │ └── _lxml.py │ │ ├── dammit.py │ │ ├── diagnose.py │ │ └── element.py ├── models.py ├── reporter.py ├── testfiles │ ├── a.html │ ├── alone.html │ ├── badtel.html │ ├── c.html │ ├── d.html │ ├── depth │ │ ├── 0.html │ │ ├── 0b.html │ │ ├── 1.html │ │ ├── 2.html │ │ ├── 3.html │ │ └── root.html │ ├── f.html │ ├── index.html │ ├── robots.txt │ ├── sub │ │ ├── b.html │ │ ├── e.html │ │ ├── small_image.gif │ │ ├── style.css │ │ └── test.js │ ├── à.html │ └── é.html ├── tests.py └── urlutil.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | pylinkvalidator.egg-info/ 3 | dist/ 4 | build/ 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | - "3.4" 6 | - "3.6" 7 | install: 8 | - "pip install ." 9 | script: nosetests 10 | sudo: false 11 | -------------------------------------------------------------------------------- /AUTHORS.txt: -------------------------------------------------------------------------------- 1 | Pylinkvalidator was originally created as part of pylinkchecker in 2013 by 2 | Barthelemy Dagenais while he was working at Xprima Inc. It has been forked on 3 | June 24th 2014 with the name pylinkvalidator. 4 | 5 | Here is an inevitably incomplete list of MUCH-APPRECIATED CONTRIBUTORS -- 6 | people who have submitted patches, reported bugs, and generally made 7 | pylinkvalidator that much better: 8 | 9 | Arun Elias 10 | Jim Priest 11 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | CHANGELOG 2 | ========= 3 | 4 | 0.3 (to be published) 5 | -------------------- 6 | 7 | - Added --ignore-bad-tel-urls option to ignore phone URLs that do not conform 8 | to the telephone number URI RFC 3966. 9 | - Added --allow-insecure-content option to crawl pages with HTTPS errors (e.g., 10 | self signed certificate). 11 | 12 | 0.2 (July 22th 2015) 13 | -------------------- 14 | 15 | - Added the --depth option to limit crawling to certain depths. 16 | 17 | 0.1 (June 24th 2014) 18 | -------------------- 19 | 20 | Initial fork of pylinkchecker 21 | 22 | - Changed pylinkchecker to pylinkvalidator 23 | - Changed pylinkcheck.py to pylinkvalidate 24 | - Updated license 25 | - PEP 8 compliance 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2015 Barthelemy Dagenais and individual contributors. All 2 | rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | - Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 
9 | 10 | - Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | - The name of the author may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | Pylinkvalidator is a fork of Pylinkchecker, which was licensed at the date of 31 | the fork, 24 June 2014, under these conditions: 32 | 33 | Copyright (c) 2013, Technologies Xprima Inc All rights reserved. 34 | 35 | Redistribution and use in source and binary forms, with or without 36 | modification, are permitted provided that the following conditions are met: 37 | 38 | - Redistributions of source code must retain the above copyright notice, this 39 | list of conditions and the following disclaimer. 40 | 41 | - Redistributions in binary form must reproduce the above copyright notice, 42 | this list of conditions and the following disclaimer in the documentation 43 | and/or other materials provided with the distribution. 44 | 45 | - The name of the author may not be used to endorse or promote products 46 | derived from this software without specific prior written permission. 47 | 48 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 49 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 52 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 53 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 54 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 55 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 56 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 57 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 58 | POSSIBILITY OF SUCH DAMAGE. 59 | 60 | 61 | Pylinkvalidator includes a copy of BeautifulSoup which is licensed under the 62 | MIT License. 63 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | pylinkvalidator 2 | =============== 3 | 4 | :Version: 0.3 5 | 6 | pylinkvalidator is a standalone and pure python link validator and crawler that 7 | traverses a web site and reports errors (e.g., 500 and 404 errors) encountered. 8 | The crawler can also download resources such as images, scripts and 9 | stylesheets. 
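For example, the following illustrative command (any starting URL can be substituted) limits crawling to links and images, skipping scripts and stylesheets; the ``--types`` option is described in the Usage section below: ``pylinkvalidate.py --types=a,img http://www.example.com/``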
10 | 11 | pylinkvalidator's performance can be improved by installing additional libraries 12 | that require a C compiler, but these libraries are optional. 13 | 14 | We created pylinkvalidator so that it could be executed in environments without 15 | access to a compiler (e.g., Microsoft Windows, some posix production 16 | environments) or with an old version of python (e.g., Centos). 17 | 18 | pylinkvalidator is highly modular and has many configuration options, but the 19 | only required parameter is the starting url: pylinkvalidate.py 20 | http://www.example.com/ 21 | 22 | pylinkvalidator can also be used programmatically by calling one of the functions 23 | in ``pylinkvalidator.api`` 24 | 25 | .. image:: https://api.travis-ci.org/bartdag/pylinkvalidator.png 26 | 27 | 28 | Quick Start 29 | ----------- 30 | 31 | Install pylinkvalidator with pip or easy_install: 32 | 33 | :: 34 | 35 | pip install pylinkvalidator 36 | 37 | 38 | Crawl all pages from a site and show progress: 39 | 40 | :: 41 | 42 | pylinkvalidate.py -P http://www.example.com/ 43 | 44 | 45 | Requirements 46 | ------------ 47 | 48 | pylinkvalidator does not require external libraries if executed with python 2.x. 49 | It requires beautifulsoup4 if executed with python 3.x. It has been tested on 50 | python 2.6, python 2.7, and python 3.6. 51 | 52 | For production use, it is strongly recommended to use lxml or html5lib because 53 | the default HTML parser provided by python is not very lenient. 54 | 55 | 56 | Optional Requirements 57 | --------------------- 58 | 59 | These libraries can be installed to enable certain modes in pylinkvalidator: 60 | 61 | lxml 62 | beautifulsoup can use lxml to speed up the parsing of HTML pages. Because 63 | lxml requires C libraries, this is only an optional requirement. 64 | 65 | html5lib 66 | beautifulsoup can use html5lib to process incorrect or strange markup. It is 67 | slower than lxml, but believed to be more lenient. 68 | 69 | gevent 70 | this non-blocking io library enables pylinkvalidator to use green threads 71 | instead of processes or threads. gevent could potentially speed up the 72 | crawling speed on web sites with many small pages. 73 | 74 | cchardet 75 | this library speeds up the detection of document encoding. 76 | 77 | 78 | Usage 79 | ----- 80 | 81 | This is a list of all available options. See the end of the README file for 82 | usage examples. 83 | 84 | :: 85 | 86 | Usage: pylinkvalidate.py [options] URL ... 87 | 88 | Options: 89 | --version Show program's version number and exit 90 | -h, --help Show this help message and exit 91 | -V VERBOSE, --verbose=VERBOSE 92 | Display debugging info 93 | None: --verbose=0 (default) 94 | Quiet: --verbose=1 95 | Info: --verbose=2 96 | 97 | Crawler Options: 98 | These options modify the way the crawler traverses the site. 
99 | 100 | -O, --test-outside Fetch resources from other domains without crawling 101 | them 102 | -H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS 103 | Comma-separated list of additional hosts to crawl 104 | (e.g., example.com,subdomain.another.com) 105 | -i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES 106 | Comma-separated list of host/path prefixes to ignore 107 | (e.g., www.example.com/ignore_this_and_after/) 108 | -b, --ignore-bad-tel-urls 109 | ignore badly formed tel URLs missing the leading + 110 | sign, e.g., tel:1234567890 - only necessary for Python 111 | > 2.6 112 | -u USERNAME, --username=USERNAME 113 | Username to use with basic HTTP authentication 114 | -p PASSWORD, --password=PASSWORD 115 | Password to use with basic HTTP authentication 116 | -M, --multi each argument is considered to be a different site 117 | -D HEADER, --header=HEADER 118 | custom header of the form Header: Value (repeat for 119 | multiple headers) 120 | --url-file-path=URL_FILE_PATH 121 | get starting URLs from a line-separated file 122 | -t TYPES, --types=TYPES 123 | Comma-separated values of tags to look for when 124 | crawling a site. Default (and supported types): 125 | a,img,link,script 126 | -T TIMEOUT, --timeout=TIMEOUT 127 | Seconds to wait before considering that a page timed 128 | out (default = 10) 129 | -C, --strict Does not strip href and src attributes from 130 | whitespaces 131 | -P, --progress Prints crawler progress in the console 132 | -N, --run-once Only crawl the first page (eq. to depth=0) 133 | -d DEPTH, --depth=DEPTH 134 | Maximum crawl depth (default = 1) 135 | -e, --prefer-server-encoding 136 | Prefer server encoding if specified. Else detect 137 | encoding 138 | --check-presence=CONTENT_PRESENCE 139 | Check presence of raw or HTML content on all pages. 140 | e.g., regex:content. Content 141 | can be either regex:pattern or plain content 142 | --check-absence=CONTENT_ABSENCE 143 | Check absence of raw or HTML content on all pages. 144 | e.g., regex:content. Content 145 | can be either regex:pattern or plain content 146 | --check-presence-once=CONTENT_PRESENCE_ONCE 147 | Check presence of raw or HTML content for one page: 148 | path,content, e.g.,: /path,regex:content. Content can be either 150 | regex:pattern or plain content. Path can be either 151 | relative or absolute with domain. 152 | --check-absence-once=CONTENT_ABSENCE_ONCE 153 | Check absence of raw or HTML content for one page: 154 | path,content, e.g.,path,regex:content. Content can be either 156 | regex:pattern or plain content. Path can be either 157 | relative or absolute with domain. 158 | -S, --show-source Show source of links (html) in the report. 159 | --allow-insecure-content 160 | Allow insecure content for HTTPS sites with 161 | certificate errors 162 | 163 | Performance Options: 164 | These options can impact the performance of the crawler. 165 | 166 | -w WORKERS, --workers=WORKERS 167 | Number of workers to spawn (default = 1) 168 | -m MODE, --mode=MODE 169 | Types of workers: thread (default), process, or green 170 | -R PARSER, --parser=PARSER 171 | Types of HTML parse: html.parser (default) or lxml 172 | 173 | Output Options: 174 | These options change the output of the crawler. 175 | 176 | -f FORMAT, --format=FORMAT 177 | Format of the report: plain (default) 178 | -o OUTPUT, --output=OUTPUT 179 | Path of the file where the report will be printed. 180 | -W WHEN, --when=WHEN 181 | When to print the report. 
error (only if a 182 | crawling error occurs) or always (default) 183 | -E REPORT_TYPE, --report-type=REPORT_TYPE 184 | Type of report to print: errors (default, summary and 185 | erroneous links), summary, all (summary and all links) 186 | -c, --console Prints report to the console in addition to other 187 | output options such as file or email. 188 | 189 | Email Options: 190 | These options allows the crawler to send a report by email. 191 | 192 | -a ADDRESS, --address=ADDRESS 193 | Comma-separated list of email addresses used to send a 194 | report 195 | --from=FROM_ADDRESS 196 | Email address to use in the from field of the email 197 | (optional) 198 | -s SMTP, --smtp=SMTP 199 | Host of the smtp server 200 | --port=PORT Port of the smtp server (optional) 201 | --tls Use TLS with the email server. 202 | --subject=SUBJECT Subject of the email (optional) 203 | --smtp-username=SMTP_USERNAME 204 | Username to use with the smtp server (optional) 205 | --smtp-password=SMTP_PASSWORD 206 | Password to use with the smtp server (optional) 207 | 208 | Usage Example 209 | ------------- 210 | 211 | Crawl a site and show progress 212 | ``pylinkvalidate.py --progress http://example.com/`` 213 | 214 | Crawl a site starting from 2 URLs 215 | ``pylinkvalidate.py http://example.com/ http://example2.com/`` 216 | 217 | Crawl a site (example.com) and all pages belonging to another host 218 | ``pylinkvalidate.py -H additionalhost.com http://example.com/`` 219 | 220 | Report status of all links (even successful ones) 221 | ``pylinkvalidate.py --report-type=all http://example.com/`` 222 | 223 | Report status of all links and show the HTML source of these links 224 | ``pylinkvalidate.py --report-type=all --show-source http://example.com/`` 225 | 226 | Only crawl starting URLs and access all linked resources 227 | ``pylinkvalidate.py --run-once http://example.com/`` 228 | 229 | Crawl two levels (one more than run-once) and access all linked resources 230 | ``pylinkvalidate.py --depth=1 http://example.com/`` 231 | 232 | Only access links (a href) and ignore images, stylesheets and scripts 233 | ``pylinkvalidate.py --types=a http://example.com/`` 234 | 235 | Crawl a site with 4 threads (default is one thread) 236 | ``pylinkvalidate.py --workers=4 http://example.com/`` 237 | 238 | Crawl a site with 4 processes (default is one thread) 239 | ``pylinkvalidate.py --mode=process --workers=4 http://example.com/`` 240 | 241 | Crawl a site and use LXML to parse HTML (faster, must be installed) 242 | ``pylinkvalidate.py --parser=LXML http://example.com/`` 243 | 244 | Print debugging info 245 | ``pylinkvalidate.py --verbose=2 http://example.com/`` 246 | 247 | Change User-Agent request header 248 | ``pylinkvalidate.py --header="User-Agent: Mozilla/5.0" http://example.com/`` 249 | 250 | Crawl multiple sites and report results per site 251 | ``pylinkvalidate.py --multi http://example.com/ http://www.example2.net/`` 252 | 253 | Check that all HTML pages have a body tag with a specific class: 254 | ``pylinkvalidate.py --check-presence '<body class="test">' http://example.com/`` 255 | 256 | Check that no HTML pages have a paragraph tag with a pattern: 257 | ``pylinkvalidate.py --check-absence '<p>regex:Hello\s+World</p>' http://example.com/``
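Check that a plain text snippet (an arbitrary example string is used here) is present on every page:
  ``pylinkvalidate.py --check-presence 'Copyright 2015' http://example.com/``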
258 | 259 | Check that robots.txt has an empty Disallow rule: 260 | ``pylinkvalidate.py --check-presence-once '/robots.txt,regex:^Disallow:\s*$' http://example.com/`` 261 | 262 | Allow insecure content for HTTPS sites with certificate errors [SSL: CERTIFICATE_VERIFY_FAILED] 263 | ``pylinkvalidate.py --allow-insecure-content https://self-signed.example.com/`` 264 | 265 | 266 | API Usage 267 | --------- 268 | 269 | To crawl a site from a single URL: 270 | 271 | .. code-block:: python 272 | 273 | from pylinkvalidator.api import crawl 274 | crawled_site = crawl("http://www.example.com/") 275 | number_of_crawled_pages = len(crawled_site.pages) 276 | number_of_errors = len(crawled_site.error_pages) 277 | 278 | 279 | To crawl a site and pass some configuration options (the same supported by the 280 | command line interface): 281 | 282 | 283 | .. code-block:: python 284 | 285 | from pylinkvalidator.api import crawl_with_options 286 | crawled_site = crawl_with_options(["http://www.example.com/"], {"run-once": 287 | True, "workers": 10}) 288 | number_of_crawled_pages = len(crawled_site.pages) 289 | number_of_errors = len(crawled_site.error_pages) 290 | 291 | 292 | FAQ and Troubleshooting 293 | ----------------------- 294 | 295 | I cannot find pylinkvalidate.py on Windows with virtualenv 296 | This is a known problem with virtualenv on Windows. The interpreter is 297 | different than the one used by the virtualenv. Prefix pylinkvalidate.py with the 298 | full path: ``python c:\myvirtualenv\Scripts\pylinkvalidate.py`` 299 | 300 | I see Exception KeyError ... module 'threading' when using --mode=green 301 | This output is generally harmless and is generated by gevent patching the 302 | python thread module. If someone knows how to make it go away, patches are 303 | more than welcome :-) 304 | 305 | 306 | License 307 | ------- 308 | 309 | This software is licensed under the `New BSD License`. See the `LICENSE` file 310 | for the full license text. It includes the beautifulsoup library, which 311 | is licensed under the MIT license. 312 | -------------------------------------------------------------------------------- /pylinkvalidator/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Main pylinkvalidator package 4 | """ 5 | 6 | __version__ = "0.3" 7 | -------------------------------------------------------------------------------- /pylinkvalidator/api.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Contains a simple crawling API to use pylinkvalidator programmatically. 4 | 5 | We will do everything to keep functions in this module backward compatible 6 | across versions. 7 | """ 8 | from __future__ import unicode_literals, absolute_import 9 | 10 | from pylinkvalidator.crawler import configure_logger, execute_from_config 11 | from pylinkvalidator.models import Config 12 | 13 | 14 | def crawl(url): 15 | """Crawls a URL and returns a pylinkvalidator.crawler.Site instance. 16 | 17 | :rtype: A pylinkvalidator.crawler.Site instance 18 | """ 19 | config = Config() 20 | config.parse_api_config([url]) 21 | logger = configure_logger(config) 22 | crawler = execute_from_config(config, logger) 23 | 24 | return crawler.site 25 | 26 | 27 | def crawl_with_options(urls, options_dict=None, logger_builder=None): 28 | """Crawls URLs with provided options and logger.
29 | 30 | :param options_dict: Must contain the long name of the command line 31 | options. (optional) 32 | 33 | :param logger_builder: Function that will be called to instantiate a 34 | logger. (optional) 35 | 36 | :rtype: A pylinkvalidator.crawler.Site instance 37 | """ 38 | 39 | config = Config() 40 | 41 | config.parse_api_config(urls, options_dict) 42 | 43 | if not logger_builder: 44 | logger = configure_logger(config) 45 | else: 46 | logger = logger_builder() 47 | 48 | # TODO In the future, we will pass the logger builder and not the logger 49 | # to enable the ProcessSiteCrawler to instantiate its own custom logger. 50 | crawler = execute_from_config(config, logger) 51 | 52 | return crawler.site 53 | -------------------------------------------------------------------------------- /pylinkvalidator/bin/pylinkvalidate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pylinkvalidator import crawler 4 | 5 | if __name__ == "__main__": 6 | crawler.execute_from_command_line() 7 | -------------------------------------------------------------------------------- /pylinkvalidator/compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # flake8: noqa 3 | """ 4 | Contains the compatibility layer for python 2 & 3 5 | """ 6 | from __future__ import unicode_literals, absolute_import 7 | 8 | import sys 9 | 10 | if sys.version_info[0] < 3: 11 | range = xrange 12 | import urlparse 13 | from urllib import quote 14 | import SimpleHTTPServer 15 | import SocketServer 16 | from urllib2 import HTTPError 17 | import Queue 18 | unicode = unicode 19 | 20 | def get_content_type(m): 21 | return m.gettype() 22 | 23 | def get_charset(m): 24 | return m.getparam("charset") 25 | 26 | def get_safe_str(s): 27 | return s.encode("utf-8") 28 | 29 | from StringIO import StringIO 30 | else: 31 | range = range 32 | import urllib.parse as urlparse 33 | from urllib.parse import quote 34 | import http.server as SimpleHTTPServer 35 | import socketserver as SocketServer 36 | from urllib.error import HTTPError 37 | import queue as Queue 38 | unicode = str 39 | 40 | def get_content_type(m): 41 | return m.get_content_type() 42 | 43 | def get_charset(m): 44 | return m.get_content_charset() 45 | 46 | def get_safe_str(s): 47 | return s 48 | from io import StringIO 49 | 50 | try: 51 | from logging import NullHandler 52 | except ImportError: 53 | from logging import Handler 54 | 55 | class NullHandler(Handler): 56 | def emit(self, record): 57 | pass 58 | 59 | def handle(self, record): 60 | pass 61 | 62 | def createLock(self): 63 | return None 64 | 65 | 66 | def get_url_open(): 67 | # Not automatically imported to allow monkey patching. 68 | if sys.version_info[0] < 3: 69 | from urllib2 import urlopen 70 | else: 71 | from urllib.request import urlopen 72 | return urlopen 73 | 74 | 75 | def get_url_request(): 76 | if sys.version_info[0] < 3: 77 | from urllib2 import Request 78 | else: 79 | from urllib.request import Request 80 | return Request 81 | -------------------------------------------------------------------------------- /pylinkvalidator/crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Contains the crawling logic. 
4 | """ 5 | from __future__ import unicode_literals, absolute_import 6 | 7 | import base64 8 | from collections import defaultdict 9 | import logging 10 | import sys 11 | import time 12 | 13 | from pylinkvalidator.included.bs4 import BeautifulSoup, UnicodeDammit 14 | 15 | import pylinkvalidator.compat as compat 16 | from pylinkvalidator.compat import ( 17 | range, HTTPError, get_url_open, unicode, 18 | get_content_type, get_url_request, get_charset) 19 | from pylinkvalidator.models import ( 20 | Config, WorkerInit, Response, PageCrawl, 21 | ExceptionStr, Link, SitePage, WorkerInput, TYPE_ATTRIBUTES, HTML_MIME_TYPE, 22 | MODE_THREAD, MODE_PROCESS, MODE_GREEN, WHEN_ALWAYS, UTF8Class, 23 | PageStatus, PageSource, PAGE_QUEUED, PAGE_CRAWLED, VERBOSE_QUIET, 24 | VERBOSE_NORMAL, LazyLogParam, PREFIX_ALL) 25 | from pylinkvalidator.reporter import report 26 | from pylinkvalidator.urlutil import ( 27 | get_clean_url_split, get_absolute_url_split, 28 | is_link, is_similar_url_split, is_supported_scheme) 29 | 30 | 31 | WORK_DONE = '__WORK_DONE__' 32 | 33 | 34 | def get_logger(propagate=False): 35 | """Returns a logger.""" 36 | root_logger = logging.getLogger() 37 | 38 | logger = logging.getLogger(__name__) 39 | 40 | handler = logging.StreamHandler() 41 | 42 | formatter = logging.Formatter( 43 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 44 | handler.setFormatter(formatter) 45 | 46 | if root_logger.level != logging.CRITICAL: 47 | logger.addHandler(handler) 48 | logger.propagate = propagate 49 | else: 50 | logger.addHandler(compat.NullHandler()) 51 | 52 | return logger 53 | 54 | 55 | class SiteCrawler(object): 56 | """Main crawler/orchestrator""" 57 | 58 | def __init__(self, config, logger): 59 | self.config = config 60 | self.start_url_splits = list(config.start_url_splits) 61 | self.workers = [] 62 | self.input_queue = self.build_queue(config) 63 | self.output_queue = self.build_queue(config) 64 | self.logger = logger 65 | self.site = Site(self.start_url_splits, config, self.logger) 66 | 67 | def build_logger(self): 68 | return self.logger 69 | 70 | def crawl(self): 71 | worker_init = WorkerInit( 72 | self.config.worker_config, self.input_queue, 73 | self.output_queue, self.build_logger()) 74 | self.workers = self.get_workers(self.config, worker_init) 75 | 76 | queue_size = len(self.start_url_splits) 77 | for start_url_split in self.start_url_splits: 78 | self.input_queue.put( 79 | WorkerInput( 80 | start_url_split, True, 0, start_url_split.netloc, 81 | self.config.content_check), 82 | False) 83 | 84 | self.start_workers(self.workers, self.input_queue, self.output_queue) 85 | 86 | self.start_progress() 87 | 88 | while True: 89 | page_crawl = self.output_queue.get() 90 | queue_size -= 1 91 | new_worker_inputs = self.process_page_crawl(page_crawl) 92 | 93 | # We only process new pages if we did not exceed configured depth 94 | for worker_input in new_worker_inputs: 95 | queue_size += 1 96 | self.input_queue.put(worker_input, False) 97 | 98 | self.progress(page_crawl, len(self.site.pages), queue_size) 99 | 100 | if queue_size <= 0: 101 | self.stop_workers(self.workers, self.input_queue, 102 | self.output_queue) 103 | self.stop_progress() 104 | return self.site 105 | 106 | def start_progress(self): 107 | if self.config.options.progress: 108 | print("Starting crawl...") 109 | 110 | def stop_progress(self): 111 | if self.config.options.progress: 112 | print("Crawling Done...\n") 113 | 114 | def progress(self, page_crawl, done_size, queue_size): 115 | if not self.config.options.progress: 116 | 
return 117 | 118 | total = done_size + queue_size 119 | percent = float(done_size) / float(total) * 100.0 120 | 121 | url = "" 122 | if page_crawl.final_url_split: 123 | url = page_crawl.final_url_split.geturl() 124 | elif page_crawl.original_url_split: 125 | url = page_crawl.original_url_split.geturl() 126 | 127 | status = page_crawl.status 128 | if not status: 129 | status = "error" 130 | 131 | print("{0} - {1} ({2} of {3} - {4:.0f}%)".format( 132 | status, url, done_size, total, percent)) 133 | 134 | def build_queue(self, config): 135 | """Returns an object implementing the Queue interface.""" 136 | raise NotImplementedError() 137 | 138 | def get_workers(self, config, worker_init): 139 | """Returns a sequence of workers of the desired type.""" 140 | raise NotImplementedError() 141 | 142 | def start_workers(self, workers, input_queue, output_queue): 143 | """Start the workers.""" 144 | raise NotImplementedError() 145 | 146 | def stop_workers(self, workers, input_queue, output_queue): 147 | """Stops the workers.""" 148 | for worker in workers: 149 | input_queue.put(WORK_DONE) 150 | 151 | def process_page_crawl(self, page_crawl): 152 | """Returns a sequence of SplitResult to crawl.""" 153 | return self.site.add_crawled_page(page_crawl) 154 | 155 | 156 | class ThreadSiteCrawler(SiteCrawler): 157 | """Site Crawler with thread workers.""" 158 | 159 | def build_queue(self, config): 160 | return compat.Queue.Queue() 161 | 162 | def get_workers(self, config, worker_init): 163 | from threading import Thread 164 | workers = [] 165 | for _ in range(config.worker_size): 166 | workers.append( 167 | Thread(target=crawl_page, kwargs={'worker_init': worker_init})) 168 | 169 | return workers 170 | 171 | def start_workers(self, workers, input_queue, output_queue): 172 | for worker in workers: 173 | worker.start() 174 | 175 | 176 | class ProcessSiteCrawler(SiteCrawler): 177 | """Site Crawler with process workers.""" 178 | 179 | def __init__(self, *args, **kwargs): 180 | import multiprocessing 181 | self.manager = multiprocessing.Manager() 182 | self.ProcessClass = multiprocessing.Process 183 | super(ProcessSiteCrawler, self).__init__(*args, **kwargs) 184 | 185 | def build_logger(self): 186 | """We do not want to share a logger.""" 187 | return None 188 | 189 | def build_queue(self, config): 190 | return self.manager.Queue() 191 | 192 | def get_workers(self, config, worker_init): 193 | workers = [] 194 | for _ in range(config.worker_size): 195 | workers.append(self.ProcessClass( 196 | target=crawl_page, kwargs={'worker_init': worker_init})) 197 | 198 | return workers 199 | 200 | def start_workers(self, workers, input_queue, output_queue): 201 | for worker in workers: 202 | worker.start() 203 | 204 | 205 | class GreenSiteCrawler(SiteCrawler): 206 | """Site Crawler with green thread workers.""" 207 | 208 | def __init__(self, *args, **kwargs): 209 | from gevent import monkey, queue, Greenlet 210 | # TODO thread=false should be used to remove useless exception 211 | # But weird behavior sometimes happen when it is not patched... 
212 | monkey.patch_all() 213 | self.QueueClass = queue.Queue 214 | self.GreenClass = Greenlet 215 | super(GreenSiteCrawler, self).__init__(*args, **kwargs) 216 | 217 | def build_queue(self, config): 218 | return self.QueueClass() 219 | 220 | def get_workers(self, config, worker_init): 221 | workers = [] 222 | for _ in range(config.worker_size): 223 | workers.append(self.GreenClass( 224 | crawl_page, worker_init=worker_init)) 225 | 226 | return workers 227 | 228 | def start_workers(self, workers, input_queue, output_queue): 229 | for worker in workers: 230 | worker.start() 231 | 232 | 233 | class PageCrawler(object): 234 | """Worker that parses a page and extracts links""" 235 | 236 | def __init__(self, worker_init): 237 | self.worker_config = worker_init.worker_config 238 | self.input_queue = worker_init.input_queue 239 | self.output_queue = worker_init.output_queue 240 | self.urlopen = get_url_open() 241 | self.request_class = get_url_request() 242 | self.logger = worker_init.logger 243 | if not self.logger: 244 | # Get a new one! 245 | self.logger = get_logger() 246 | 247 | # We do this here to allow patching by gevent 248 | import socket 249 | self.timeout_exception = socket.timeout 250 | 251 | self.auth_header = None 252 | 253 | if self.worker_config.username and self.worker_config.password: 254 | base64string = unicode( 255 | base64.encodestring( 256 | '{0}:{1}'.format( 257 | self.worker_config.username, 258 | self.worker_config.password) 259 | .encode("utf-8")), "utf-8") 260 | self.auth_header = ("Authorization", 261 | "Basic {0}".format(base64string)) 262 | 263 | def crawl_page_forever(self): 264 | """Starts page crawling loop for this worker.""" 265 | 266 | while True: 267 | worker_input = self.input_queue.get() 268 | 269 | if worker_input == WORK_DONE: 270 | # No more work! Pfew! 271 | return 272 | else: 273 | page_crawl = self._crawl_page(worker_input) 274 | self.output_queue.put(page_crawl) 275 | 276 | def _crawl_page(self, worker_input): 277 | page_crawl = None 278 | erroneous_content = [] 279 | missing_content = [] 280 | url_split_to_crawl = worker_input.url_split 281 | 282 | try: 283 | response = open_url( 284 | self.urlopen, self.request_class, 285 | url_split_to_crawl.geturl(), self.worker_config.timeout, 286 | self.timeout_exception, self.auth_header, 287 | extra_headers=self.worker_config.extra_headers, 288 | logger=self.logger) 289 | 290 | if response.exception: 291 | if response.status: 292 | # This is a http error. Good. 293 | page_crawl = PageCrawl( 294 | original_url_split=url_split_to_crawl, 295 | final_url_split=None, status=response.status, 296 | is_timeout=False, is_redirect=False, links=[], 297 | exception=None, is_html=False, 298 | depth=worker_input.depth, 299 | response_time=response.response_time, 300 | process_time=None, 301 | site_origin=worker_input.site_origin) 302 | elif response.is_timeout: 303 | # This is a timeout. 
No need to wrap the exception 304 | page_crawl = PageCrawl( 305 | original_url_split=url_split_to_crawl, 306 | final_url_split=None, status=None, 307 | is_timeout=True, is_redirect=False, links=[], 308 | exception=None, is_html=False, 309 | depth=worker_input.depth, 310 | response_time=response.response_time, 311 | process_time=0, 312 | site_origin=worker_input.site_origin) 313 | else: 314 | # Something bad happened when opening the url 315 | exception = ExceptionStr( 316 | unicode(type(response.exception)), 317 | unicode(response.exception)) 318 | page_crawl = PageCrawl( 319 | original_url_split=url_split_to_crawl, 320 | final_url_split=None, status=None, 321 | is_timeout=False, is_redirect=False, links=[], 322 | exception=exception, is_html=False, 323 | depth=worker_input.depth, 324 | response_time=response.response_time, 325 | process_time=0, 326 | site_origin=worker_input.site_origin) 327 | else: 328 | final_url_split = get_clean_url_split(response.final_url) 329 | 330 | message = response.content.info() 331 | mime_type = get_content_type(message) 332 | if self.worker_config.prefer_server_encoding: 333 | charset = get_charset(message) 334 | else: 335 | charset = None 336 | links = [] 337 | 338 | is_html = mime_type == HTML_MIME_TYPE 339 | process_time = None 340 | 341 | if is_html and worker_input.should_crawl: 342 | start = time.time() 343 | html_soup = BeautifulSoup( 344 | response.content, self.worker_config.parser, 345 | from_encoding=charset) 346 | links = self.get_links(html_soup, final_url_split) 347 | if self._has_content_to_check(worker_input): 348 | (missing_content, erroneous_content) =\ 349 | self.check_content( 350 | unicode(html_soup), html_soup, 351 | url_split_to_crawl, 352 | final_url_split, worker_input.content_check) 353 | process_time = time.time() - start 354 | else: 355 | self.logger.debug( 356 | "Won't crawl %s. MIME Type: %s. 
Should crawl: %s", 357 | final_url_split, mime_type, 358 | worker_input.should_crawl) 359 | if self._has_content_to_check(worker_input): 360 | text_content = self.get_text_content( 361 | response.content.read(), charset) 362 | (missing_content, erroneous_content) =\ 363 | self.check_content( 364 | text_content, None, url_split_to_crawl, 365 | final_url_split, worker_input.content_check) 366 | 367 | page_crawl = PageCrawl( 368 | original_url_split=url_split_to_crawl, 369 | final_url_split=final_url_split, status=response.status, 370 | is_timeout=False, is_redirect=response.is_redirect, 371 | links=links, exception=None, is_html=is_html, 372 | depth=worker_input.depth, 373 | response_time=response.response_time, 374 | process_time=process_time, 375 | site_origin=worker_input.site_origin, 376 | missing_content=missing_content, 377 | erroneous_content=erroneous_content) 378 | except Exception as exc: 379 | exception = ExceptionStr(unicode(type(exc)), unicode(exc)) 380 | page_crawl = PageCrawl( 381 | original_url_split=url_split_to_crawl, 382 | final_url_split=None, status=None, 383 | is_timeout=False, is_redirect=False, links=[], 384 | exception=exception, is_html=False, 385 | depth=worker_input.depth, 386 | response_time=None, 387 | process_time=None, 388 | site_origin=worker_input.site_origin) 389 | self.logger.exception("Exception occurred while crawling a page.") 390 | 391 | return page_crawl 392 | 393 | def _has_content_to_check(self, worker_input): 394 | return worker_input.content_check and\ 395 | worker_input.content_check.has_something_to_check 396 | 397 | def get_text_content(self, binary_blob, charset): 398 | """Retrieves unicode content from response binary blob. 399 | """ 400 | override_encodings = [] 401 | if charset: 402 | override_encodings.append(charset) 403 | 404 | return UnicodeDammit(binary_blob, override_encodings).unicode_markup 405 | 406 | def check_content( 407 | self, response_content, html_soup, original_url_split, 408 | final_url_split, content_check): 409 | """Ensures that the specified content is present (or absent). 410 | """ 411 | missing_content = [] 412 | erroneous_content = [] 413 | 414 | if html_soup: 415 | for content, found in self.check_html_content_single( 416 | content_check.html_presence, html_soup, original_url_split, 417 | final_url_split): 418 | if not found: 419 | missing_content.append(content) 420 | 421 | if html_soup: 422 | for content, found in self.check_html_content_single( 423 | content_check.html_absence, html_soup, original_url_split, 424 | final_url_split): 425 | if found: 426 | erroneous_content.append(content) 427 | 428 | for content, found in self.check_text_content_single( 429 | content_check.text_presence, response_content, 430 | original_url_split, final_url_split): 431 | if not found: 432 | missing_content.append(content) 433 | 434 | for content, found in self.check_text_content_single( 435 | content_check.text_absence, response_content, 436 | original_url_split, final_url_split): 437 | if found: 438 | erroneous_content.append(content) 439 | 440 | return (missing_content, erroneous_content) 441 | 442 | def check_html_content_single( 443 | self, html_to_check, html_soup, original_url_split, 444 | final_url_split): 445 | """Returns a list of tuple (content, presence) indicating whether an 446 | html tag was present or not in the source. 
447 | """ 448 | content = [] 449 | 450 | for key, html_check_list in html_to_check.items(): 451 | if key == PREFIX_ALL or\ 452 | is_similar_url_split(key, original_url_split) or\ 453 | is_similar_url_split(key, final_url_split): 454 | # we check 455 | for html_check in html_check_list: 456 | kwargs = {} 457 | if html_check.attrs: 458 | kwargs["attrs"] = html_check.attrs 459 | if html_check.content: 460 | # XXX Use text because the included bs4 does not use 461 | # the new string parameter and text is backward 462 | # compatible. 463 | kwargs["text"] = html_check.content 464 | found = html_soup.find( 465 | html_check.tag, **kwargs) is not None 466 | content.append((str(html_check), found)) 467 | 468 | return content 469 | 470 | def check_text_content_single( 471 | self, text_content_to_check, full_text, original_url_split, 472 | final_url_split): 473 | """Returns a list of tuple (content, presence) indicating whether an 474 | html tag was present or not in the source. 475 | """ 476 | content = [] 477 | 478 | for key, text_check_list in text_content_to_check.items(): 479 | if key == PREFIX_ALL or\ 480 | is_similar_url_split(key, original_url_split) or\ 481 | is_similar_url_split(key, final_url_split): 482 | # we check 483 | for text_check in text_check_list: 484 | try: 485 | match = text_check.search(full_text) 486 | content.append((text_check.pattern, match is not None)) 487 | except AttributeError: 488 | found = text_check in full_text 489 | content.append((text_check, found)) 490 | 491 | return content 492 | 493 | def get_links(self, html_soup, original_url_split): 494 | """Gets links for desired types (e.g., a, link, img, script) 495 | 496 | :param html_soup: The page parsed by BeautifulSoup 497 | :param original_url_split: The URL of the page used to resolve relative 498 | links. 499 | :rtype: A sequence of Link objects 500 | """ 501 | 502 | # This is a weird html tag that defines the base URL of a page. 503 | base_url_split = original_url_split 504 | 505 | bases = html_soup.find_all('base') 506 | if bases: 507 | base = bases[0] 508 | if 'href' in base.attrs: 509 | base_url_split = get_clean_url_split(base['href']) 510 | 511 | links = [] 512 | for element_type in self.worker_config.types: 513 | if element_type not in TYPE_ATTRIBUTES: 514 | raise Exception( 515 | "Unknown element type: {0}".format(element_type)) 516 | attribute = TYPE_ATTRIBUTES[element_type] 517 | element_links = html_soup.find_all(element_type) 518 | links.extend(self._get_links( 519 | element_links, attribute, base_url_split, original_url_split)) 520 | return links 521 | 522 | def _get_links(self, elements, attribute, base_url_split, 523 | original_url_split): 524 | links = [] 525 | for element in elements: 526 | if attribute in element.attrs: 527 | url = element[attribute] 528 | 529 | if not self.worker_config.strict_mode: 530 | url = url.strip() 531 | 532 | if not is_link(url): 533 | continue 534 | abs_url_split = get_absolute_url_split(url, base_url_split) 535 | 536 | if not is_supported_scheme( 537 | abs_url_split, self.worker_config.ignore_bad_tel_urls): 538 | continue 539 | 540 | link = Link( 541 | type=unicode(element.name), url_split=abs_url_split, 542 | original_url_split=original_url_split, 543 | source_str=unicode(element)) 544 | links.append(link) 545 | 546 | return links 547 | 548 | 549 | class Site(UTF8Class): 550 | """Contains all the visited and visiting pages of a site. 551 | 552 | This class is NOT thread-safe and should only be accessed by one thread at 553 | a time! 
554 | """ 555 | 556 | def __init__(self, start_url_splits, config, logger=None): 557 | self.start_url_splits = start_url_splits 558 | 559 | self.pages = {} 560 | """Map of url:SitePage""" 561 | 562 | self.multi_pages = defaultdict(dict) 563 | """Map of netloc:map(url:SitePage). Only used in multi sites mode.""" 564 | 565 | self.error_pages = {} 566 | """Map of url:SitePage with is_ok=False""" 567 | 568 | self.multi_error_pages = defaultdict(dict) 569 | """Map of netloc:map(url:SitePage). Only used in multi sites 570 | mode.""" 571 | 572 | self.page_statuses = {} 573 | """Map of url:PageStatus (PAGE_QUEUED, PAGE_CRAWLED)""" 574 | 575 | self.config = config 576 | 577 | self.logger = logger 578 | 579 | for start_url_split in self.start_url_splits: 580 | self.page_statuses[start_url_split] = PageStatus(PAGE_QUEUED, []) 581 | 582 | def collect_multi_sites(self): 583 | """Collects page results and maps them to their respective domain in 584 | multi_pages and multi_error_pages. 585 | """ 586 | for url, page in self.pages.items(): 587 | self.multi_pages[page.site_origin][url] = page 588 | 589 | for url, page in self.error_pages.items(): 590 | self.multi_error_pages[page.site_origin][url] = page 591 | 592 | @property 593 | def is_ok(self): 594 | """Returns True if there is no error page.""" 595 | return len(self.error_pages) == 0 596 | 597 | def add_crawled_page(self, page_crawl): 598 | """Adds a crawled page. Returns a list of url split to crawl""" 599 | if page_crawl.original_url_split not in self.page_statuses: 600 | self.logger.warning("Original URL not seen before!") 601 | return [] 602 | 603 | status = self.page_statuses[page_crawl.original_url_split] 604 | 605 | # Mark it as crawled 606 | self.page_statuses[page_crawl.original_url_split] = PageStatus( 607 | PAGE_CRAWLED, None) 608 | 609 | if page_crawl.original_url_split in self.pages: 610 | self.logger.warning( 611 | "Original URL already crawled! Concurrency issue!") 612 | return [] 613 | 614 | final_url_split = page_crawl.final_url_split 615 | if not final_url_split: 616 | # Happens on 404/500/timeout/error 617 | final_url_split = page_crawl.original_url_split 618 | 619 | if final_url_split in self.pages: 620 | # This means that we already processed this final page. 621 | # It's a redirect. Just add a source 622 | site_page = self.pages[final_url_split] 623 | site_page.add_sources(status.sources) 624 | else: 625 | # We never crawled this page before 626 | is_local = self.config.is_local(final_url_split) 627 | site_page = SitePage( 628 | final_url_split, page_crawl.status, 629 | page_crawl.is_timeout, page_crawl.exception, 630 | page_crawl.is_html, is_local, 631 | response_time=page_crawl.response_time, 632 | process_time=page_crawl.process_time, 633 | site_origin=page_crawl.site_origin, 634 | missing_content=page_crawl.missing_content, 635 | erroneous_content=page_crawl.erroneous_content) 636 | site_page.add_sources(status.sources) 637 | self.pages[final_url_split] = site_page 638 | 639 | if not site_page.is_ok: 640 | self.error_pages[final_url_split] = site_page 641 | 642 | return self.process_links(page_crawl) 643 | 644 | def process_links(self, page_crawl): 645 | links_to_process = [] 646 | 647 | source_url_split = page_crawl.original_url_split 648 | if page_crawl.final_url_split: 649 | source_url_split = page_crawl.final_url_split 650 | 651 | for link in page_crawl.links: 652 | url_split = link.url_split 653 | if not self.config.should_download(url_split): 654 | self.logger.debug( 655 | "Won't download %s. Is local? 
%s", 656 | url_split, 657 | LazyLogParam(lambda: self.config.is_local(url_split))) 658 | continue 659 | 660 | page_status = self.page_statuses.get(url_split, None) 661 | page_source = PageSource(source_url_split, link.source_str) 662 | 663 | if not page_status: 664 | # We never encountered this url before 665 | self.page_statuses[url_split] = PageStatus( 666 | PAGE_QUEUED, [page_source]) 667 | should_crawl = self.config.should_crawl( 668 | url_split, page_crawl.depth) 669 | links_to_process.append(WorkerInput( 670 | url_split, should_crawl, page_crawl.depth + 1, 671 | page_crawl.site_origin, self.config.content_check)) 672 | elif page_status.status == PAGE_CRAWLED: 673 | # Already crawled. Add source 674 | if url_split in self.pages: 675 | self.pages[url_split].add_sources([page_source]) 676 | else: 677 | # TODO the final url is different. need a way to link it... 678 | pass 679 | elif page_status.status == PAGE_QUEUED: 680 | # Already queued for crawling. Add source. 681 | page_status.sources.append(page_source) 682 | 683 | return links_to_process 684 | 685 | def get_average_response_time(self): 686 | """Computes the average response time of pages that returned an HTTP 687 | code (good or bad). Exceptions such as timeout are ignored. 688 | """ 689 | response_time_sum = 0 690 | total = 0 691 | for page in self.pages.values(): 692 | if page.response_time is not None: 693 | response_time_sum += page.response_time 694 | total += 1 695 | 696 | if total > 0: 697 | return float(response_time_sum) / float(total) 698 | else: 699 | return 0 700 | 701 | def get_average_process_time(self): 702 | """Computes the average process (parse) time of pages that returned an HTTP 703 | code (good or bad). Exceptions are ignored. 704 | """ 705 | process_time_sum = 0 706 | total = 0 707 | for page in self.pages.values(): 708 | if page.process_time is not None: 709 | process_time_sum += page.process_time 710 | total += 1 711 | 712 | if total > 0: 713 | return float(process_time_sum) / float(total) 714 | else: 715 | return 0 716 | 717 | def __unicode__(self): 718 | return "Site for {0}".format(self.start_url_splits) 719 | 720 | 721 | def crawl_page(worker_init): 722 | """Safe redirection to the page crawler""" 723 | page_crawler = PageCrawler(worker_init) 724 | page_crawler.crawl_page_forever() 725 | 726 | 727 | def open_url(open_func, request_class, url, timeout, timeout_exception, 728 | auth_header=None, extra_headers=None, logger=None): 729 | """Opens a URL and returns a Response object. 
730 | 731 | All parameters are required to be able to use a patched version of the 732 | Python standard library (i.e., patched by gevent) 733 | 734 | :param open_func: url open function, typicaly urllib2.urlopen 735 | :param request_class: the request class to use 736 | :param url: the url to open 737 | :param timeout: number of seconds to wait before timing out 738 | :param timeout_exception: the exception thrown by open_func if a timeout 739 | occurs 740 | :param auth_header: authentication header 741 | :param extra_headers: dict of {Header: Value} 742 | :param logger: logger used to log exceptions 743 | :rtype: A Response object 744 | """ 745 | try: 746 | request = request_class(url) 747 | 748 | if auth_header: 749 | request.add_header(auth_header[0], auth_header[1]) 750 | 751 | if extra_headers: 752 | for header, value in extra_headers.items(): 753 | request.add_header(header, value) 754 | 755 | start = time.time() 756 | output_value = open_func(request, timeout=timeout) 757 | stop = time.time() 758 | final_url = output_value.geturl() 759 | code = output_value.getcode() 760 | response = Response( 761 | content=output_value, status=code, exception=None, 762 | original_url=url, final_url=final_url, 763 | is_redirect=final_url != url, is_timeout=False, 764 | response_time=stop-start) 765 | except HTTPError as http_error: 766 | stop = time.time() 767 | code = http_error.code 768 | response = Response( 769 | content=None, status=code, exception=http_error, 770 | original_url=url, final_url=None, is_redirect=False, 771 | is_timeout=False, response_time=stop-start) 772 | except timeout_exception as t_exception: 773 | response = Response( 774 | content=None, status=None, exception=t_exception, 775 | original_url=url, final_url=None, is_redirect=False, 776 | is_timeout=True, response_time=None) 777 | except Exception as exc: 778 | if logger: 779 | logger.warning("Exception while opening an URL", exc_info=True) 780 | response = Response( 781 | content=None, status=None, exception=exc, 782 | original_url=url, final_url=None, is_redirect=False, 783 | is_timeout=False, response_time=None) 784 | 785 | return response 786 | 787 | 788 | def execute_from_command_line(): 789 | """Runs the crawler and retrieves the configuration from the command 790 | line. 
791 | """ 792 | try: 793 | start = time.time() 794 | config = Config() 795 | config.parse_cli_config() 796 | 797 | logger = configure_logger(config) 798 | crawler = execute_from_config(config, logger) 799 | 800 | stop = time.time() 801 | 802 | if not crawler.site.is_ok or config.options.when == WHEN_ALWAYS: 803 | report(crawler.site, config, stop - start, logger) 804 | 805 | if not crawler.site.is_ok: 806 | sys.exit(1) 807 | except Exception as e: 808 | print(e) 809 | sys.exit(1) 810 | 811 | 812 | def configure_logger(config): 813 | """Configures a logger based on the configuration.""" 814 | if config.options.verbose == VERBOSE_QUIET: 815 | logging.basicConfig(level=logging.CRITICAL) 816 | elif config.options.verbose == VERBOSE_NORMAL: 817 | logging.basicConfig(level=logging.WARNING) 818 | else: 819 | logging.basicConfig(level=logging.DEBUG) 820 | 821 | logger = get_logger() 822 | 823 | return logger 824 | 825 | 826 | def execute_from_config(config, logger): 827 | """Executes a crawler given a config and logger.""" 828 | if not config.start_urls: 829 | raise Exception("At least one starting URL must be supplied.") 830 | 831 | if config.options.allow_insecure_content: 832 | # Ref: https://www.python.org/dev/peps/pep-0476/#opting-out 833 | import ssl 834 | try: 835 | _create_unverified_https_context = ssl._create_unverified_context 836 | except AttributeError: 837 | # Legacy Python that doesn't verify HTTPS certificates by default 838 | pass 839 | else: 840 | # Handle target environment that doesn't support HTTPS verification 841 | ssl._create_default_https_context = _create_unverified_https_context 842 | 843 | if config.options.mode == MODE_THREAD: 844 | crawler = ThreadSiteCrawler(config, logger) 845 | elif config.options.mode == MODE_PROCESS: 846 | crawler = ProcessSiteCrawler(config, logger) 847 | elif config.options.mode == MODE_GREEN: 848 | crawler = GreenSiteCrawler(config, logger) 849 | 850 | if not crawler: 851 | raise Exception("Invalid crawling mode supplied.") 852 | 853 | crawler.crawl() 854 | 855 | if config.options.multi: 856 | crawler.site.collect_multi_sites() 857 | 858 | return crawler 859 | -------------------------------------------------------------------------------- /pylinkvalidator/included/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartdag/pylinkvalidator/aac5934d88a9c99d0e4f40a8884ad942b6b10ea0/pylinkvalidator/included/__init__.py -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/__init__.py: -------------------------------------------------------------------------------- 1 | """Beautiful Soup 2 | Elixir and Tonic 3 | "The Screen-Scraper's Friend" 4 | http://www.crummy.com/software/BeautifulSoup/ 5 | 6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a 7 | (possibly invalid) document into a tree representation. Beautiful Soup 8 | provides provides methods and Pythonic idioms that make it easy to 9 | navigate, search, and modify the parse tree. 10 | 11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml 12 | and/or html5lib is installed. 
13 | 14 | For more than you ever wanted to know about Beautiful Soup, see the 15 | documentation: 16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 17 | """ 18 | 19 | from __future__ import absolute_import 20 | import sys 21 | 22 | __author__ = "Leonard Richardson (leonardr@segfault.org)" 23 | __version__ = "4.2.1" 24 | __copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" 25 | __license__ = "MIT" 26 | 27 | 28 | use_system_version = False 29 | 30 | try: 31 | # The system-installed version has priority providing it is not an 32 | # earlier version. The embedded bs4 only works for Python 2. 33 | import bs4 34 | if (bs4.__version__.split('.') >= __version__.split('.')) or\ 35 | sys.version_info[0] >= 3: 36 | from bs4 import * 37 | 38 | # Necessary for direct import in pylinkvalidator 39 | UnicodeDammit = bs4.UnicodeDammit 40 | use_system_version = True 41 | # Make sure we copy over the version. See #17071 42 | __version__ = bs4.__version__ 43 | except ImportError: 44 | if sys.version_info[0] >= 3: 45 | raise 46 | 47 | if not use_system_version: 48 | 49 | __all__ = ['BeautifulSoup'] 50 | 51 | import re 52 | import warnings 53 | 54 | from .builder import builder_registry 55 | from .dammit import UnicodeDammit 56 | from .element import ( 57 | CData, 58 | Comment, 59 | DEFAULT_OUTPUT_ENCODING, 60 | Declaration, 61 | Doctype, 62 | NavigableString, 63 | PageElement, 64 | ProcessingInstruction, 65 | ResultSet, 66 | SoupStrainer, 67 | Tag, 68 | ) 69 | 70 | # The very first thing we do is give a useful error if someone is 71 | # running this code under Python 3 without converting it. 72 | syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' 73 | 74 | class BeautifulSoup(Tag): 75 | """ 76 | This class defines the basic interface called by the tree builders. 77 | 78 | These methods will be called by the parser: 79 | reset() 80 | feed(markup) 81 | 82 | The tree builder may call these methods from its feed() implementation: 83 | handle_starttag(name, attrs) # See note about return value 84 | handle_endtag(name) 85 | handle_data(data) # Appends to the current data node 86 | endData(containerClass=NavigableString) # Ends the current data node 87 | 88 | No matter how complicated the underlying parser is, you should be 89 | able to build a tree using 'start tag' events, 'end tag' events, 90 | 'data' events, and "done with data" events. 91 | 92 | If you encounter an empty-element tag (aka a self-closing tag, 93 | like HTML's
tag), call handle_starttag and then 94 | handle_endtag. 95 | """ 96 | ROOT_TAG_NAME = u'[document]' 97 | 98 | # If the end-user gives no indication which tree builder they 99 | # want, look for one with these features. 100 | DEFAULT_BUILDER_FEATURES = ['html', 'fast'] 101 | 102 | # Used when determining whether a text node is all whitespace and 103 | # can be replaced with a single space. A text node that contains 104 | # fancy Unicode spaces (usually non-breaking) should be left 105 | # alone. 106 | STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, } 107 | 108 | def __init__(self, markup="", features=None, builder=None, 109 | parse_only=None, from_encoding=None, **kwargs): 110 | """The Soup object is initialized as the 'root tag', and the 111 | provided markup (which can be a string or a file-like object) 112 | is fed into the underlying parser.""" 113 | 114 | if 'convertEntities' in kwargs: 115 | warnings.warn( 116 | "BS4 does not respect the convertEntities argument to the " 117 | "BeautifulSoup constructor. Entities are always converted " 118 | "to Unicode characters.") 119 | 120 | if 'markupMassage' in kwargs: 121 | del kwargs['markupMassage'] 122 | warnings.warn( 123 | "BS4 does not respect the markupMassage argument to the " 124 | "BeautifulSoup constructor. The tree builder is responsible " 125 | "for any necessary markup massage.") 126 | 127 | if 'smartQuotesTo' in kwargs: 128 | del kwargs['smartQuotesTo'] 129 | warnings.warn( 130 | "BS4 does not respect the smartQuotesTo argument to the " 131 | "BeautifulSoup constructor. Smart quotes are always converted " 132 | "to Unicode characters.") 133 | 134 | if 'selfClosingTags' in kwargs: 135 | del kwargs['selfClosingTags'] 136 | warnings.warn( 137 | "BS4 does not respect the selfClosingTags argument to the " 138 | "BeautifulSoup constructor. The tree builder is responsible " 139 | "for understanding self-closing tags.") 140 | 141 | if 'isHTML' in kwargs: 142 | del kwargs['isHTML'] 143 | warnings.warn( 144 | "BS4 does not respect the isHTML argument to the " 145 | "BeautifulSoup constructor. You can pass in features='html' " 146 | "or features='xml' to get a builder capable of handling " 147 | "one or the other.") 148 | 149 | def deprecated_argument(old_name, new_name): 150 | if old_name in kwargs: 151 | warnings.warn( 152 | 'The "%s" argument to the BeautifulSoup constructor ' 153 | 'has been renamed to "%s."' % (old_name, new_name)) 154 | value = kwargs[old_name] 155 | del kwargs[old_name] 156 | return value 157 | return None 158 | 159 | parse_only = parse_only or deprecated_argument( 160 | "parseOnlyThese", "parse_only") 161 | 162 | from_encoding = from_encoding or deprecated_argument( 163 | "fromEncoding", "from_encoding") 164 | 165 | if len(kwargs) > 0: 166 | arg = kwargs.keys().pop() 167 | raise TypeError( 168 | "__init__() got an unexpected keyword argument '%s'" % arg) 169 | 170 | if builder is None: 171 | if isinstance(features, basestring): 172 | features = [features] 173 | if features is None or len(features) == 0: 174 | features = self.DEFAULT_BUILDER_FEATURES 175 | builder_class = builder_registry.lookup(*features) 176 | if builder_class is None: 177 | raise FeatureNotFound( 178 | "Couldn't find a tree builder with the features you " 179 | "requested: %s. Do you need to install a parser library?" 
180 | % ",".join(features)) 181 | builder = builder_class() 182 | self.builder = builder 183 | self.is_xml = builder.is_xml 184 | self.builder.soup = self 185 | 186 | self.parse_only = parse_only 187 | 188 | self.reset() 189 | 190 | if hasattr(markup, 'read'): # It's a file-type object. 191 | markup = markup.read() 192 | (self.markup, self.original_encoding, self.declared_html_encoding, 193 | self.contains_replacement_characters) = ( 194 | self.builder.prepare_markup(markup, from_encoding)) 195 | 196 | try: 197 | self._feed() 198 | except StopParsing: 199 | pass 200 | 201 | # Clear out the markup and remove the builder's circular 202 | # reference to this object. 203 | self.markup = None 204 | self.builder.soup = None 205 | 206 | def _feed(self): 207 | # Convert the document to Unicode. 208 | self.builder.reset() 209 | 210 | self.builder.feed(self.markup) 211 | # Close out any unfinished strings and close all the open tags. 212 | self.endData() 213 | while self.currentTag.name != self.ROOT_TAG_NAME: 214 | self.popTag() 215 | 216 | def reset(self): 217 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) 218 | self.hidden = 1 219 | self.builder.reset() 220 | self.currentData = [] 221 | self.currentTag = None 222 | self.tagStack = [] 223 | self.pushTag(self) 224 | 225 | def new_tag(self, name, namespace=None, nsprefix=None, **attrs): 226 | """Create a new tag associated with this soup.""" 227 | return Tag(None, self.builder, name, namespace, nsprefix, attrs) 228 | 229 | def new_string(self, s, subclass=NavigableString): 230 | """Create a new NavigableString associated with this soup.""" 231 | navigable = subclass(s) 232 | navigable.setup() 233 | return navigable 234 | 235 | def insert_before(self, successor): 236 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 237 | 238 | def insert_after(self, successor): 239 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().") 240 | 241 | def popTag(self): 242 | tag = self.tagStack.pop() 243 | #print "Pop", tag.name 244 | if self.tagStack: 245 | self.currentTag = self.tagStack[-1] 246 | return self.currentTag 247 | 248 | def pushTag(self, tag): 249 | #print "Push", tag.name 250 | if self.currentTag: 251 | self.currentTag.contents.append(tag) 252 | self.tagStack.append(tag) 253 | self.currentTag = self.tagStack[-1] 254 | 255 | def endData(self, containerClass=NavigableString): 256 | if self.currentData: 257 | currentData = u''.join(self.currentData) 258 | if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and 259 | not set([tag.name for tag in self.tagStack]).intersection( 260 | self.builder.preserve_whitespace_tags)): 261 | if '\n' in currentData: 262 | currentData = '\n' 263 | else: 264 | currentData = ' ' 265 | self.currentData = [] 266 | if self.parse_only and len(self.tagStack) <= 1 and \ 267 | (not self.parse_only.text or \ 268 | not self.parse_only.search(currentData)): 269 | return 270 | o = containerClass(currentData) 271 | self.object_was_parsed(o) 272 | 273 | def object_was_parsed(self, o, parent=None, most_recent_element=None): 274 | """Add an object to the parse tree.""" 275 | parent = parent or self.currentTag 276 | most_recent_element = most_recent_element or self._most_recent_element 277 | o.setup(parent, most_recent_element) 278 | if most_recent_element is not None: 279 | most_recent_element.next_element = o 280 | self._most_recent_element = o 281 | parent.contents.append(o) 282 | 283 | def _popToTag(self, name, nsprefix=None, inclusivePop=True): 284 | """Pops 
the tag stack up to and including the most recent 285 | instance of the given tag. If inclusivePop is false, pops the tag 286 | stack up to but *not* including the most recent instqance of 287 | the given tag.""" 288 | #print "Popping to %s" % name 289 | if name == self.ROOT_TAG_NAME: 290 | return 291 | 292 | numPops = 0 293 | mostRecentTag = None 294 | 295 | for i in range(len(self.tagStack) - 1, 0, -1): 296 | if (name == self.tagStack[i].name 297 | and nsprefix == self.tagStack[i].prefix): 298 | numPops = len(self.tagStack) - i 299 | break 300 | if not inclusivePop: 301 | numPops = numPops - 1 302 | 303 | for i in range(0, numPops): 304 | mostRecentTag = self.popTag() 305 | return mostRecentTag 306 | 307 | def handle_starttag(self, name, namespace, nsprefix, attrs): 308 | """Push a start tag on to the stack. 309 | 310 | If this method returns None, the tag was rejected by the 311 | SoupStrainer. You should proceed as if the tag had not occured 312 | in the document. For instance, if this was a self-closing tag, 313 | don't call handle_endtag. 314 | """ 315 | 316 | # print "Start tag %s: %s" % (name, attrs) 317 | self.endData() 318 | 319 | if (self.parse_only and len(self.tagStack) <= 1 320 | and (self.parse_only.text 321 | or not self.parse_only.search_tag(name, attrs))): 322 | return None 323 | 324 | tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, 325 | self.currentTag, self._most_recent_element) 326 | if tag is None: 327 | return tag 328 | if self._most_recent_element: 329 | self._most_recent_element.next_element = tag 330 | self._most_recent_element = tag 331 | self.pushTag(tag) 332 | return tag 333 | 334 | def handle_endtag(self, name, nsprefix=None): 335 | #print "End tag: " + name 336 | self.endData() 337 | self._popToTag(name, nsprefix) 338 | 339 | def handle_data(self, data): 340 | self.currentData.append(data) 341 | 342 | def decode(self, pretty_print=False, 343 | eventual_encoding=DEFAULT_OUTPUT_ENCODING, 344 | formatter="minimal"): 345 | """Returns a string or Unicode representation of this document. 346 | To get Unicode, pass None for encoding.""" 347 | 348 | if self.is_xml: 349 | # Print the XML declaration 350 | encoding_part = '' 351 | if eventual_encoding != None: 352 | encoding_part = ' encoding="%s"' % eventual_encoding 353 | prefix = u'\n' % encoding_part 354 | else: 355 | prefix = u'' 356 | if not pretty_print: 357 | indent_level = None 358 | else: 359 | indent_level = 0 360 | return prefix + super(BeautifulSoup, self).decode( 361 | indent_level, eventual_encoding, formatter) 362 | 363 | # Alias to make it easier to type import: 'from bs4 import _soup' 364 | _s = BeautifulSoup 365 | _soup = BeautifulSoup 366 | 367 | class BeautifulStoneSoup(BeautifulSoup): 368 | """Deprecated interface to an XML parser.""" 369 | 370 | def __init__(self, *args, **kwargs): 371 | kwargs['features'] = 'xml' 372 | warnings.warn( 373 | 'The BeautifulStoneSoup class is deprecated. Instead of using ' 374 | 'it, pass features="xml" into the BeautifulSoup constructor.') 375 | super(BeautifulStoneSoup, self).__init__(*args, **kwargs) 376 | 377 | 378 | class StopParsing(Exception): 379 | pass 380 | 381 | 382 | class FeatureNotFound(ValueError): 383 | pass 384 | 385 | 386 | #By default, act as an HTML pretty-printer. 
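# Outside of this command-line pretty-printing mode, the class above is used
# programmatically. A minimal sketch (assuming the embedded copy is importable
# as pylinkvalidator.included.bs4) of pulling link targets out of a page:
#
#   from pylinkvalidator.included.bs4 import BeautifulSoup
#   soup = BeautifulSoup("<html><body><a href='/a.html'>a</a></body></html>")
#   hrefs = [anchor.get('href') for anchor in soup.find_all('a')]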
387 | if __name__ == '__main__': 388 | import sys 389 | soup = BeautifulSoup(sys.stdin) 390 | print(soup.prettify()) 391 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | 4 | if sys.version_info[0] < 3: 5 | from collections import defaultdict 6 | import itertools 7 | import sys 8 | from pylinkvalidator.included.bs4.element import ( 9 | CharsetMetaAttributeValue, 10 | ContentMetaAttributeValue, 11 | whitespace_re 12 | ) 13 | 14 | __all__ = [ 15 | 'HTMLTreeBuilder', 16 | 'SAXTreeBuilder', 17 | 'TreeBuilder', 18 | 'TreeBuilderRegistry', 19 | ] 20 | 21 | # Some useful features for a TreeBuilder to have. 22 | FAST = 'fast' 23 | PERMISSIVE = 'permissive' 24 | STRICT = 'strict' 25 | XML = 'xml' 26 | HTML = 'html' 27 | HTML_5 = 'html5' 28 | 29 | 30 | class TreeBuilderRegistry(object): 31 | 32 | def __init__(self): 33 | self.builders_for_feature = defaultdict(list) 34 | self.builders = [] 35 | 36 | def register(self, treebuilder_class): 37 | """Register a treebuilder based on its advertised features.""" 38 | for feature in treebuilder_class.features: 39 | self.builders_for_feature[feature].insert(0, treebuilder_class) 40 | self.builders.insert(0, treebuilder_class) 41 | 42 | def lookup(self, *features): 43 | if len(self.builders) == 0: 44 | # There are no builders at all. 45 | return None 46 | 47 | if len(features) == 0: 48 | # They didn't ask for any features. Give them the most 49 | # recently registered builder. 50 | return self.builders[0] 51 | 52 | # Go down the list of features in order, and eliminate any builders 53 | # that don't match every feature. 54 | features = list(features) 55 | features.reverse() 56 | candidates = None 57 | candidate_set = None 58 | while len(features) > 0: 59 | feature = features.pop() 60 | we_have_the_feature = self.builders_for_feature.get(feature, []) 61 | if len(we_have_the_feature) > 0: 62 | if candidates is None: 63 | candidates = we_have_the_feature 64 | candidate_set = set(candidates) 65 | else: 66 | # Eliminate any candidates that don't have this feature. 67 | candidate_set = candidate_set.intersection( 68 | set(we_have_the_feature)) 69 | 70 | # The only valid candidates are the ones in candidate_set. 71 | # Go through the original list of candidates and pick the first one 72 | # that's in candidate_set. 73 | if candidate_set is None: 74 | return None 75 | for candidate in candidates: 76 | if candidate in candidate_set: 77 | return candidate 78 | return None 79 | 80 | # The BeautifulSoup class will take feature lists from developers and use them 81 | # to look up builders in this registry. 82 | builder_registry = TreeBuilderRegistry() 83 | 84 | class TreeBuilder(object): 85 | """Turn a document into a Beautiful Soup object tree.""" 86 | 87 | features = [] 88 | 89 | is_xml = False 90 | preserve_whitespace_tags = set() 91 | empty_element_tags = None # A tag will be considered an empty-element 92 | # tag when and only when it has no contents. 93 | 94 | # A value for these tag/attribute combinations is a space- or 95 | # comma-separated list of CDATA, rather than a single CDATA. 96 | cdata_list_attributes = {} 97 | 98 | 99 | def __init__(self): 100 | self.soup = None 101 | 102 | def reset(self): 103 | pass 104 | 105 | def can_be_empty_element(self, tag_name): 106 | """Might a tag with this name be an empty-element tag? 
107 | 108 | The final markup may or may not actually present this tag as 109 | self-closing. 110 | 111 | For instance: an HTMLBuilder does not consider a

<p> tag to be 112 | an empty-element tag (it's not in 113 | HTMLBuilder.empty_element_tags). This means an empty <p> tag 114 | will be presented as "<p></p>", not "<p />
". 115 | 116 | The default implementation has no opinion about which tags are 117 | empty-element tags, so a tag will be presented as an 118 | empty-element tag if and only if it has no contents. 119 | "" will become "", and "bar" will 120 | be left alone. 121 | """ 122 | if self.empty_element_tags is None: 123 | return True 124 | return tag_name in self.empty_element_tags 125 | 126 | def feed(self, markup): 127 | raise NotImplementedError() 128 | 129 | def prepare_markup(self, markup, user_specified_encoding=None, 130 | document_declared_encoding=None): 131 | return markup, None, None, False 132 | 133 | def test_fragment_to_document(self, fragment): 134 | """Wrap an HTML fragment to make it look like a document. 135 | 136 | Different parsers do this differently. For instance, lxml 137 | introduces an empty tag, and html5lib 138 | doesn't. Abstracting this away lets us write simple tests 139 | which run HTML fragments through the parser and compare the 140 | results against other HTML fragments. 141 | 142 | This method should not be used outside of tests. 143 | """ 144 | return fragment 145 | 146 | def set_up_substitutions(self, tag): 147 | return False 148 | 149 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 150 | """Replaces class="foo bar" with class=["foo", "bar"] 151 | 152 | Modifies its input in place. 153 | """ 154 | if self.cdata_list_attributes: 155 | universal = self.cdata_list_attributes.get('*', []) 156 | tag_specific = self.cdata_list_attributes.get( 157 | tag_name.lower(), []) 158 | for cdata_list_attr in itertools.chain(universal, tag_specific): 159 | if cdata_list_attr in attrs: 160 | # Basically, we have a "class" attribute whose 161 | # value is a whitespace-separated list of CSS 162 | # classes. Split it into a list. 163 | value = attrs[cdata_list_attr] 164 | if isinstance(value, basestring): 165 | values = whitespace_re.split(value) 166 | else: 167 | # html5lib sometimes calls setAttributes twice 168 | # for the same tag when rearranging the parse 169 | # tree. On the second call the attribute value 170 | # here is already a list. If this happens, 171 | # leave the value alone rather than trying to 172 | # split it again. 173 | values = value 174 | attrs[cdata_list_attr] = values 175 | return attrs 176 | 177 | class SAXTreeBuilder(TreeBuilder): 178 | """A Beautiful Soup treebuilder that listens for SAX events.""" 179 | 180 | def feed(self, markup): 181 | raise NotImplementedError() 182 | 183 | def close(self): 184 | pass 185 | 186 | def startElement(self, name, attrs): 187 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 188 | #print "Start %s, %r" % (name, attrs) 189 | self.soup.handle_starttag(name, attrs) 190 | 191 | def endElement(self, name): 192 | #print "End %s" % name 193 | self.soup.handle_endtag(name) 194 | 195 | def startElementNS(self, nsTuple, nodeName, attrs): 196 | # Throw away (ns, nodeName) for now. 197 | self.startElement(nodeName, attrs) 198 | 199 | def endElementNS(self, nsTuple, nodeName): 200 | # Throw away (ns, nodeName) for now. 201 | self.endElement(nodeName) 202 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 203 | 204 | def startPrefixMapping(self, prefix, nodeValue): 205 | # Ignore the prefix for now. 206 | pass 207 | 208 | def endPrefixMapping(self, prefix): 209 | # Ignore the prefix for now. 
210 | # handler.endPrefixMapping(prefix) 211 | pass 212 | 213 | def characters(self, content): 214 | self.soup.handle_data(content) 215 | 216 | def startDocument(self): 217 | pass 218 | 219 | def endDocument(self): 220 | pass 221 | 222 | 223 | class HTMLTreeBuilder(TreeBuilder): 224 | """This TreeBuilder knows facts about HTML. 225 | 226 | Such as which tags are empty-element tags. 227 | """ 228 | 229 | preserve_whitespace_tags = set(['pre', 'textarea']) 230 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 231 | 'spacer', 'link', 'frame', 'base']) 232 | 233 | # The HTML standard defines these attributes as containing a 234 | # space-separated list of values, not a single value. That is, 235 | # class="foo bar" means that the 'class' attribute has two values, 236 | # 'foo' and 'bar', not the single value 'foo bar'. When we 237 | # encounter one of these attributes, we will parse its value into 238 | # a list of values if possible. Upon output, the list will be 239 | # converted back into a string. 240 | cdata_list_attributes = { 241 | "*" : ['class', 'accesskey', 'dropzone'], 242 | "a" : ['rel', 'rev'], 243 | "link" : ['rel', 'rev'], 244 | "td" : ["headers"], 245 | "th" : ["headers"], 246 | "td" : ["headers"], 247 | "form" : ["accept-charset"], 248 | "object" : ["archive"], 249 | 250 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 251 | "area" : ["rel"], 252 | "icon" : ["sizes"], 253 | "iframe" : ["sandbox"], 254 | "output" : ["for"], 255 | } 256 | 257 | def set_up_substitutions(self, tag): 258 | # We are only interested in tags 259 | if tag.name != 'meta': 260 | return False 261 | 262 | http_equiv = tag.get('http-equiv') 263 | content = tag.get('content') 264 | charset = tag.get('charset') 265 | 266 | # We are interested in tags that say what encoding the 267 | # document was originally in. This means HTML 5-style 268 | # tags that provide the "charset" attribute. It also means 269 | # HTML 4-style tags that provide the "content" 270 | # attribute and have "http-equiv" set to "content-type". 271 | # 272 | # In both cases we will replace the value of the appropriate 273 | # attribute with a standin object that can take on any 274 | # encoding. 275 | meta_encoding = None 276 | if charset is not None: 277 | # HTML 5 style: 278 | # 279 | meta_encoding = charset 280 | tag['charset'] = CharsetMetaAttributeValue(charset) 281 | 282 | elif (content is not None and http_equiv is not None 283 | and http_equiv.lower() == 'content-type'): 284 | # HTML 4 style: 285 | # 286 | tag['content'] = ContentMetaAttributeValue(content) 287 | 288 | return (meta_encoding is not None) 289 | 290 | def register_treebuilders_from(module): 291 | """Copy TreeBuilders from the given module into this module.""" 292 | # I'm fairly sure this is not the best way to do this. 293 | this_module = sys.modules['pylinkvalidator.included.bs4.builder'] 294 | for name in module.__all__: 295 | obj = getattr(module, name) 296 | 297 | if issubclass(obj, TreeBuilder): 298 | setattr(this_module, name, obj) 299 | this_module.__all__.append(name) 300 | # Register the builder while we're at it. 301 | this_module.builder_registry.register(obj) 302 | 303 | # Builders are registered in reverse order of priority, so that custom 304 | # builder registrations will take precedence. In general, we want lxml 305 | # to take precedence over html5lib, because it's faster. And we only 306 | # want to use HTMLParser as a last result. 307 | from . 
import _htmlparser 308 | register_treebuilders_from(_htmlparser) 309 | try: 310 | from . import _html5lib 311 | register_treebuilders_from(_html5lib) 312 | except ImportError: 313 | # They don't have html5lib installed. 314 | pass 315 | try: 316 | from . import _lxml 317 | register_treebuilders_from(_lxml) 318 | except ImportError: 319 | # They don't have lxml installed. 320 | pass 321 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/builder/_html5lib.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | 4 | if sys.version_info[0] < 3: 5 | __all__ = [ 6 | 'HTML5TreeBuilder', 7 | ] 8 | 9 | import warnings 10 | from pylinkvalidator.included.bs4.builder import ( 11 | PERMISSIVE, 12 | HTML, 13 | HTML_5, 14 | HTMLTreeBuilder, 15 | ) 16 | from pylinkvalidator.included.bs4.element import NamespacedAttribute 17 | import html5lib 18 | from html5lib.constants import namespaces 19 | from pylinkvalidator.included.bs4.element import ( 20 | Comment, 21 | Doctype, 22 | NavigableString, 23 | Tag, 24 | ) 25 | 26 | class HTML5TreeBuilder(HTMLTreeBuilder): 27 | """Use html5lib to build a tree.""" 28 | 29 | features = ['html5lib', PERMISSIVE, HTML_5, HTML] 30 | 31 | def prepare_markup(self, markup, user_specified_encoding): 32 | # Store the user-specified encoding for use later on. 33 | self.user_specified_encoding = user_specified_encoding 34 | return markup, None, None, False 35 | 36 | # These methods are defined by Beautiful Soup. 37 | def feed(self, markup): 38 | if self.soup.parse_only is not None: 39 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 40 | parser = html5lib.HTMLParser(tree=self.create_treebuilder) 41 | doc = parser.parse(markup, encoding=self.user_specified_encoding) 42 | 43 | # Set the character encoding detected by the tokenizer. 44 | if isinstance(markup, unicode): 45 | # We need to special-case this because html5lib sets 46 | # charEncoding to UTF-8 if it gets Unicode input. 
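# (For Unicode input we therefore report no original encoding at all; for byte
# input, the encoding the html5lib tokenizer settled on is recorded below.)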
47 | doc.original_encoding = None 48 | else: 49 | doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 50 | 51 | def create_treebuilder(self, namespaceHTMLElements): 52 | self.underlying_builder = TreeBuilderForHtml5lib( 53 | self.soup, namespaceHTMLElements) 54 | return self.underlying_builder 55 | 56 | def test_fragment_to_document(self, fragment): 57 | """See `TreeBuilder`.""" 58 | return u'%s' % fragment 59 | 60 | 61 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): 62 | 63 | def __init__(self, soup, namespaceHTMLElements): 64 | self.soup = soup 65 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 66 | 67 | def documentClass(self): 68 | self.soup.reset() 69 | return Element(self.soup, self.soup, None) 70 | 71 | def insertDoctype(self, token): 72 | name = token["name"] 73 | publicId = token["publicId"] 74 | systemId = token["systemId"] 75 | 76 | doctype = Doctype.for_name_and_ids(name, publicId, systemId) 77 | self.soup.object_was_parsed(doctype) 78 | 79 | def elementClass(self, name, namespace): 80 | tag = self.soup.new_tag(name, namespace) 81 | return Element(tag, self.soup, namespace) 82 | 83 | def commentClass(self, data): 84 | return TextNode(Comment(data), self.soup) 85 | 86 | def fragmentClass(self): 87 | self.soup = BeautifulSoup("") 88 | self.soup.name = "[document_fragment]" 89 | return Element(self.soup, self.soup, None) 90 | 91 | def appendChild(self, node): 92 | # XXX This code is not covered by the BS4 tests. 93 | self.soup.append(node.element) 94 | 95 | def getDocument(self): 96 | return self.soup 97 | 98 | def getFragment(self): 99 | return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element 100 | 101 | class AttrList(object): 102 | def __init__(self, element): 103 | self.element = element 104 | self.attrs = dict(self.element.attrs) 105 | def __iter__(self): 106 | return list(self.attrs.items()).__iter__() 107 | def __setitem__(self, name, value): 108 | "set attr", name, value 109 | self.element[name] = value 110 | def items(self): 111 | return list(self.attrs.items()) 112 | def keys(self): 113 | return list(self.attrs.keys()) 114 | def __len__(self): 115 | return len(self.attrs) 116 | def __getitem__(self, name): 117 | return self.attrs[name] 118 | def __contains__(self, name): 119 | return name in list(self.attrs.keys()) 120 | 121 | 122 | class Element(html5lib.treebuilders._base.Node): 123 | def __init__(self, element, soup, namespace): 124 | html5lib.treebuilders._base.Node.__init__(self, element.name) 125 | self.element = element 126 | self.soup = soup 127 | self.namespace = namespace 128 | 129 | def appendChild(self, node): 130 | if (node.element.__class__ == NavigableString and self.element.contents 131 | and self.element.contents[-1].__class__ == NavigableString): 132 | # Concatenate new text onto old text node 133 | # XXX This has O(n^2) performance, for input like 134 | # "aaa..." 
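# Merge the incoming string into the trailing NavigableString with
# replace_with(), and remember the merged node as the most recent element so
# later insertions are linked after it.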
135 | old_element = self.element.contents[-1] 136 | new_element = self.soup.new_string(old_element + node.element) 137 | old_element.replace_with(new_element) 138 | self.soup._most_recent_element = new_element 139 | else: 140 | self.soup.object_was_parsed(node.element, parent=self.element) 141 | 142 | def getAttributes(self): 143 | return AttrList(self.element) 144 | 145 | def setAttributes(self, attributes): 146 | if attributes is not None and len(attributes) > 0: 147 | 148 | converted_attributes = [] 149 | for name, value in list(attributes.items()): 150 | if isinstance(name, tuple): 151 | new_name = NamespacedAttribute(*name) 152 | del attributes[name] 153 | attributes[new_name] = value 154 | 155 | self.soup.builder._replace_cdata_list_attribute_values( 156 | self.name, attributes) 157 | for name, value in attributes.items(): 158 | self.element[name] = value 159 | 160 | # The attributes may contain variables that need substitution. 161 | # Call set_up_substitutions manually. 162 | # 163 | # The Tag constructor called this method when the Tag was created, 164 | # but we just set/changed the attributes, so call it again. 165 | self.soup.builder.set_up_substitutions(self.element) 166 | attributes = property(getAttributes, setAttributes) 167 | 168 | def insertText(self, data, insertBefore=None): 169 | text = TextNode(self.soup.new_string(data), self.soup) 170 | if insertBefore: 171 | self.insertBefore(text, insertBefore) 172 | else: 173 | self.appendChild(text) 174 | 175 | def insertBefore(self, node, refNode): 176 | index = self.element.index(refNode.element) 177 | if (node.element.__class__ == NavigableString and self.element.contents 178 | and self.element.contents[index-1].__class__ == NavigableString): 179 | # (See comments in appendChild) 180 | old_node = self.element.contents[index-1] 181 | new_str = self.soup.new_string(old_node + node.element) 182 | old_node.replace_with(new_str) 183 | else: 184 | self.element.insert(index, node.element) 185 | node.parent = self 186 | 187 | def removeChild(self, node): 188 | node.element.extract() 189 | 190 | def reparentChildren(self, newParent): 191 | while self.element.contents: 192 | child = self.element.contents[0] 193 | child.extract() 194 | if isinstance(child, Tag): 195 | newParent.appendChild( 196 | Element(child, self.soup, namespaces["html"])) 197 | else: 198 | newParent.appendChild( 199 | TextNode(child, self.soup)) 200 | 201 | def cloneNode(self): 202 | tag = self.soup.new_tag(self.element.name, self.namespace) 203 | node = Element(tag, self.soup, self.namespace) 204 | for key,value in self.attributes: 205 | node.attributes[key] = value 206 | return node 207 | 208 | def hasContent(self): 209 | return self.element.contents 210 | 211 | def getNameTuple(self): 212 | if self.namespace == None: 213 | return namespaces["html"], self.name 214 | else: 215 | return self.namespace, self.name 216 | 217 | nameTuple = property(getNameTuple) 218 | 219 | class TextNode(Element): 220 | def __init__(self, element, soup): 221 | html5lib.treebuilders._base.Node.__init__(self, None) 222 | self.element = element 223 | self.soup = soup 224 | 225 | def cloneNode(self): 226 | raise NotImplementedError 227 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/builder/_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | import sys 3 | 4 | if sys.version_info[0] < 3: 5 | 6 | 
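# (As in the other builder modules, the body below only runs on Python 2; on
# Python 3 the wrapper package falls back to the system-installed bs4, so this
# embedded copy is effectively skipped.)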
__all__ = [ 7 | 'HTMLParserTreeBuilder', 8 | ] 9 | 10 | from HTMLParser import ( 11 | HTMLParser, 12 | HTMLParseError, 13 | ) 14 | import sys 15 | import warnings 16 | 17 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 18 | # argument, which we'd like to set to False. Unfortunately, 19 | # http://bugs.python.org/issue13273 makes strict=True a better bet 20 | # before Python 3.2.3. 21 | # 22 | # At the end of this file, we monkeypatch HTMLParser so that 23 | # strict=True works well on Python 3.2.2. 24 | major, minor, release = sys.version_info[:3] 25 | CONSTRUCTOR_TAKES_STRICT = ( 26 | major > 3 27 | or (major == 3 and minor > 2) 28 | or (major == 3 and minor == 2 and release >= 3)) 29 | 30 | from pylinkvalidator.included.bs4.element import ( 31 | CData, 32 | Comment, 33 | Declaration, 34 | Doctype, 35 | ProcessingInstruction, 36 | ) 37 | from pylinkvalidator.included.bs4.dammit import EntitySubstitution, UnicodeDammit 38 | 39 | from pylinkvalidator.included.bs4.builder import ( 40 | HTML, 41 | HTMLTreeBuilder, 42 | STRICT, 43 | ) 44 | 45 | 46 | HTMLPARSER = 'html.parser' 47 | 48 | class BeautifulSoupHTMLParser(HTMLParser): 49 | def handle_starttag(self, name, attrs): 50 | # XXX namespace 51 | self.soup.handle_starttag(name, None, None, dict(attrs)) 52 | 53 | def handle_endtag(self, name): 54 | self.soup.handle_endtag(name) 55 | 56 | def handle_data(self, data): 57 | self.soup.handle_data(data) 58 | 59 | def handle_charref(self, name): 60 | # XXX workaround for a bug in HTMLParser. Remove this once 61 | # it's fixed. 62 | if name.startswith('x'): 63 | real_name = int(name.lstrip('x'), 16) 64 | elif name.startswith('X'): 65 | real_name = int(name.lstrip('X'), 16) 66 | else: 67 | real_name = int(name) 68 | 69 | try: 70 | data = unichr(real_name) 71 | except (ValueError, OverflowError), e: 72 | data = u"\N{REPLACEMENT CHARACTER}" 73 | 74 | self.handle_data(data) 75 | 76 | def handle_entityref(self, name): 77 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 78 | if character is not None: 79 | data = character 80 | else: 81 | data = "&%s;" % name 82 | self.handle_data(data) 83 | 84 | def handle_comment(self, data): 85 | self.soup.endData() 86 | self.soup.handle_data(data) 87 | self.soup.endData(Comment) 88 | 89 | def handle_decl(self, data): 90 | self.soup.endData() 91 | if data.startswith("DOCTYPE "): 92 | data = data[len("DOCTYPE "):] 93 | elif data == 'DOCTYPE': 94 | # i.e. "" 95 | data = '' 96 | self.soup.handle_data(data) 97 | self.soup.endData(Doctype) 98 | 99 | def unknown_decl(self, data): 100 | if data.upper().startswith('CDATA['): 101 | cls = CData 102 | data = data[len('CDATA['):] 103 | else: 104 | cls = Declaration 105 | self.soup.endData() 106 | self.soup.handle_data(data) 107 | self.soup.endData(cls) 108 | 109 | def handle_pi(self, data): 110 | self.soup.endData() 111 | if data.endswith("?") and data.lower().startswith("xml"): 112 | # "An XHTML processing instruction using the trailing '?' 113 | # will cause the '?' to be included in data." - HTMLParser 114 | # docs. 115 | # 116 | # Strip the question mark so we don't end up with two 117 | # question marks. 
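# For example, "<?xml version='1.0'?>" reaches handle_pi() as
# "xml version='1.0'?", so dropping the trailing "?" leaves clean
# ProcessingInstruction text.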
118 | data = data[:-1] 119 | self.soup.handle_data(data) 120 | self.soup.endData(ProcessingInstruction) 121 | 122 | 123 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 124 | 125 | is_xml = False 126 | features = [HTML, STRICT, HTMLPARSER] 127 | 128 | def __init__(self, *args, **kwargs): 129 | if CONSTRUCTOR_TAKES_STRICT: 130 | kwargs['strict'] = False 131 | self.parser_args = (args, kwargs) 132 | 133 | def prepare_markup(self, markup, user_specified_encoding=None, 134 | document_declared_encoding=None): 135 | """ 136 | :return: A 4-tuple (markup, original encoding, encoding 137 | declared within markup, whether any characters had to be 138 | replaced with REPLACEMENT CHARACTER). 139 | """ 140 | if isinstance(markup, unicode): 141 | return markup, None, None, False 142 | 143 | try_encodings = [user_specified_encoding, document_declared_encoding] 144 | dammit = UnicodeDammit(markup, try_encodings, is_html=True) 145 | return (dammit.markup, dammit.original_encoding, 146 | dammit.declared_html_encoding, 147 | dammit.contains_replacement_characters) 148 | 149 | def feed(self, markup): 150 | args, kwargs = self.parser_args 151 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 152 | parser.soup = self.soup 153 | try: 154 | parser.feed(markup) 155 | except HTMLParseError, e: 156 | warnings.warn(RuntimeWarning( 157 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 158 | raise e 159 | 160 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 161 | # 3.2.3 code. This ensures they don't treat markup like
<p></p>
as a 162 | # string. 163 | # 164 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 165 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 166 | import re 167 | attrfind_tolerant = re.compile( 168 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 169 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 170 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 171 | 172 | locatestarttagend = re.compile(r""" 173 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 174 | (?:\s+ # whitespace before attribute name 175 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 176 | (?:\s*=\s* # value indicator 177 | (?:'[^']*' # LITA-enclosed value 178 | |\"[^\"]*\" # LIT-enclosed value 179 | |[^'\">\s]+ # bare value 180 | ) 181 | )? 182 | ) 183 | )* 184 | \s* # trailing whitespace 185 | """, re.VERBOSE) 186 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 187 | 188 | from html.parser import tagfind, attrfind 189 | 190 | def parse_starttag(self, i): 191 | self.__starttag_text = None 192 | endpos = self.check_for_whole_start_tag(i) 193 | if endpos < 0: 194 | return endpos 195 | rawdata = self.rawdata 196 | self.__starttag_text = rawdata[i:endpos] 197 | 198 | # Now parse the data between i+1 and j into a tag and attrs 199 | attrs = [] 200 | match = tagfind.match(rawdata, i+1) 201 | assert match, 'unexpected call to parse_starttag()' 202 | k = match.end() 203 | self.lasttag = tag = rawdata[i+1:k].lower() 204 | while k < endpos: 205 | if self.strict: 206 | m = attrfind.match(rawdata, k) 207 | else: 208 | m = attrfind_tolerant.match(rawdata, k) 209 | if not m: 210 | break 211 | attrname, rest, attrvalue = m.group(1, 2, 3) 212 | if not rest: 213 | attrvalue = None 214 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 215 | attrvalue[:1] == '"' == attrvalue[-1:]: 216 | attrvalue = attrvalue[1:-1] 217 | if attrvalue: 218 | attrvalue = self.unescape(attrvalue) 219 | attrs.append((attrname.lower(), attrvalue)) 220 | k = m.end() 221 | 222 | end = rawdata[k:endpos].strip() 223 | if end not in (">", "/>"): 224 | lineno, offset = self.getpos() 225 | if "\n" in self.__starttag_text: 226 | lineno = lineno + self.__starttag_text.count("\n") 227 | offset = len(self.__starttag_text) \ 228 | - self.__starttag_text.rfind("\n") 229 | else: 230 | offset = offset + len(self.__starttag_text) 231 | if self.strict: 232 | self.error("junk characters in start tag: %r" 233 | % (rawdata[k:endpos][:20],)) 234 | self.handle_data(rawdata[i:endpos]) 235 | return endpos 236 | if end.endswith('/>'): 237 | # XHTML-style empty tag: 238 | self.handle_startendtag(tag, attrs) 239 | else: 240 | self.handle_starttag(tag, attrs) 241 | if tag in self.CDATA_CONTENT_ELEMENTS: 242 | self.set_cdata_mode(tag) 243 | return endpos 244 | 245 | def set_cdata_mode(self, elem): 246 | self.cdata_elem = elem.lower() 247 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 248 | 249 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 250 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 251 | 252 | CONSTRUCTOR_TAKES_STRICT = True 253 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/builder/_lxml.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | 4 | if sys.version_info[0] < 3: 5 | __all__ = [ 6 | 'LXMLTreeBuilderForXML', 7 | 'LXMLTreeBuilder', 8 | ] 9 | 10 | from io import BytesIO 11 | from StringIO import StringIO 12 | import collections 13 | from lxml import etree 14 | from 
pylinkvalidator.included.bs4.element import Comment, Doctype, NamespacedAttribute 15 | from pylinkvalidator.included.bs4.builder import ( 16 | FAST, 17 | HTML, 18 | HTMLTreeBuilder, 19 | PERMISSIVE, 20 | TreeBuilder, 21 | XML) 22 | from pylinkvalidator.included.bs4.dammit import UnicodeDammit 23 | 24 | LXML = 'lxml' 25 | 26 | class LXMLTreeBuilderForXML(TreeBuilder): 27 | DEFAULT_PARSER_CLASS = etree.XMLParser 28 | 29 | is_xml = True 30 | 31 | # Well, it's permissive by XML parser standards. 32 | features = [LXML, XML, FAST, PERMISSIVE] 33 | 34 | CHUNK_SIZE = 512 35 | 36 | # This namespace mapping is specified in the XML Namespace 37 | # standard. 38 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 39 | 40 | @property 41 | def default_parser(self): 42 | # This can either return a parser object or a class, which 43 | # will be instantiated with default arguments. 44 | return etree.XMLParser(target=self, strip_cdata=False, recover=True) 45 | 46 | def __init__(self, parser=None, empty_element_tags=None): 47 | if empty_element_tags is not None: 48 | self.empty_element_tags = set(empty_element_tags) 49 | if parser is None: 50 | # Use the default parser. 51 | parser = self.default_parser 52 | if isinstance(parser, collections.Callable): 53 | # Instantiate the parser with default arguments 54 | parser = parser(target=self, strip_cdata=False) 55 | self.parser = parser 56 | self.soup = None 57 | self.nsmaps = [self.DEFAULT_NSMAPS] 58 | 59 | def _getNsTag(self, tag): 60 | # Split the namespace URL out of a fully-qualified lxml tag 61 | # name. Copied from lxml's src/lxml/sax.py. 62 | if tag[0] == '{': 63 | return tuple(tag[1:].split('}', 1)) 64 | else: 65 | return (None, tag) 66 | 67 | def prepare_markup(self, markup, user_specified_encoding=None, 68 | document_declared_encoding=None): 69 | """ 70 | :return: A 3-tuple (markup, original encoding, encoding 71 | declared within markup). 72 | """ 73 | if isinstance(markup, unicode): 74 | return markup, None, None, False 75 | 76 | try_encodings = [user_specified_encoding, document_declared_encoding] 77 | dammit = UnicodeDammit(markup, try_encodings, is_html=True) 78 | return (dammit.markup, dammit.original_encoding, 79 | dammit.declared_html_encoding, 80 | dammit.contains_replacement_characters) 81 | 82 | def feed(self, markup): 83 | if isinstance(markup, bytes): 84 | markup = BytesIO(markup) 85 | elif isinstance(markup, unicode): 86 | markup = StringIO(markup) 87 | # Call feed() at least once, even if the markup is empty, 88 | # or the parser won't be initialized. 89 | data = markup.read(self.CHUNK_SIZE) 90 | self.parser.feed(data) 91 | while data != '': 92 | # Now call feed() on the rest of the data, chunk by chunk. 93 | data = markup.read(self.CHUNK_SIZE) 94 | if data != '': 95 | self.parser.feed(data) 96 | self.parser.close() 97 | 98 | def close(self): 99 | self.nsmaps = [self.DEFAULT_NSMAPS] 100 | 101 | def start(self, name, attrs, nsmap={}): 102 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 103 | attrs = dict(attrs) 104 | nsprefix = None 105 | # Invert each namespace map as it comes in. 106 | if len(self.nsmaps) > 1: 107 | # There are no new namespaces for this tag, but 108 | # non-default namespaces are in play, so we need a 109 | # separate tag stack to know when they end. 110 | self.nsmaps.append(None) 111 | elif len(nsmap) > 0: 112 | # A new namespace mapping has come into play. 
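# For example, an incoming nsmap of {'og': 'http://ogp.me/ns#'} is inverted to
# {'http://ogp.me/ns#': 'og'} so that a prefix can later be looked up by
# namespace URI in _prefix_for_namespace().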
113 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 114 | self.nsmaps.append(inverted_nsmap) 115 | # Also treat the namespace mapping as a set of attributes on the 116 | # tag, so we can recreate it later. 117 | attrs = attrs.copy() 118 | for prefix, namespace in nsmap.items(): 119 | attribute = NamespacedAttribute( 120 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 121 | attrs[attribute] = namespace 122 | 123 | # Namespaces are in play. Find any attributes that came in 124 | # from lxml with namespaces attached to their names, and 125 | # turn then into NamespacedAttribute objects. 126 | new_attrs = {} 127 | for attr, value in attrs.items(): 128 | namespace, attr = self._getNsTag(attr) 129 | if namespace is None: 130 | new_attrs[attr] = value 131 | else: 132 | nsprefix = self._prefix_for_namespace(namespace) 133 | attr = NamespacedAttribute(nsprefix, attr, namespace) 134 | new_attrs[attr] = value 135 | attrs = new_attrs 136 | 137 | namespace, name = self._getNsTag(name) 138 | nsprefix = self._prefix_for_namespace(namespace) 139 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 140 | 141 | def _prefix_for_namespace(self, namespace): 142 | """Find the currently active prefix for the given namespace.""" 143 | if namespace is None: 144 | return None 145 | for inverted_nsmap in reversed(self.nsmaps): 146 | if inverted_nsmap is not None and namespace in inverted_nsmap: 147 | return inverted_nsmap[namespace] 148 | return None 149 | 150 | def end(self, name): 151 | self.soup.endData() 152 | completed_tag = self.soup.tagStack[-1] 153 | namespace, name = self._getNsTag(name) 154 | nsprefix = None 155 | if namespace is not None: 156 | for inverted_nsmap in reversed(self.nsmaps): 157 | if inverted_nsmap is not None and namespace in inverted_nsmap: 158 | nsprefix = inverted_nsmap[namespace] 159 | break 160 | self.soup.handle_endtag(name, nsprefix) 161 | if len(self.nsmaps) > 1: 162 | # This tag, or one of its parents, introduced a namespace 163 | # mapping, so pop it off the stack. 164 | self.nsmaps.pop() 165 | 166 | def pi(self, target, data): 167 | pass 168 | 169 | def data(self, content): 170 | self.soup.handle_data(content) 171 | 172 | def doctype(self, name, pubid, system): 173 | self.soup.endData() 174 | doctype = Doctype.for_name_and_ids(name, pubid, system) 175 | self.soup.object_was_parsed(doctype) 176 | 177 | def comment(self, content): 178 | "Handle comments as Comment objects." 179 | self.soup.endData() 180 | self.soup.handle_data(content) 181 | self.soup.endData(Comment) 182 | 183 | def test_fragment_to_document(self, fragment): 184 | """See `TreeBuilder`.""" 185 | return u'\n%s' % fragment 186 | 187 | 188 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 189 | 190 | features = [LXML, HTML, FAST, PERMISSIVE] 191 | is_xml = False 192 | 193 | @property 194 | def default_parser(self): 195 | return etree.HTMLParser 196 | 197 | def feed(self, markup): 198 | self.parser.feed(markup) 199 | self.parser.close() 200 | 201 | def test_fragment_to_document(self, fragment): 202 | """See `TreeBuilder`.""" 203 | return u'%s' % fragment 204 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/dammit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Beautiful Soup bonus library: Unicode, Dammit 3 | 4 | This class forces XML data into a standard format (usually to UTF-8 or 5 | Unicode). 
It is heavily based on code from Mark Pilgrim's Universal 6 | Feed Parser. It does not rewrite the XML or HTML to reflect a new 7 | encoding; that's the tree builder's job. 8 | """ 9 | 10 | import sys 11 | 12 | if sys.version_info[0] < 3: 13 | import codecs 14 | from htmlentitydefs import codepoint2name 15 | import re 16 | import logging 17 | 18 | # Import a library to autodetect character encodings. 19 | chardet_type = None 20 | try: 21 | # First try the fast C implementation. 22 | # PyPI package: cchardet 23 | import cchardet 24 | def chardet_dammit(s): 25 | return cchardet.detect(s)['encoding'] 26 | except ImportError: 27 | try: 28 | # Fall back to the pure Python implementation 29 | # Debian package: python-chardet 30 | # PyPI package: chardet 31 | import chardet 32 | def chardet_dammit(s): 33 | return chardet.detect(s)['encoding'] 34 | #import chardet.constants 35 | #chardet.constants._debug = 1 36 | except ImportError: 37 | # No chardet available. 38 | def chardet_dammit(s): 39 | return None 40 | 41 | # Available from http://cjkpython.i18n.org/. 42 | try: 43 | import iconv_codec 44 | except ImportError: 45 | pass 46 | 47 | xml_encoding_re = re.compile( 48 | '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) 49 | html_meta_re = re.compile( 50 | '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) 51 | 52 | class EntitySubstitution(object): 53 | 54 | """Substitute XML or HTML entities for the corresponding characters.""" 55 | 56 | def _populate_class_variables(): 57 | lookup = {} 58 | reverse_lookup = {} 59 | characters_for_re = [] 60 | for codepoint, name in list(codepoint2name.items()): 61 | character = unichr(codepoint) 62 | if codepoint != 34: 63 | # There's no point in turning the quotation mark into 64 | # ", unless it happens within an attribute value, which 65 | # is handled elsewhere. 66 | characters_for_re.append(character) 67 | lookup[character] = name 68 | # But we do want to turn " into the quotation mark. 69 | reverse_lookup[name] = character 70 | re_definition = "[%s]" % "".join(characters_for_re) 71 | return lookup, reverse_lookup, re.compile(re_definition) 72 | (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, 73 | CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() 74 | 75 | CHARACTER_TO_XML_ENTITY = { 76 | "'": "apos", 77 | '"': "quot", 78 | "&": "amp", 79 | "<": "lt", 80 | ">": "gt", 81 | } 82 | 83 | BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" 84 | "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 85 | ")") 86 | 87 | AMPERSAND_OR_BRACKET = re.compile("([<>&])") 88 | 89 | @classmethod 90 | def _substitute_html_entity(cls, matchobj): 91 | entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) 92 | return "&%s;" % entity 93 | 94 | @classmethod 95 | def _substitute_xml_entity(cls, matchobj): 96 | """Used with a regular expression to substitute the 97 | appropriate XML entity for an XML special character.""" 98 | entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] 99 | return "&%s;" % entity 100 | 101 | @classmethod 102 | def quoted_attribute_value(self, value): 103 | """Make a value into a quoted XML attribute, possibly escaping it. 104 | 105 | Most strings will be quoted using double quotes. 106 | 107 | Bob's Bar -> "Bob's Bar" 108 | 109 | If a string contains double quotes, it will be quoted using 110 | single quotes. 111 | 112 | Welcome to "my bar" -> 'Welcome to "my bar"' 113 | 114 | If a string contains both single and double quotes, the 115 | double quotes will be escaped, and the string will be quoted 116 | using double quotes. 
117 | 118 | Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" 119 | """ 120 | quote_with = '"' 121 | if '"' in value: 122 | if "'" in value: 123 | # The string contains both single and double 124 | # quotes. Turn the double quotes into 125 | # entities. We quote the double quotes rather than 126 | # the single quotes because the entity name is 127 | # """ whether this is HTML or XML. If we 128 | # quoted the single quotes, we'd have to decide 129 | # between ' and &squot;. 130 | replace_with = """ 131 | value = value.replace('"', replace_with) 132 | else: 133 | # There are double quotes but no single quotes. 134 | # We can use single quotes to quote the attribute. 135 | quote_with = "'" 136 | return quote_with + value + quote_with 137 | 138 | @classmethod 139 | def substitute_xml(cls, value, make_quoted_attribute=False): 140 | """Substitute XML entities for special XML characters. 141 | 142 | :param value: A string to be substituted. The less-than sign 143 | will become <, the greater-than sign will become >, 144 | and any ampersands will become &. If you want ampersands 145 | that appear to be part of an entity definition to be left 146 | alone, use substitute_xml_containing_entities() instead. 147 | 148 | :param make_quoted_attribute: If True, then the string will be 149 | quoted, as befits an attribute value. 150 | """ 151 | # Escape angle brackets and ampersands. 152 | value = cls.AMPERSAND_OR_BRACKET.sub( 153 | cls._substitute_xml_entity, value) 154 | 155 | if make_quoted_attribute: 156 | value = cls.quoted_attribute_value(value) 157 | return value 158 | 159 | @classmethod 160 | def substitute_xml_containing_entities( 161 | cls, value, make_quoted_attribute=False): 162 | """Substitute XML entities for special XML characters. 163 | 164 | :param value: A string to be substituted. The less-than sign will 165 | become <, the greater-than sign will become >, and any 166 | ampersands that are not part of an entity defition will 167 | become &. 168 | 169 | :param make_quoted_attribute: If True, then the string will be 170 | quoted, as befits an attribute value. 171 | """ 172 | # Escape angle brackets, and ampersands that aren't part of 173 | # entities. 174 | value = cls.BARE_AMPERSAND_OR_BRACKET.sub( 175 | cls._substitute_xml_entity, value) 176 | 177 | if make_quoted_attribute: 178 | value = cls.quoted_attribute_value(value) 179 | return value 180 | 181 | 182 | @classmethod 183 | def substitute_html(cls, s): 184 | """Replace certain Unicode characters with named HTML entities. 185 | 186 | This differs from data.encode(encoding, 'xmlcharrefreplace') 187 | in that the goal is to make the result more readable (to those 188 | with ASCII displays) rather than to recover from 189 | errors. There's absolutely nothing wrong with a UTF-8 string 190 | containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that 191 | character with "é" will make it more readable to some 192 | people. 193 | """ 194 | return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( 195 | cls._substitute_html_entity, s) 196 | 197 | 198 | class UnicodeDammit: 199 | """A class for detecting the encoding of a *ML document and 200 | converting it to a Unicode string. If the source encoding is 201 | windows-1252, can replace MS smart quotes with their HTML or XML 202 | equivalents.""" 203 | 204 | # This dictionary maps commonly seen values for "charset" in HTML 205 | # meta tags to the corresponding Python codec names. It only covers 206 | # values that aren't in Python's aliases and can't be determined 207 | # by the heuristics in find_codec. 
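# For example, a page whose meta tag declares charset="macintosh" is decoded
# with Python's "mac-roman" codec.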
208 | CHARSET_ALIASES = {"macintosh": "mac-roman", 209 | "x-sjis": "shift-jis"} 210 | 211 | ENCODINGS_WITH_SMART_QUOTES = [ 212 | "windows-1252", 213 | "iso-8859-1", 214 | "iso-8859-2", 215 | ] 216 | 217 | def __init__(self, markup, override_encodings=[], 218 | smart_quotes_to=None, is_html=False): 219 | self.declared_html_encoding = None 220 | self.smart_quotes_to = smart_quotes_to 221 | self.tried_encodings = [] 222 | self.contains_replacement_characters = False 223 | 224 | if markup == '' or isinstance(markup, unicode): 225 | self.markup = markup 226 | self.unicode_markup = unicode(markup) 227 | self.original_encoding = None 228 | return 229 | 230 | new_markup, document_encoding, sniffed_encoding = \ 231 | self._detectEncoding(markup, is_html) 232 | self.markup = new_markup 233 | 234 | u = None 235 | if new_markup != markup: 236 | # _detectEncoding modified the markup, then converted it to 237 | # Unicode and then to UTF-8. So convert it from UTF-8. 238 | u = self._convert_from("utf8") 239 | self.original_encoding = sniffed_encoding 240 | 241 | if not u: 242 | for proposed_encoding in ( 243 | override_encodings + [document_encoding, sniffed_encoding]): 244 | if proposed_encoding is not None: 245 | u = self._convert_from(proposed_encoding) 246 | if u: 247 | break 248 | 249 | # If no luck and we have auto-detection library, try that: 250 | if not u and not isinstance(self.markup, unicode): 251 | u = self._convert_from(chardet_dammit(self.markup)) 252 | 253 | # As a last resort, try utf-8 and windows-1252: 254 | if not u: 255 | for proposed_encoding in ("utf-8", "windows-1252"): 256 | u = self._convert_from(proposed_encoding) 257 | if u: 258 | break 259 | 260 | # As an absolute last resort, try the encodings again with 261 | # character replacement. 262 | if not u: 263 | for proposed_encoding in ( 264 | override_encodings + [ 265 | document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): 266 | if proposed_encoding != "ascii": 267 | u = self._convert_from(proposed_encoding, "replace") 268 | if u is not None: 269 | logging.warning( 270 | "Some characters could not be decoded, and were " 271 | "replaced with REPLACEMENT CHARACTER.") 272 | self.contains_replacement_characters = True 273 | break 274 | 275 | # We could at this point force it to ASCII, but that would 276 | # destroy so much data that I think giving up is better 277 | self.unicode_markup = u 278 | if not u: 279 | self.original_encoding = None 280 | 281 | def _sub_ms_char(self, match): 282 | """Changes a MS smart quote character to an XML or HTML 283 | entity, or an ASCII character.""" 284 | orig = match.group(1) 285 | if self.smart_quotes_to == 'ascii': 286 | sub = self.MS_CHARS_TO_ASCII.get(orig).encode() 287 | else: 288 | sub = self.MS_CHARS.get(orig) 289 | if type(sub) == tuple: 290 | if self.smart_quotes_to == 'xml': 291 | sub = '&#x'.encode() + sub[1].encode() + ';'.encode() 292 | else: 293 | sub = '&'.encode() + sub[0].encode() + ';'.encode() 294 | else: 295 | sub = sub.encode() 296 | return sub 297 | 298 | def _convert_from(self, proposed, errors="strict"): 299 | proposed = self.find_codec(proposed) 300 | if not proposed or (proposed, errors) in self.tried_encodings: 301 | return None 302 | self.tried_encodings.append((proposed, errors)) 303 | markup = self.markup 304 | # Convert smart quotes to HTML if coming from an encoding 305 | # that might have them. 
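# (This only happens when the caller passed smart_quotes_to; it defaults to
# None, so bytes in the 0x80-0x9F range are normally handed to the codec
# untouched.)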
306 | if (self.smart_quotes_to is not None 307 | and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): 308 | smart_quotes_re = b"([\x80-\x9f])" 309 | smart_quotes_compiled = re.compile(smart_quotes_re) 310 | markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) 311 | 312 | try: 313 | #print "Trying to convert document to %s (errors=%s)" % ( 314 | # proposed, errors) 315 | u = self._to_unicode(markup, proposed, errors) 316 | self.markup = u 317 | self.original_encoding = proposed 318 | except Exception as e: 319 | #print "That didn't work!" 320 | #print e 321 | return None 322 | #print "Correct encoding: %s" % proposed 323 | return self.markup 324 | 325 | def _to_unicode(self, data, encoding, errors="strict"): 326 | '''Given a string and its encoding, decodes the string into Unicode. 327 | %encoding is a string recognized by encodings.aliases''' 328 | 329 | # strip Byte Order Mark (if present) 330 | if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ 331 | and (data[2:4] != '\x00\x00'): 332 | encoding = 'utf-16be' 333 | data = data[2:] 334 | elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ 335 | and (data[2:4] != '\x00\x00'): 336 | encoding = 'utf-16le' 337 | data = data[2:] 338 | elif data[:3] == '\xef\xbb\xbf': 339 | encoding = 'utf-8' 340 | data = data[3:] 341 | elif data[:4] == '\x00\x00\xfe\xff': 342 | encoding = 'utf-32be' 343 | data = data[4:] 344 | elif data[:4] == '\xff\xfe\x00\x00': 345 | encoding = 'utf-32le' 346 | data = data[4:] 347 | newdata = unicode(data, encoding, errors) 348 | return newdata 349 | 350 | def _detectEncoding(self, xml_data, is_html=False): 351 | """Given a document, tries to detect its XML encoding.""" 352 | xml_encoding = sniffed_xml_encoding = None 353 | try: 354 | if xml_data[:4] == b'\x4c\x6f\xa7\x94': 355 | # EBCDIC 356 | xml_data = self._ebcdic_to_ascii(xml_data) 357 | elif xml_data[:4] == b'\x00\x3c\x00\x3f': 358 | # UTF-16BE 359 | sniffed_xml_encoding = 'utf-16be' 360 | xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') 361 | elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ 362 | and (xml_data[2:4] != b'\x00\x00'): 363 | # UTF-16BE with BOM 364 | sniffed_xml_encoding = 'utf-16be' 365 | xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') 366 | elif xml_data[:4] == b'\x3c\x00\x3f\x00': 367 | # UTF-16LE 368 | sniffed_xml_encoding = 'utf-16le' 369 | xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') 370 | elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ 371 | (xml_data[2:4] != b'\x00\x00'): 372 | # UTF-16LE with BOM 373 | sniffed_xml_encoding = 'utf-16le' 374 | xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') 375 | elif xml_data[:4] == b'\x00\x00\x00\x3c': 376 | # UTF-32BE 377 | sniffed_xml_encoding = 'utf-32be' 378 | xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') 379 | elif xml_data[:4] == b'\x3c\x00\x00\x00': 380 | # UTF-32LE 381 | sniffed_xml_encoding = 'utf-32le' 382 | xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') 383 | elif xml_data[:4] == b'\x00\x00\xfe\xff': 384 | # UTF-32BE with BOM 385 | sniffed_xml_encoding = 'utf-32be' 386 | xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') 387 | elif xml_data[:4] == b'\xff\xfe\x00\x00': 388 | # UTF-32LE with BOM 389 | sniffed_xml_encoding = 'utf-32le' 390 | xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') 391 | elif xml_data[:3] == b'\xef\xbb\xbf': 392 | # UTF-8 with BOM 393 | sniffed_xml_encoding = 'utf-8' 394 | xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') 395 | else: 396 | 
sniffed_xml_encoding = 'ascii' 397 | pass 398 | except: 399 | xml_encoding_match = None 400 | xml_encoding_match = xml_encoding_re.match(xml_data) 401 | if not xml_encoding_match and is_html: 402 | xml_encoding_match = html_meta_re.search(xml_data) 403 | if xml_encoding_match is not None: 404 | xml_encoding = xml_encoding_match.groups()[0].decode( 405 | 'ascii').lower() 406 | if is_html: 407 | self.declared_html_encoding = xml_encoding 408 | if sniffed_xml_encoding and \ 409 | (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 410 | 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 411 | 'utf-16', 'utf-32', 'utf_16', 'utf_32', 412 | 'utf16', 'u16')): 413 | xml_encoding = sniffed_xml_encoding 414 | return xml_data, xml_encoding, sniffed_xml_encoding 415 | 416 | def find_codec(self, charset): 417 | return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ 418 | or (charset and self._codec(charset.replace("-", ""))) \ 419 | or (charset and self._codec(charset.replace("-", "_"))) \ 420 | or charset 421 | 422 | def _codec(self, charset): 423 | if not charset: 424 | return charset 425 | codec = None 426 | try: 427 | codecs.lookup(charset) 428 | codec = charset 429 | except (LookupError, ValueError): 430 | pass 431 | return codec 432 | 433 | EBCDIC_TO_ASCII_MAP = None 434 | 435 | def _ebcdic_to_ascii(self, s): 436 | c = self.__class__ 437 | if not c.EBCDIC_TO_ASCII_MAP: 438 | emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, 439 | 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, 440 | 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, 441 | 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, 442 | 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, 443 | 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, 444 | 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, 445 | 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, 446 | 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, 447 | 201,202,106,107,108,109,110,111,112,113,114,203,204,205, 448 | 206,207,208,209,126,115,116,117,118,119,120,121,122,210, 449 | 211,212,213,214,215,216,217,218,219,220,221,222,223,224, 450 | 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, 451 | 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, 452 | 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, 453 | 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, 454 | 250,251,252,253,254,255) 455 | import string 456 | c.EBCDIC_TO_ASCII_MAP = string.maketrans( 457 | ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) 458 | return s.translate(c.EBCDIC_TO_ASCII_MAP) 459 | 460 | # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. 
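# For example, the byte b'\x93' maps to ('ldquo', '201C'): with
# smart_quotes_to='html' it is rewritten as "&ldquo;", and with
# smart_quotes_to='xml' as "&#x201C;".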
461 | MS_CHARS = {b'\x80': ('euro', '20AC'), 462 | b'\x81': ' ', 463 | b'\x82': ('sbquo', '201A'), 464 | b'\x83': ('fnof', '192'), 465 | b'\x84': ('bdquo', '201E'), 466 | b'\x85': ('hellip', '2026'), 467 | b'\x86': ('dagger', '2020'), 468 | b'\x87': ('Dagger', '2021'), 469 | b'\x88': ('circ', '2C6'), 470 | b'\x89': ('permil', '2030'), 471 | b'\x8A': ('Scaron', '160'), 472 | b'\x8B': ('lsaquo', '2039'), 473 | b'\x8C': ('OElig', '152'), 474 | b'\x8D': '?', 475 | b'\x8E': ('#x17D', '17D'), 476 | b'\x8F': '?', 477 | b'\x90': '?', 478 | b'\x91': ('lsquo', '2018'), 479 | b'\x92': ('rsquo', '2019'), 480 | b'\x93': ('ldquo', '201C'), 481 | b'\x94': ('rdquo', '201D'), 482 | b'\x95': ('bull', '2022'), 483 | b'\x96': ('ndash', '2013'), 484 | b'\x97': ('mdash', '2014'), 485 | b'\x98': ('tilde', '2DC'), 486 | b'\x99': ('trade', '2122'), 487 | b'\x9a': ('scaron', '161'), 488 | b'\x9b': ('rsaquo', '203A'), 489 | b'\x9c': ('oelig', '153'), 490 | b'\x9d': '?', 491 | b'\x9e': ('#x17E', '17E'), 492 | b'\x9f': ('Yuml', ''),} 493 | 494 | # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains 495 | # horrors like stripping diacritical marks to turn á into a, but also 496 | # contains non-horrors like turning “ into ". 497 | MS_CHARS_TO_ASCII = { 498 | b'\x80' : 'EUR', 499 | b'\x81' : ' ', 500 | b'\x82' : ',', 501 | b'\x83' : 'f', 502 | b'\x84' : ',,', 503 | b'\x85' : '...', 504 | b'\x86' : '+', 505 | b'\x87' : '++', 506 | b'\x88' : '^', 507 | b'\x89' : '%', 508 | b'\x8a' : 'S', 509 | b'\x8b' : '<', 510 | b'\x8c' : 'OE', 511 | b'\x8d' : '?', 512 | b'\x8e' : 'Z', 513 | b'\x8f' : '?', 514 | b'\x90' : '?', 515 | b'\x91' : "'", 516 | b'\x92' : "'", 517 | b'\x93' : '"', 518 | b'\x94' : '"', 519 | b'\x95' : '*', 520 | b'\x96' : '-', 521 | b'\x97' : '--', 522 | b'\x98' : '~', 523 | b'\x99' : '(TM)', 524 | b'\x9a' : 's', 525 | b'\x9b' : '>', 526 | b'\x9c' : 'oe', 527 | b'\x9d' : '?', 528 | b'\x9e' : 'z', 529 | b'\x9f' : 'Y', 530 | b'\xa0' : ' ', 531 | b'\xa1' : '!', 532 | b'\xa2' : 'c', 533 | b'\xa3' : 'GBP', 534 | b'\xa4' : '$', #This approximation is especially parochial--this is the 535 | #generic currency symbol. 
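# The remaining entries cover the printable ISO-Latin-1 range: currency and
# punctuation signs get rough ASCII stand-ins, and accented letters are
# reduced to their unaccented base letters.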
536 | b'\xa5' : 'YEN', 537 | b'\xa6' : '|', 538 | b'\xa7' : 'S', 539 | b'\xa8' : '..', 540 | b'\xa9' : '', 541 | b'\xaa' : '(th)', 542 | b'\xab' : '<<', 543 | b'\xac' : '!', 544 | b'\xad' : ' ', 545 | b'\xae' : '(R)', 546 | b'\xaf' : '-', 547 | b'\xb0' : 'o', 548 | b'\xb1' : '+-', 549 | b'\xb2' : '2', 550 | b'\xb3' : '3', 551 | b'\xb4' : ("'", 'acute'), 552 | b'\xb5' : 'u', 553 | b'\xb6' : 'P', 554 | b'\xb7' : '*', 555 | b'\xb8' : ',', 556 | b'\xb9' : '1', 557 | b'\xba' : '(th)', 558 | b'\xbb' : '>>', 559 | b'\xbc' : '1/4', 560 | b'\xbd' : '1/2', 561 | b'\xbe' : '3/4', 562 | b'\xbf' : '?', 563 | b'\xc0' : 'A', 564 | b'\xc1' : 'A', 565 | b'\xc2' : 'A', 566 | b'\xc3' : 'A', 567 | b'\xc4' : 'A', 568 | b'\xc5' : 'A', 569 | b'\xc6' : 'AE', 570 | b'\xc7' : 'C', 571 | b'\xc8' : 'E', 572 | b'\xc9' : 'E', 573 | b'\xca' : 'E', 574 | b'\xcb' : 'E', 575 | b'\xcc' : 'I', 576 | b'\xcd' : 'I', 577 | b'\xce' : 'I', 578 | b'\xcf' : 'I', 579 | b'\xd0' : 'D', 580 | b'\xd1' : 'N', 581 | b'\xd2' : 'O', 582 | b'\xd3' : 'O', 583 | b'\xd4' : 'O', 584 | b'\xd5' : 'O', 585 | b'\xd6' : 'O', 586 | b'\xd7' : '*', 587 | b'\xd8' : 'O', 588 | b'\xd9' : 'U', 589 | b'\xda' : 'U', 590 | b'\xdb' : 'U', 591 | b'\xdc' : 'U', 592 | b'\xdd' : 'Y', 593 | b'\xde' : 'b', 594 | b'\xdf' : 'B', 595 | b'\xe0' : 'a', 596 | b'\xe1' : 'a', 597 | b'\xe2' : 'a', 598 | b'\xe3' : 'a', 599 | b'\xe4' : 'a', 600 | b'\xe5' : 'a', 601 | b'\xe6' : 'ae', 602 | b'\xe7' : 'c', 603 | b'\xe8' : 'e', 604 | b'\xe9' : 'e', 605 | b'\xea' : 'e', 606 | b'\xeb' : 'e', 607 | b'\xec' : 'i', 608 | b'\xed' : 'i', 609 | b'\xee' : 'i', 610 | b'\xef' : 'i', 611 | b'\xf0' : 'o', 612 | b'\xf1' : 'n', 613 | b'\xf2' : 'o', 614 | b'\xf3' : 'o', 615 | b'\xf4' : 'o', 616 | b'\xf5' : 'o', 617 | b'\xf6' : 'o', 618 | b'\xf7' : '/', 619 | b'\xf8' : 'o', 620 | b'\xf9' : 'u', 621 | b'\xfa' : 'u', 622 | b'\xfb' : 'u', 623 | b'\xfc' : 'u', 624 | b'\xfd' : 'y', 625 | b'\xfe' : 'b', 626 | b'\xff' : 'y', 627 | } 628 | 629 | # A map used when removing rogue Windows-1252/ISO-8859-1 630 | # characters in otherwise UTF-8 documents. 631 | # 632 | # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in 633 | # Windows-1252. 
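# Keys are raw byte values (ints); values are the UTF-8 encodings of the
# corresponding characters, so detwingle() below can splice a replacement
# directly into an otherwise valid UTF-8 byte string.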
634 | WINDOWS_1252_TO_UTF8 = { 635 | 0x80 : b'\xe2\x82\xac', # € 636 | 0x82 : b'\xe2\x80\x9a', # ‚ 637 | 0x83 : b'\xc6\x92', # ƒ 638 | 0x84 : b'\xe2\x80\x9e', # „ 639 | 0x85 : b'\xe2\x80\xa6', # … 640 | 0x86 : b'\xe2\x80\xa0', # † 641 | 0x87 : b'\xe2\x80\xa1', # ‡ 642 | 0x88 : b'\xcb\x86', # ˆ 643 | 0x89 : b'\xe2\x80\xb0', # ‰ 644 | 0x8a : b'\xc5\xa0', # Š 645 | 0x8b : b'\xe2\x80\xb9', # ‹ 646 | 0x8c : b'\xc5\x92', # Œ 647 | 0x8e : b'\xc5\xbd', # Ž 648 | 0x91 : b'\xe2\x80\x98', # ‘ 649 | 0x92 : b'\xe2\x80\x99', # ’ 650 | 0x93 : b'\xe2\x80\x9c', # “ 651 | 0x94 : b'\xe2\x80\x9d', # ” 652 | 0x95 : b'\xe2\x80\xa2', # • 653 | 0x96 : b'\xe2\x80\x93', # – 654 | 0x97 : b'\xe2\x80\x94', # — 655 | 0x98 : b'\xcb\x9c', # ˜ 656 | 0x99 : b'\xe2\x84\xa2', # ™ 657 | 0x9a : b'\xc5\xa1', # š 658 | 0x9b : b'\xe2\x80\xba', # › 659 | 0x9c : b'\xc5\x93', # œ 660 | 0x9e : b'\xc5\xbe', # ž 661 | 0x9f : b'\xc5\xb8', # Ÿ 662 | 0xa0 : b'\xc2\xa0', #   663 | 0xa1 : b'\xc2\xa1', # ¡ 664 | 0xa2 : b'\xc2\xa2', # ¢ 665 | 0xa3 : b'\xc2\xa3', # £ 666 | 0xa4 : b'\xc2\xa4', # ¤ 667 | 0xa5 : b'\xc2\xa5', # ¥ 668 | 0xa6 : b'\xc2\xa6', # ¦ 669 | 0xa7 : b'\xc2\xa7', # § 670 | 0xa8 : b'\xc2\xa8', # ¨ 671 | 0xa9 : b'\xc2\xa9', # © 672 | 0xaa : b'\xc2\xaa', # ª 673 | 0xab : b'\xc2\xab', # « 674 | 0xac : b'\xc2\xac', # ¬ 675 | 0xad : b'\xc2\xad', # ­ 676 | 0xae : b'\xc2\xae', # ® 677 | 0xaf : b'\xc2\xaf', # ¯ 678 | 0xb0 : b'\xc2\xb0', # ° 679 | 0xb1 : b'\xc2\xb1', # ± 680 | 0xb2 : b'\xc2\xb2', # ² 681 | 0xb3 : b'\xc2\xb3', # ³ 682 | 0xb4 : b'\xc2\xb4', # ´ 683 | 0xb5 : b'\xc2\xb5', # µ 684 | 0xb6 : b'\xc2\xb6', # ¶ 685 | 0xb7 : b'\xc2\xb7', # · 686 | 0xb8 : b'\xc2\xb8', # ¸ 687 | 0xb9 : b'\xc2\xb9', # ¹ 688 | 0xba : b'\xc2\xba', # º 689 | 0xbb : b'\xc2\xbb', # » 690 | 0xbc : b'\xc2\xbc', # ¼ 691 | 0xbd : b'\xc2\xbd', # ½ 692 | 0xbe : b'\xc2\xbe', # ¾ 693 | 0xbf : b'\xc2\xbf', # ¿ 694 | 0xc0 : b'\xc3\x80', # À 695 | 0xc1 : b'\xc3\x81', # Á 696 | 0xc2 : b'\xc3\x82', #  697 | 0xc3 : b'\xc3\x83', # à 698 | 0xc4 : b'\xc3\x84', # Ä 699 | 0xc5 : b'\xc3\x85', # Å 700 | 0xc6 : b'\xc3\x86', # Æ 701 | 0xc7 : b'\xc3\x87', # Ç 702 | 0xc8 : b'\xc3\x88', # È 703 | 0xc9 : b'\xc3\x89', # É 704 | 0xca : b'\xc3\x8a', # Ê 705 | 0xcb : b'\xc3\x8b', # Ë 706 | 0xcc : b'\xc3\x8c', # Ì 707 | 0xcd : b'\xc3\x8d', # Í 708 | 0xce : b'\xc3\x8e', # Î 709 | 0xcf : b'\xc3\x8f', # Ï 710 | 0xd0 : b'\xc3\x90', # Ð 711 | 0xd1 : b'\xc3\x91', # Ñ 712 | 0xd2 : b'\xc3\x92', # Ò 713 | 0xd3 : b'\xc3\x93', # Ó 714 | 0xd4 : b'\xc3\x94', # Ô 715 | 0xd5 : b'\xc3\x95', # Õ 716 | 0xd6 : b'\xc3\x96', # Ö 717 | 0xd7 : b'\xc3\x97', # × 718 | 0xd8 : b'\xc3\x98', # Ø 719 | 0xd9 : b'\xc3\x99', # Ù 720 | 0xda : b'\xc3\x9a', # Ú 721 | 0xdb : b'\xc3\x9b', # Û 722 | 0xdc : b'\xc3\x9c', # Ü 723 | 0xdd : b'\xc3\x9d', # Ý 724 | 0xde : b'\xc3\x9e', # Þ 725 | 0xdf : b'\xc3\x9f', # ß 726 | 0xe0 : b'\xc3\xa0', # à 727 | 0xe1 : b'\xa1', # á 728 | 0xe2 : b'\xc3\xa2', # â 729 | 0xe3 : b'\xc3\xa3', # ã 730 | 0xe4 : b'\xc3\xa4', # ä 731 | 0xe5 : b'\xc3\xa5', # å 732 | 0xe6 : b'\xc3\xa6', # æ 733 | 0xe7 : b'\xc3\xa7', # ç 734 | 0xe8 : b'\xc3\xa8', # è 735 | 0xe9 : b'\xc3\xa9', # é 736 | 0xea : b'\xc3\xaa', # ê 737 | 0xeb : b'\xc3\xab', # ë 738 | 0xec : b'\xc3\xac', # ì 739 | 0xed : b'\xc3\xad', # í 740 | 0xee : b'\xc3\xae', # î 741 | 0xef : b'\xc3\xaf', # ï 742 | 0xf0 : b'\xc3\xb0', # ð 743 | 0xf1 : b'\xc3\xb1', # ñ 744 | 0xf2 : b'\xc3\xb2', # ò 745 | 0xf3 : b'\xc3\xb3', # ó 746 | 0xf4 : b'\xc3\xb4', # ô 747 | 0xf5 : b'\xc3\xb5', # õ 748 | 0xf6 : b'\xc3\xb6', # ö 749 | 0xf7 : b'\xc3\xb7', # ÷ 750 | 0xf8 
: b'\xc3\xb8', # ø 751 | 0xf9 : b'\xc3\xb9', # ù 752 | 0xfa : b'\xc3\xba', # ú 753 | 0xfb : b'\xc3\xbb', # û 754 | 0xfc : b'\xc3\xbc', # ü 755 | 0xfd : b'\xc3\xbd', # ý 756 | 0xfe : b'\xc3\xbe', # þ 757 | } 758 | 759 | MULTIBYTE_MARKERS_AND_SIZES = [ 760 | (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF 761 | (0xe0, 0xef, 3), # 3-byte characters start with E0-EF 762 | (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 763 | ] 764 | 765 | FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] 766 | LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] 767 | 768 | @classmethod 769 | def detwingle(cls, in_bytes, main_encoding="utf8", 770 | embedded_encoding="windows-1252"): 771 | """Fix characters from one encoding embedded in some other encoding. 772 | 773 | Currently the only situation supported is Windows-1252 (or its 774 | subset ISO-8859-1), embedded in UTF-8. 775 | 776 | The input must be a bytestring. If you've already converted 777 | the document to Unicode, you're too late. 778 | 779 | The output is a bytestring in which `embedded_encoding` 780 | characters have been converted to their `main_encoding` 781 | equivalents. 782 | """ 783 | if embedded_encoding.replace('_', '-').lower() not in ( 784 | 'windows-1252', 'windows_1252'): 785 | raise NotImplementedError( 786 | "Windows-1252 and ISO-8859-1 are the only currently supported " 787 | "embedded encodings.") 788 | 789 | if main_encoding.lower() not in ('utf8', 'utf-8'): 790 | raise NotImplementedError( 791 | "UTF-8 is the only currently supported main encoding.") 792 | 793 | byte_chunks = [] 794 | 795 | chunk_start = 0 796 | pos = 0 797 | while pos < len(in_bytes): 798 | byte = in_bytes[pos] 799 | if not isinstance(byte, int): 800 | # Python 2.x 801 | byte = ord(byte) 802 | if (byte >= cls.FIRST_MULTIBYTE_MARKER 803 | and byte <= cls.LAST_MULTIBYTE_MARKER): 804 | # This is the start of a UTF-8 multibyte character. Skip 805 | # to the end. 806 | for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: 807 | if byte >= start and byte <= end: 808 | pos += size 809 | break 810 | elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: 811 | # We found a Windows-1252 character! 812 | # Save the string up to this point as a chunk. 813 | byte_chunks.append(in_bytes[chunk_start:pos]) 814 | 815 | # Now translate the Windows-1252 character into UTF-8 816 | # and add it as another, one-byte chunk. 817 | byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) 818 | pos += 1 819 | chunk_start = pos 820 | else: 821 | # Go on to the next character. 822 | pos += 1 823 | if chunk_start == 0: 824 | # The string is unchanged. 825 | return in_bytes 826 | else: 827 | # Store the final chunk. 
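# (everything after the last Windows-1252 byte that was replaced).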
828 | byte_chunks.append(in_bytes[chunk_start:]) 829 | return b''.join(byte_chunks) 830 | 831 | -------------------------------------------------------------------------------- /pylinkvalidator/included/bs4/diagnose.py: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | 3 | import sys 4 | 5 | if sys.version_info[0] < 3: 6 | 7 | from StringIO import StringIO 8 | from HTMLParser import HTMLParser 9 | from pylinkvalidator.included.bs4 import BeautifulSoup, __version__ 10 | from pylinkvalidator.included.bs4.builder import builder_registry 11 | import os 12 | import random 13 | import time 14 | import traceback 15 | import sys 16 | import cProfile 17 | 18 | def diagnose(data): 19 | """Diagnostic suite for isolating common problems.""" 20 | print "Diagnostic running on Beautiful Soup %s" % __version__ 21 | print "Python version %s" % sys.version 22 | 23 | basic_parsers = ["html.parser", "html5lib", "lxml"] 24 | for name in basic_parsers: 25 | for builder in builder_registry.builders: 26 | if name in builder.features: 27 | break 28 | else: 29 | basic_parsers.remove(name) 30 | print ( 31 | "I noticed that %s is not installed. Installing it may help." % 32 | name) 33 | 34 | if 'lxml' in basic_parsers: 35 | basic_parsers.append(["lxml", "xml"]) 36 | from lxml import etree 37 | print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) 38 | 39 | if 'html5lib' in basic_parsers: 40 | import html5lib 41 | print "Found html5lib version %s" % html5lib.__version__ 42 | 43 | if hasattr(data, 'read'): 44 | data = data.read() 45 | elif os.path.exists(data): 46 | print '"%s" looks like a filename. Reading data from the file.' % data 47 | data = open(data).read() 48 | elif data.startswith("http:") or data.startswith("https:"): 49 | print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data 50 | print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." 51 | return 52 | print 53 | 54 | for parser in basic_parsers: 55 | print "Trying to parse your markup with %s" % parser 56 | success = False 57 | try: 58 | soup = BeautifulSoup(data, parser) 59 | success = True 60 | except Exception, e: 61 | print "%s could not parse the markup." % parser 62 | traceback.print_exc() 63 | if success: 64 | print "Here's what %s did with the markup:" % parser 65 | print soup.prettify() 66 | 67 | print "-" * 80 68 | 69 | def lxml_trace(data, html=True): 70 | """Print out the lxml events that occur during parsing. 71 | 72 | This lets you see how lxml parses a document when no Beautiful 73 | Soup code is running. 
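    For example, lxml_trace("<p>Hello</p>") prints one line per parse event
    (by default lxml reports an 'end' event for each element), showing the
    event name, the element tag, and the element text.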
74 | """ 75 | from lxml import etree 76 | for event, element in etree.iterparse(StringIO(data), html=html): 77 | print("%s, %4s, %s" % (event, element.tag, element.text)) 78 | 79 | class AnnouncingParser(HTMLParser): 80 | """Announces HTMLParser parse events, without doing anything else.""" 81 | 82 | def _p(self, s): 83 | print(s) 84 | 85 | def handle_starttag(self, name, attrs): 86 | self._p("%s START" % name) 87 | 88 | def handle_endtag(self, name): 89 | self._p("%s END" % name) 90 | 91 | def handle_data(self, data): 92 | self._p("%s DATA" % data) 93 | 94 | def handle_charref(self, name): 95 | self._p("%s CHARREF" % name) 96 | 97 | def handle_entityref(self, name): 98 | self._p("%s ENTITYREF" % name) 99 | 100 | def handle_comment(self, data): 101 | self._p("%s COMMENT" % data) 102 | 103 | def handle_decl(self, data): 104 | self._p("%s DECL" % data) 105 | 106 | def unknown_decl(self, data): 107 | self._p("%s UNKNOWN-DECL" % data) 108 | 109 | def handle_pi(self, data): 110 | self._p("%s PI" % data) 111 | 112 | def htmlparser_trace(data): 113 | """Print out the HTMLParser events that occur during parsing. 114 | 115 | This lets you see how HTMLParser parses a document when no 116 | Beautiful Soup code is running. 117 | """ 118 | parser = AnnouncingParser() 119 | parser.feed(data) 120 | 121 | _vowels = "aeiou" 122 | _consonants = "bcdfghjklmnpqrstvwxyz" 123 | 124 | def rword(length=5): 125 | "Generate a random word-like string." 126 | s = '' 127 | for i in range(length): 128 | if i % 2 == 0: 129 | t = _consonants 130 | else: 131 | t = _vowels 132 | s += random.choice(t) 133 | return s 134 | 135 | def rsentence(length=4): 136 | "Generate a random sentence-like string." 137 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 138 | 139 | def rdoc(num_elements=1000): 140 | """Randomly generate an invalid HTML document.""" 141 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 142 | elements = [] 143 | for i in range(num_elements): 144 | choice = random.randint(0,3) 145 | if choice == 0: 146 | # New tag. 147 | tag_name = random.choice(tag_names) 148 | elements.append("<%s>" % tag_name) 149 | elif choice == 1: 150 | elements.append(rsentence(random.randint(1,4))) 151 | elif choice == 2: 152 | # Close a tag. 153 | tag_name = random.choice(tag_names) 154 | elements.append("" % tag_name) 155 | return "" + "\n".join(elements) + "" 156 | 157 | def benchmark_parsers(num_elements=100000): 158 | """Very basic head-to-head performance benchmark.""" 159 | print "Comparative parser benchmark on Beautiful Soup %s" % __version__ 160 | data = rdoc(num_elements) 161 | print "Generated a large invalid HTML document (%d bytes)." % len(data) 162 | 163 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 164 | success = False 165 | try: 166 | a = time.time() 167 | soup = BeautifulSoup(data, parser) 168 | b = time.time() 169 | success = True 170 | except Exception, e: 171 | print "%s could not parse the markup." % parser 172 | traceback.print_exc() 173 | if success: 174 | print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) 175 | 176 | from lxml import etree 177 | a = time.time() 178 | etree.HTML(data) 179 | b = time.time() 180 | print "Raw lxml parsed the markup in %.2fs." 
% (b-a) 181 | 182 | if __name__ == '__main__': 183 | diagnose(sys.stdin.read()) 184 | -------------------------------------------------------------------------------- /pylinkvalidator/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Contains the crawling models. We use namedtuple for most models (easier to 4 | pickle, lower footprint, indicates that it is immutable) and we use classes for 5 | objects with mutable states and helper methods. 6 | 7 | Classes with crawling logic are declared in the crawler module. 8 | """ 9 | from __future__ import unicode_literals, absolute_import 10 | 11 | from collections import namedtuple, Mapping, defaultdict 12 | from optparse import OptionParser, OptionGroup 13 | import re 14 | 15 | from pylinkvalidator.included.bs4 import BeautifulSoup 16 | from pylinkvalidator.compat import get_safe_str 17 | from pylinkvalidator.urlutil import ( 18 | get_clean_url_split, get_absolute_url_split) 19 | 20 | 21 | PREFIX_ALL = "*" 22 | 23 | REGEX_CONTENT = "regex:" 24 | 25 | 26 | def namedtuple_with_defaults(typename, field_names, default_values=[]): 27 | """Creates a namedtuple with default values so they don't have to be 28 | provided for each argument. 29 | """ 30 | T = namedtuple(typename, field_names) 31 | 32 | # Set None everywhere 33 | T.__new__.__defaults__ = (None,) * len(T._fields) 34 | 35 | # Set provided default values 36 | if isinstance(default_values, Mapping): 37 | prototype = T(**default_values) 38 | else: 39 | prototype = T(*default_values) 40 | T.__new__.__defaults__ = tuple(prototype) 41 | 42 | # Return new type 43 | return T 44 | 45 | 46 | DEFAULT_TYPES = ['a', 'img', 'script', 'link'] 47 | 48 | 49 | TYPE_ATTRIBUTES = { 50 | 'a': 'href', 51 | 'img': 'src', 52 | 'script': 'src', 53 | 'link': 'href', 54 | } 55 | 56 | 57 | DEFAULT_TIMEOUT = 10 58 | 59 | 60 | MODE_THREAD = "thread" 61 | MODE_PROCESS = "process" 62 | MODE_GREEN = "green" 63 | 64 | 65 | DEFAULT_WORKERS = { 66 | MODE_THREAD: 1, 67 | MODE_PROCESS: 1, 68 | MODE_GREEN: 1000, 69 | } 70 | 71 | 72 | PARSER_STDLIB = "html.parser" 73 | PARSER_LXML = "lxml" 74 | PARSER_HTML5 = "html5lib" 75 | 76 | # TODO Add support for gumbo. Will require some refactoring of the parsing 77 | # logic. 78 | # PARSER_GUMBO = "gumbo" 79 | 80 | 81 | FORMAT_PLAIN = "plain" 82 | FORMAT_HTML = "html" 83 | FORMAT_JSON = "json" 84 | 85 | 86 | WHEN_ALWAYS = "always" 87 | WHEN_ON_ERROR = "error" 88 | 89 | 90 | REPORT_TYPE_ERRORS = "errors" 91 | REPORT_TYPE_SUMMARY = "summary" 92 | REPORT_TYPE_ALL = "all" 93 | 94 | 95 | VERBOSE_QUIET = "0" 96 | VERBOSE_NORMAL = "1" 97 | VERBOSE_INFO = "2" 98 | 99 | 100 | HTML_MIME_TYPE = "text/html" 101 | 102 | 103 | PAGE_QUEUED = '__PAGE_QUEUED__' 104 | PAGE_CRAWLED = '__PAGE_CRAWLED__' 105 | 106 | # Note: we use namedtuple to exchange data with workers because they are 107 | # immutable and easy to pickle (as opposed to a class). 
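# As a small illustration (hypothetical names, not part of the crawler), a
# type built with namedtuple_with_defaults() can be instantiated with only
# the fields that matter for a given message; the rest fall back to their
# defaults:
#
#     >>> Point = namedtuple_with_defaults("Point", ["x", "y"], {"y": 0})
#     >>> Point(x=1)
#     Point(x=1, y=0)
#     >>> Point()
#     Point(x=None, y=0)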
108 | 109 | WorkerInit = namedtuple_with_defaults( 110 | "WorkerInit", 111 | ["worker_config", "input_queue", "output_queue", "logger"]) 112 | 113 | 114 | WorkerConfig = namedtuple_with_defaults( 115 | "WorkerConfig", 116 | ["username", "password", "types", "timeout", "parser", "strict_mode", 117 | "prefer_server_encoding", "extra_headers", "ignore_bad_tel_urls", 118 | "allow_insecure_content"]) 119 | 120 | 121 | WorkerInput = namedtuple_with_defaults( 122 | "WorkerInput", 123 | ["url_split", "should_crawl", "depth", "site_origin", "content_check"]) 124 | 125 | 126 | Response = namedtuple_with_defaults( 127 | "Response", ["content", "status", "exception", "original_url", 128 | "final_url", "is_redirect", "is_timeout", "response_time"]) 129 | 130 | 131 | ExceptionStr = namedtuple_with_defaults( 132 | "ExceptionStr", ["type_name", "message"]) 133 | 134 | 135 | Link = namedtuple_with_defaults( 136 | "Link", 137 | ["type", "url_split", "original_url_split", "source_str"]) 138 | 139 | 140 | PageCrawl = namedtuple_with_defaults( 141 | "PageCrawl", ["original_url_split", "final_url_split", 142 | "status", "is_timeout", "is_redirect", "links", 143 | "exception", "is_html", "depth", "response_time", 144 | "process_time", "site_origin", "missing_content", 145 | "erroneous_content"]) 146 | 147 | 148 | PageStatus = namedtuple_with_defaults( 149 | "PageStatus", ["status", "sources"]) 150 | 151 | 152 | PageSource = namedtuple_with_defaults( 153 | "PageSource", ["origin", "origin_str"]) 154 | 155 | 156 | ContentCheck = namedtuple_with_defaults( 157 | "ContentCheck", 158 | ["html_presence", "html_absence", "text_presence", "text_absence", 159 | "has_something_to_check"]) 160 | 161 | HTMLCheck = namedtuple_with_defaults( 162 | "HTMLCheck", ["tag", "attrs", "content"]) 163 | 164 | 165 | class UTF8Class(object): 166 | """Handles unicode string from __unicode__() in: __str__() and __repr__() 167 | """ 168 | def __str__(self): 169 | return get_safe_str(self.__unicode__()) 170 | 171 | def __repr__(self): 172 | return get_safe_str(self.__unicode__()) 173 | 174 | 175 | class LazyLogParam(object): 176 | """Lazy Log Parameter that is only evaluated if the logging statement 177 | is printed""" 178 | 179 | def __init__(self, func): 180 | self.func = func 181 | 182 | def __str__(self): 183 | return str(self.func()) 184 | 185 | 186 | class Config(UTF8Class): 187 | """Contains all the configuration options.""" 188 | 189 | def __init__(self): 190 | # Design note: we only use attributes when options need to be 191 | # transformed. Otherwise, we use options. 192 | self.parser = self._build_parser() 193 | self.options = None 194 | self.start_urls = [] 195 | self.start_url_splits = [] 196 | self.worker_config = None 197 | 198 | self.accepted_hosts = [] 199 | """Set of accepted hosts. Dictionary of accepted hosts if in multi 200 | mode: key: start url host, value: set of accepted hosts.""" 201 | 202 | self.ignored_prefixes = [] 203 | self.worker_size = 0 204 | self.content_check = None 205 | 206 | def should_crawl(self, url_split, depth): 207 | """Returns True if url split is local AND depth is acceptable""" 208 | return (self.options.depth < 0 or depth < self.options.depth) and\ 209 | self.is_local(url_split) 210 | 211 | def is_local(self, url_split, site_origin=None): 212 | """Returns true if url split is in the accepted hosts. 
site_origin must 213 | be provided if multi sites mode is enabled.""" 214 | 215 | if self.options.multi and site_origin: 216 | accepted_hosts = self.accepted_hosts[site_origin] 217 | else: 218 | accepted_hosts = self.accepted_hosts 219 | 220 | return url_split.netloc in accepted_hosts 221 | 222 | def should_download(self, url_split): 223 | """Returns True if the url does not start with an ignored prefix and if 224 | it is local or outside links are allowed.""" 225 | local = self.is_local(url_split) 226 | 227 | if not self.options.test_outside and not local: 228 | return False 229 | 230 | url = url_split.geturl() 231 | 232 | for ignored_prefix in self.ignored_prefixes: 233 | if url.startswith(ignored_prefix): 234 | return False 235 | 236 | return True 237 | 238 | def parse_cli_config(self): 239 | """Builds the options and args based on the command line options.""" 240 | (self.options, self.start_urls) = self.parser.parse_args() 241 | self._parse_config() 242 | 243 | def parse_api_config(self, start_urls, options_dict=None): 244 | """Builds the options and args based on passed parameters.""" 245 | # TODO Add options 246 | options = self._get_options(options_dict) 247 | (self.options, self.start_urls) = self.parser.parse_args( 248 | options + start_urls) 249 | self._parse_config() 250 | 251 | def _get_options(self, options_dict): 252 | if not options_dict: 253 | options_dict = {} 254 | options = [] 255 | for key, value in options_dict.items(): 256 | if isinstance(value, bool) and value: 257 | options.append("--{0}".format(key)) 258 | else: 259 | options.append("--{0}={1}".format(key, value)) 260 | return options 261 | 262 | def _parse_config(self): 263 | if self.options.url_file_path: 264 | self.start_urls = self._read_start_urls(self.options.url_file_path) 265 | self._process_start_urls() 266 | 267 | self.worker_config = self._build_worker_config(self.options) 268 | self.accepted_hosts = self._build_accepted_hosts( 269 | self.options, self.start_urls) 270 | 271 | if self.options.ignored_prefixes: 272 | self.ignored_prefixes = self.options.ignored_prefixes.split(',') 273 | 274 | if self.options.workers: 275 | self.worker_size = self.options.workers 276 | else: 277 | self.worker_size = DEFAULT_WORKERS[self.options.mode] 278 | 279 | if self.options.run_once: 280 | self.options.depth = 0 281 | 282 | self.content_check = self._compute_content_check(self.options) 283 | 284 | self._add_content_check_urls(self.start_url_splits, self.content_check) 285 | 286 | def _read_start_urls(self, url_file_path): 287 | urls = [] 288 | with open(url_file_path, "r") as url_file: 289 | urls = [url for url in url_file.read().split() if url] 290 | return urls 291 | 292 | def _process_start_urls(self): 293 | for start_url in self.start_urls: 294 | self.start_url_splits.append(get_clean_url_split(start_url)) 295 | 296 | def _build_worker_config(self, options): 297 | types = options.types.split(',') 298 | for element_type in types: 299 | if element_type not in DEFAULT_TYPES: 300 | raise ValueError("This type is not supported: {0}" 301 | .format(element_type)) 302 | 303 | headers = {} 304 | if options.headers: 305 | for item in options.headers: 306 | split = item.split(":") 307 | if len(split) == 2: 308 | headers[split[0]] = split[1] 309 | 310 | return WorkerConfig( 311 | options.username, options.password, types, options.timeout, 312 | options.parser, options.strict_mode, 313 | options.prefer_server_encoding, headers, 314 | options.ignore_bad_tel_urls) 315 | 316 | def _build_accepted_hosts(self, options, start_urls): 
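        """Returns the set of accepted hosts (single-site mode) or, in
        --multi mode, a dict mapping each start URL host to its set of
        accepted hosts."""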
317 | if options.multi: 318 | return self._build_multi_hosts(options, start_urls) 319 | else: 320 | return self._build_single_hosts(options, start_urls) 321 | 322 | def _build_multi_hosts(self, options, start_urls): 323 | hosts = {} 324 | 325 | extra_hosts = set() 326 | if options.accepted_hosts: 327 | for url in options.accepted_hosts.split(','): 328 | split_result = get_clean_url_split(url) 329 | extra_hosts.add(split_result.netloc) 330 | 331 | for start_url in start_urls: 332 | split_result = get_clean_url_split(start_url) 333 | host = split_result.netloc 334 | hosts[host] = extra_hosts.union(host) 335 | 336 | return hosts 337 | 338 | def _build_single_hosts(self, options, start_urls): 339 | hosts = set() 340 | urls = [] 341 | 342 | if options.accepted_hosts: 343 | urls = options.accepted_hosts.split(',') 344 | urls = urls + start_urls 345 | 346 | for url in urls: 347 | split_result = get_clean_url_split(url) 348 | hosts.add(split_result.netloc) 349 | 350 | return hosts 351 | 352 | def _compute_content_check(self, options): 353 | html_presence = defaultdict(list) 354 | html_absence = defaultdict(list) 355 | raw_presence = defaultdict(list) 356 | raw_absence = defaultdict(list) 357 | self._compute_single_content_check( 358 | options.content_presence, html_presence, 359 | raw_presence, PREFIX_ALL) 360 | self._compute_single_content_check( 361 | options.content_absence, html_absence, 362 | raw_absence, PREFIX_ALL) 363 | self._compute_single_content_check( 364 | options.content_presence_once, html_presence, 365 | raw_presence) 366 | self._compute_single_content_check( 367 | options.content_absence_once, html_absence, 368 | raw_absence) 369 | 370 | has_something_to_check = bool( 371 | html_presence or html_absence or raw_presence or raw_absence) 372 | 373 | return ContentCheck( 374 | html_presence, html_absence, raw_presence, raw_absence, 375 | has_something_to_check) 376 | 377 | def _add_content_check_urls(self, start_urls, content_check): 378 | self._add_urls_from_single_content_check( 379 | start_urls, content_check.html_presence) 380 | self._add_urls_from_single_content_check( 381 | start_urls, content_check.html_absence) 382 | self._add_urls_from_single_content_check( 383 | start_urls, content_check.text_presence) 384 | self._add_urls_from_single_content_check( 385 | start_urls, content_check.text_absence) 386 | 387 | def _add_urls_from_single_content_check( 388 | self, start_urls, single_content_check): 389 | for key in single_content_check.keys(): 390 | if key == PREFIX_ALL: 391 | continue 392 | if key.netloc and key not in start_urls: 393 | start_urls.append(key) 394 | else: 395 | for url_split in start_urls: 396 | new_url = get_absolute_url_split( 397 | key.geturl(), url_split) 398 | if new_url not in start_urls: 399 | start_urls.append(new_url) 400 | 401 | def _compute_single_content_check( 402 | self, content_list, html_dict, raw_dict, prefix=None): 403 | if not content_list: 404 | # Catch None 405 | return 406 | 407 | for content in content_list: 408 | temp_prefix, content = self._get_prefix_content(content, prefix) 409 | content = content.strip() 410 | if content.startswith("<"): 411 | # html.parser because we do not want to automatically create 412 | # surrounding tags 413 | soup = BeautifulSoup(content, "html.parser") 414 | children = list(soup.children) 415 | if children: 416 | child = children[0] 417 | string = child.string 418 | if child.string and child.string.startswith(REGEX_CONTENT): 419 | string = re.compile(child.string[len(REGEX_CONTENT):], 420 | re.MULTILINE) 421 | 
html_check = HTMLCheck( 422 | child.name, child.attrs, string) 423 | html_dict[temp_prefix].append(html_check) 424 | else: 425 | if content and content.startswith(REGEX_CONTENT): 426 | content = re.compile(content[len(REGEX_CONTENT):], 427 | re.MULTILINE) 428 | raw_dict[temp_prefix].append(content) 429 | 430 | def _get_prefix_content(self, content, prefix=None): 431 | if not prefix: 432 | index = content.find(",") 433 | prefix = get_clean_url_split(content[:index]) 434 | content = content[index+1:] 435 | 436 | return (prefix, content) 437 | 438 | def _build_parser(self): 439 | # avoid circular references 440 | import pylinkvalidator 441 | version = pylinkvalidator.__version__ 442 | 443 | parser = OptionParser( 444 | usage="%prog [options] URL ...", 445 | version="%prog {0}".format(version)) 446 | 447 | parser.add_option( 448 | "-V", "--verbose", dest="verbose", action="store", 449 | default=VERBOSE_QUIET, choices=[VERBOSE_QUIET, VERBOSE_NORMAL, 450 | VERBOSE_INFO]) 451 | 452 | crawler_group = OptionGroup( 453 | parser, "Crawler Options", 454 | "These options modify the way the crawler traverses the site.") 455 | crawler_group.add_option( 456 | "-O", "--test-outside", dest="test_outside", 457 | action="store_true", default=False, 458 | help="fetch resources from other domains without crawling them") 459 | crawler_group.add_option( 460 | "-H", "--accepted-hosts", 461 | dest="accepted_hosts", action="store", default=None, 462 | help="comma-separated list of additional hosts to crawl (e.g., " 463 | "example.com,subdomain.another.com)") 464 | crawler_group.add_option( 465 | "-i", "--ignore", dest="ignored_prefixes", 466 | action="store", default=None, 467 | help="comma-separated list of host/path prefixes to ignore " 468 | "(e.g., www.example.com/ignore_this_and_after/)") 469 | crawler_group.add_option( 470 | "-b", "--ignore-bad-tel-urls", dest="ignore_bad_tel_urls", 471 | action="store_true", default=False, 472 | help="ignore badly formed tel URLs missing the leading + sign, " 473 | "e.g., tel:1234567890 - only necessary for Python > 2.6") 474 | crawler_group.add_option( 475 | "-u", "--username", dest="username", 476 | action="store", default=None, 477 | help="username to use with basic HTTP authentication") 478 | crawler_group.add_option( 479 | "-p", "--password", dest="password", 480 | action="store", default=None, 481 | help="password to use with basic HTTP authentication") 482 | crawler_group.add_option( 483 | "-M", "--multi", dest="multi", 484 | action="store_true", default=False, 485 | help="each argument is considered to be a different site") 486 | crawler_group.add_option( 487 | "-D", "--header", 488 | dest="headers", action="append", metavar="HEADER", 489 | help="custom header of the form Header: Value " 490 | "(repeat for multiple headers)") 491 | crawler_group.add_option( 492 | "--url-file-path", dest="url_file_path", 493 | action="store", default=None, 494 | help="get starting URLs from a line-separated file") 495 | # crawler_group.add_option("-U", "--unique", dest="unique", 496 | # action="store_true", default=False) 497 | crawler_group.add_option( 498 | "-t", "--types", dest="types", action="store", 499 | default=",".join(DEFAULT_TYPES), 500 | help="Comma-separated values of tags to look for when crawling" 501 | "a site. 
Default (and supported types): a,img,link,script") 502 | crawler_group.add_option( 503 | "-T", "--timeout", dest="timeout", 504 | type="int", action="store", default=DEFAULT_TIMEOUT, 505 | help="Seconds to wait before considering that a page timed out") 506 | crawler_group.add_option( 507 | "-C", "--strict", dest="strict_mode", 508 | action="store_true", default=False, 509 | help="Does not strip href and src attributes from whitespaces") 510 | crawler_group.add_option( 511 | "-P", "--progress", dest="progress", 512 | action="store_true", default=False, 513 | help="Prints crawler progress in the console") 514 | crawler_group.add_option( 515 | "-N", "--run-once", dest="run_once", 516 | action="store_true", default=False, 517 | help="Only crawl the first page (eq. to depth=0).") 518 | crawler_group.add_option( 519 | "-d", "--depth", dest="depth", 520 | type="int", action="store", default=-1, 521 | help="Maximum crawl depth") 522 | crawler_group.add_option( 523 | "-e", "--prefer-server-encoding", dest="prefer_server_encoding", 524 | action="store_true", default=False, 525 | help="Prefer server encoding if specified. Else detect encoding") 526 | crawler_group.add_option( 527 | "--check-presence", dest="content_presence", 528 | action="append", 529 | help="Check presence of raw or HTML content on all pages. e.g., " 530 | "regex:content. " 531 | "Content can be either regex:pattern or plain content") 532 | crawler_group.add_option( 533 | "--check-absence", dest="content_absence", 534 | action="append", 535 | help="Check absence of raw or HTML content on all pages. e.g., " 536 | "regex:content. " 537 | "Content can be either regex:pattern or plain content") 538 | crawler_group.add_option( 539 | "--check-presence-once", dest="content_presence_once", 540 | action="append", 541 | help="Check presence of raw or HTML content for one page: " 542 | "path,content, e.g.,: " 543 | "/path,regex:content. " 544 | "Content can be either regex:pattern or plain content. " 545 | "Path can be either relative or absolute with domain.") 546 | crawler_group.add_option( 547 | "--check-absence-once", dest="content_absence_once", 548 | action="append", 549 | help="Check absence of raw or HTML content for one page: " 550 | "path,content, e.g.," 551 | "path,regex:content. " 552 | "Content can be either regex:pattern or plain content. " 553 | "Path can be either relative or absolute with domain.") 554 | crawler_group.add_option( 555 | "--allow-insecure-content", dest="allow_insecure_content", 556 | action="store_true", default=False, 557 | help="Allow insecure content for HTTPS sites with certificate errors") 558 | 559 | # TODO Add follow redirect option. 
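        # A hypothetical invocation exercising a few of the options defined
        # above (show progress, crawl two levels deep, follow only <a> and
        # <img> tags, and check that every crawled page contains a non-empty
        # <title>):
        #
        #   pylinkvalidate.py -P -d 2 -t a,img \
        #       --check-presence "<title>regex:.+</title>" http://www.example.com/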
560 | 561 | parser.add_option_group(crawler_group) 562 | 563 | perf_group = OptionGroup( 564 | parser, "Performance Options", 565 | "These options can impact the performance of the crawler.") 566 | 567 | perf_group.add_option( 568 | "-w", "--workers", dest="workers", action="store", 569 | default=None, type="int", 570 | help="Number of workers to spawn") 571 | perf_group.add_option( 572 | "-m", "--mode", dest="mode", action="store", 573 | help="Types of workers: thread (default), process, or green", 574 | default=MODE_THREAD, choices=[MODE_THREAD, MODE_PROCESS, 575 | MODE_GREEN]) 576 | perf_group.add_option( 577 | "-R", "--parser", dest="parser", action="store", 578 | help="Types of HTML parse: html.parser (default), lxml, html5lib", 579 | default=PARSER_STDLIB, choices=[PARSER_STDLIB, PARSER_LXML, 580 | PARSER_HTML5]) 581 | 582 | parser.add_option_group(perf_group) 583 | 584 | output_group = OptionGroup( 585 | parser, "Output Options", 586 | "These options change the output of the crawler.") 587 | 588 | output_group.add_option( 589 | "-f", "--format", dest="format", action="store", 590 | default=FORMAT_PLAIN, choices=[FORMAT_PLAIN], 591 | help="Format of the report: plain") 592 | output_group.add_option( 593 | "-o", "--output", dest="output", action="store", 594 | default=None, 595 | help="Path of the file where the report will be printed.") 596 | output_group.add_option( 597 | "-W", "--when", dest="when", action="store", 598 | default=WHEN_ALWAYS, choices=[WHEN_ALWAYS, WHEN_ON_ERROR], 599 | help="When to print the report. error (only if a " 600 | "crawling error occurs) or always (default)") 601 | output_group.add_option( 602 | "-E", "--report-type", dest="report_type", 603 | help="Type of report to print: errors (default, summary and " 604 | "erroneous links), summary, all (summary and all links)", 605 | action="store", default=REPORT_TYPE_ERRORS, 606 | choices=[REPORT_TYPE_ERRORS, REPORT_TYPE_SUMMARY, REPORT_TYPE_ALL]) 607 | output_group.add_option( 608 | "-c", "--console", dest="console", 609 | action="store_true", default=False, 610 | help="Prints report to the console in addition to other output" 611 | " options such as file or email.") 612 | crawler_group.add_option( 613 | "-S", "--show-source", dest="show_source", 614 | action="store_true", default=False, 615 | help="Show source of links (html) in the report.") 616 | 617 | parser.add_option_group(output_group) 618 | 619 | email_group = OptionGroup( 620 | parser, "Email Options", 621 | "These options allows the crawler to send a report by email.") 622 | 623 | email_group.add_option( 624 | "-a", "--address", dest="address", action="store", 625 | default=None, 626 | help="Comma-separated list of email addresses used to send a " 627 | "report") 628 | email_group.add_option( 629 | "--from", dest="from_address", action="store", 630 | default=None, 631 | help="Email address to use in the from field of the email " 632 | "(optional)") 633 | email_group.add_option( 634 | "-s", "--smtp", dest="smtp", action="store", 635 | default=None, 636 | help="Host of the smtp server") 637 | email_group.add_option( 638 | "--port", dest="port", action="store", 639 | default=25, type="int", 640 | help="Port of the smtp server (optional)") 641 | email_group.add_option( 642 | "--tls", dest="tls", action="store_true", 643 | default=False, 644 | help="Use TLS with the email server.") 645 | email_group.add_option( 646 | "--subject", dest="subject", action="store", 647 | default=None, 648 | help="Subject of the email (optional)") 649 | email_group.add_option( 
650 | "--smtp-username", dest="smtp_username", 651 | action="store", default=None, 652 | help="Username to use with the smtp server (optional)") 653 | email_group.add_option( 654 | "--smtp-password", dest="smtp_password", 655 | action="store", default=None, 656 | help="Password to use with the smtp server (optional)") 657 | 658 | parser.add_option_group(email_group) 659 | 660 | return parser 661 | 662 | def __unicode__(self): 663 | return "Configuration - Start URLs: {0} - Options: {1}".format( 664 | self.start_urls, self.options) 665 | 666 | 667 | class SitePage(UTF8Class): 668 | """Contains the crawling result for a page. 669 | 670 | This is a class because we need to keep track of the various sources 671 | linking to this page and it must be modified as the crawl progresses. 672 | """ 673 | 674 | def __init__(self, url_split, status=200, is_timeout=False, exception=None, 675 | is_html=True, is_local=True, response_time=None, 676 | process_time=None, site_origin=None, missing_content=None, 677 | erroneous_content=None): 678 | self.url_split = url_split 679 | 680 | self.original_source = None 681 | self.sources = [] 682 | 683 | self.type = type 684 | self.status = status 685 | self.is_timeout = is_timeout 686 | self.exception = exception 687 | self.is_html = is_html 688 | self.is_local = is_local 689 | self.is_ok = status and status < 400 and not missing_content and\ 690 | not erroneous_content 691 | self.response_time = response_time 692 | self.process_time = process_time 693 | self.site_origin = site_origin 694 | 695 | if missing_content: 696 | self.missing_content = missing_content 697 | else: 698 | self.missing_content = [] 699 | 700 | if erroneous_content: 701 | self.erroneous_content = erroneous_content 702 | else: 703 | self.erroneous_content = [] 704 | 705 | def add_sources(self, page_sources): 706 | self.sources.extend(page_sources) 707 | 708 | def get_status_message(self): 709 | if self.status: 710 | if self.status < 400: 711 | return self._compute_ok_status(self.status) 712 | elif self.status == 404: 713 | return "not found (404)" 714 | else: 715 | return "error (status={0})".format(self.status) 716 | elif self.is_timeout: 717 | return "error (timeout)" 718 | elif self.exception: 719 | return "error ({0}): {1}".format( 720 | self.exception.type_name, self.exception.message) 721 | else: 722 | return "error" 723 | 724 | def _compute_ok_status(self, status_code): 725 | if self.missing_content and not self.erroneous_content: 726 | return "error ({0}) missing content".format(status_code) 727 | elif self.erroneous_content and not self.missing_content: 728 | return "error ({0}) erroneous content".format(status_code) 729 | elif self.erroneous_content and self.missing_content: 730 | return "error ({0}) missing and erroneous content".format( 731 | status_code) 732 | else: 733 | return "ok ({0})".format(self.status) 734 | 735 | def get_content_messages(self): 736 | """Gets missing and erroneous content 737 | """ 738 | messages = [ 739 | "missing content: {0}".format(content) for content in 740 | self.missing_content] + [ 741 | "erroneous content: {0}".format(content) for content in 742 | self.erroneous_content] 743 | 744 | return messages 745 | 746 | def __unicode__(self): 747 | return "Resource {0} - {1}".format( 748 | self.url_split.geturl(), self.status) 749 | -------------------------------------------------------------------------------- /pylinkvalidator/reporter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the reporting 
functions 3 | """ 4 | from __future__ import unicode_literals, absolute_import, print_function 5 | 6 | import codecs 7 | import re 8 | import smtplib 9 | import sys 10 | 11 | from email.mime.text import MIMEText 12 | 13 | from pylinkvalidator.compat import StringIO 14 | from pylinkvalidator.models import ( 15 | REPORT_TYPE_ERRORS, REPORT_TYPE_ALL, FORMAT_PLAIN) 16 | 17 | 18 | PLAIN_TEXT = "text/plain" 19 | HTML = "text/html" 20 | 21 | WHITESPACES = re.compile(r"\s+") 22 | 23 | 24 | EMAIL_HEADER = "from: {0}\r\nsubject: {1}\r\nto: {2}\r\nmime-version: 1.0\r\n"\ 25 | "content-type: {3}\r\n\r\n{4}" 26 | 27 | 28 | def close_quietly(a_file): 29 | """Closes a file and does not report an error.""" 30 | try: 31 | if a_file: 32 | a_file.close() 33 | except Exception: 34 | pass 35 | 36 | 37 | def report(site, config, total_time, logger=None): 38 | """Prints reports to console, file, and email.""" 39 | output_files = [] 40 | output_file = None 41 | email_file = None 42 | 43 | if config.options.output: 44 | output_file = codecs.open(config.options.output, "w", "utf-8") 45 | output_files.append(output_file) 46 | 47 | if config.options.smtp: 48 | email_file = StringIO() 49 | output_files.append(email_file) 50 | 51 | if config.options.console or not output_files: 52 | output_files.append(sys.stdout) 53 | 54 | try: 55 | if config.options.format == FORMAT_PLAIN: 56 | _write_plain_text_report(site, config, output_files, total_time) 57 | except Exception: 58 | if logger: 59 | logger.exception("An exception occurred while writing the report") 60 | 61 | if output_file: 62 | close_quietly(output_file) 63 | 64 | if email_file: 65 | send_email(email_file, site, config) 66 | 67 | 68 | def _write_plain_text_report(site, config, output_files, total_time): 69 | if config.options.multi: 70 | _write_plain_text_report_multi(site, config, output_files, total_time) 71 | else: 72 | _write_plain_text_report_single(site, config, output_files, total_time) 73 | 74 | 75 | def _write_plain_text_report_multi(site, config, output_files, total_time): 76 | total_urls = len(site.pages) 77 | total_errors = len(site.error_pages) 78 | 79 | if not site.is_ok: 80 | global_status = "ERROR" 81 | error_summary = "with {0} error(s) ".format(total_errors) 82 | else: 83 | global_status = "SUCCESS" 84 | error_summary = "" 85 | 86 | try: 87 | avg_response_time = site.get_average_response_time() 88 | avg_process_time = site.get_average_process_time() 89 | 90 | oprint( 91 | "{0} Crawled {1} urls {2}from {4} sites in {3:.2f} seconds" 92 | .format( 93 | global_status, total_urls, error_summary, total_time, 94 | len(site.start_url_splits)), 95 | files=output_files) 96 | 97 | oprint(" average response time: {0:.2f} seconds".format( 98 | avg_response_time), files=output_files) 99 | 100 | oprint(" average process time: {0:.2f} seconds".format( 101 | avg_process_time), files=output_files) 102 | 103 | pages = {} 104 | 105 | if config.options.report_type == REPORT_TYPE_ERRORS: 106 | pages = site.multi_error_pages 107 | elif config.options.report_type == REPORT_TYPE_ALL: 108 | pages = site.multi_pages 109 | 110 | for domain, pages_dict in pages.items(): 111 | if pages_dict: 112 | oprint( 113 | "\n\n Start Domain: {0}".format(domain), 114 | files=output_files) 115 | 116 | _print_details(pages_dict.values(), output_files, config, 4) 117 | except Exception: 118 | from traceback import print_exc 119 | print_exc() 120 | 121 | 122 | def _write_plain_text_report_single(site, config, output_files, total_time): 123 | start_urls = ",".join((start_url_split.geturl() 
for start_url_split in 124 | site.start_url_splits)) 125 | 126 | total_urls = len(site.pages) 127 | total_errors = len(site.error_pages) 128 | 129 | if not site.is_ok: 130 | global_status = "ERROR" 131 | error_summary = "with {0} error(s) ".format(total_errors) 132 | else: 133 | global_status = "SUCCESS" 134 | error_summary = "" 135 | 136 | try: 137 | avg_response_time = site.get_average_response_time() 138 | avg_process_time = site.get_average_process_time() 139 | 140 | oprint("{0} Crawled {1} urls {2}in {3:.2f} seconds".format( 141 | global_status, total_urls, error_summary, total_time), 142 | files=output_files) 143 | 144 | oprint(" average response time: {0:.2f} seconds".format( 145 | avg_response_time), files=output_files) 146 | 147 | oprint(" average process time: {0:.2f} seconds".format( 148 | avg_process_time), files=output_files) 149 | 150 | except Exception: 151 | from traceback import print_exc 152 | print_exc() 153 | 154 | pages = {} 155 | 156 | if config.options.report_type == REPORT_TYPE_ERRORS: 157 | pages = site.error_pages 158 | elif config.options.report_type == REPORT_TYPE_ALL: 159 | pages = site.pages 160 | 161 | if pages: 162 | oprint("\n Start URL(s): {0}".format(start_urls), files=output_files) 163 | _print_details(pages.values(), output_files, config) 164 | 165 | 166 | def _print_details(page_iterator, output_files, config, indent=2): 167 | initial_indent = " " * indent 168 | for page in page_iterator: 169 | oprint("\n{2}{0}: {1}".format( 170 | page.get_status_message(), page.url_split.geturl(), 171 | initial_indent), 172 | files=output_files) 173 | for content_message in page.get_content_messages(): 174 | oprint("{1} {0}".format(content_message, initial_indent), 175 | files=output_files) 176 | for source in page.sources: 177 | oprint("{1} from {0}".format( 178 | source.origin.geturl(), initial_indent), files=output_files) 179 | if config.options.show_source: 180 | oprint("{1} {0}".format( 181 | truncate(source.origin_str), initial_indent), 182 | files=output_files) 183 | 184 | 185 | def oprint(message, files): 186 | """Prints to a sequence of files.""" 187 | for file in files: 188 | print(message, file=file) 189 | 190 | 191 | def truncate(value, size=72): 192 | """Truncates a string if its length is higher than size.""" 193 | value = value.replace("\n", " ").replace("\r", "").replace("\t", " ") 194 | value = value.strip() 195 | value = WHITESPACES.sub(" ", value) 196 | 197 | if len(value) > size: 198 | value = "{0}...".format(value[:size-3]) 199 | 200 | return value 201 | 202 | 203 | def send_email(email_file, site, config): 204 | options = config.options 205 | if options.subject: 206 | subject = options.subject 207 | else: 208 | if site.is_ok: 209 | subject = "SUCCESS - {0}".format(site.start_url_splits[0].geturl()) 210 | else: 211 | subject = "ERROR - {0}".format(site.start_url_splits[0].geturl()) 212 | 213 | if options.from_address: 214 | from_address = options.from_address 215 | else: 216 | from_address = "pylinkvalidator@localhost" 217 | 218 | if not options.address: 219 | print("Email address must be specified when using smtp.") 220 | sys.exit(1) 221 | 222 | addresses = options.address.split(",") 223 | 224 | msg = MIMEText(email_file.getvalue(), 'plain', "UTF-8") 225 | 226 | msg['From'] = from_address 227 | msg['To'] = ", ".join(addresses) 228 | msg['Subject'] = subject 229 | 230 | smtpserver = smtplib.SMTP(options.smtp, options.port) 231 | 232 | if options.tls: 233 | smtpserver.ehlo() 234 | smtpserver.starttls() 235 | smtpserver.ehlo 236 | 237 | if 
options.smtp_username and options.smtp_password: 238 | smtpserver.login(options.smtp_username, options.smtp_password) 239 | 240 | smtpserver.sendmail(from_address, addresses, msg.as_string()) 241 | 242 | smtpserver.quit() 243 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/a.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello World

4 | 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/alone.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test URL 7 | Mail me 8 | Call me 9 | 10 | 11 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/badtel.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Go to next page 4 |

Go to good tel link 5 |

Go to bad tel link 6 |

7 | 8 | 9 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/c.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/d.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/0.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 1 4 | 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/0b.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0 4 | 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 3 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 4 | 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/depth/root.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0 4 | 0b 5 | 6 | 7 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/f.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Nothing 4 | Nothing 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test hash 7 | Test name 8 | Test A 9 | Test B 10 | Test C 11 | Test D 12 | Test External 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Crawl-delay: 1 3 | Disallow: 4 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/sub/b.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | E 4 | F 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/sub/e.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/sub/small_image.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bartdag/pylinkvalidator/aac5934d88a9c99d0e4f40a8884ad942b6b10ea0/pylinkvalidator/testfiles/sub/small_image.gif 
-------------------------------------------------------------------------------- /pylinkvalidator/testfiles/sub/style.css: -------------------------------------------------------------------------------- 1 | a { 2 | color: #00ff00; 3 | } -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/sub/test.js: -------------------------------------------------------------------------------- 1 | document.write('Hello World'); -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/à.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello World

4 | 5 | 6 | -------------------------------------------------------------------------------- /pylinkvalidator/testfiles/é.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Go to next page 4 |

5 | 6 | 7 | -------------------------------------------------------------------------------- /pylinkvalidator/tests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Unit and integration tests for pylinkvalidator 4 | """ 5 | from __future__ import unicode_literals, absolute_import 6 | 7 | import os 8 | import logging 9 | import sys 10 | from tempfile import mkstemp 11 | import time 12 | import threading 13 | import unittest 14 | 15 | from pylinkvalidator import api 16 | import pylinkvalidator.compat as compat 17 | from pylinkvalidator.compat import ( 18 | SocketServer, SimpleHTTPServer, get_url_open, get_url_request) 19 | from pylinkvalidator.crawler import ( 20 | open_url, PageCrawler, WORK_DONE, ThreadSiteCrawler, ProcessSiteCrawler, 21 | get_logger) 22 | from pylinkvalidator.models import ( 23 | Config, WorkerInit, WorkerConfig, WorkerInput, PARSER_STDLIB) 24 | from pylinkvalidator.urlutil import get_clean_url_split, get_absolute_url_split 25 | 26 | 27 | TEST_FILES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 28 | 'testfiles') 29 | 30 | # Quiet all logging 31 | logging.basicConfig(level=logging.CRITICAL) 32 | 33 | 34 | # UTILITY CLASSES AND FUNCTIONS ### 35 | 36 | class ThreadedTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer): 37 | pass 38 | 39 | 40 | def start_http_server(): 41 | """Starts a simple http server for the test files""" 42 | # For the http handler 43 | os.chdir(TEST_FILES_DIR) 44 | handler = SimpleHTTPServer.SimpleHTTPRequestHandler 45 | handler.extensions_map['.html'] = 'text/html; charset=UTF-8' 46 | httpd = ThreadedTCPServer(("localhost", 0), handler) 47 | ip, port = httpd.server_address 48 | 49 | httpd_thread = threading.Thread(target=httpd.serve_forever) 50 | httpd_thread.setDaemon(True) 51 | httpd_thread.start() 52 | 53 | return (ip, port, httpd, httpd_thread) 54 | 55 | 56 | def has_multiprocessing(): 57 | has_multi = False 58 | 59 | try: 60 | import multiprocessing # noqa 61 | has_multi = True 62 | except Exception: 63 | pass 64 | 65 | return has_multi 66 | 67 | 68 | def has_gevent(): 69 | has_gevent = False 70 | 71 | try: 72 | import gevent # noqa 73 | has_gevent = True 74 | except Exception: 75 | pass 76 | 77 | return has_gevent 78 | 79 | 80 | # UNIT AND INTEGRATION TESTS ### 81 | 82 | 83 | class ConfigTest(unittest.TestCase): 84 | 85 | def setUp(self): 86 | self.argv = sys.argv 87 | 88 | def tearDown(self): 89 | sys.argv = self.argv 90 | 91 | def test_accepted_hosts(self): 92 | sys.argv = ['pylinkvalidator', 'http://www.example.com/'] 93 | config = Config() 94 | config.parse_cli_config() 95 | self.assertTrue('www.example.com' in config.accepted_hosts) 96 | 97 | sys.argv = ['pylinkvalidator', '-H', 'www.example.com', 98 | 'http://example.com', 'foo.com', 'http://www.example.com/', 99 | 'baz.com'] 100 | config = Config() 101 | config.parse_cli_config() 102 | 103 | self.assertTrue('www.example.com' in config.accepted_hosts) 104 | self.assertTrue('example.com' in config.accepted_hosts) 105 | self.assertTrue('foo.com' in config.accepted_hosts) 106 | self.assertTrue('baz.com' in config.accepted_hosts) 107 | 108 | 109 | class URLUtilTest(unittest.TestCase): 110 | 111 | def test_clean_url_split(self): 112 | self.assertEqual( 113 | "http://www.example.com", 114 | get_clean_url_split("www.example.com").geturl()) 115 | self.assertEqual( 116 | "http://www.example.com", 117 | get_clean_url_split("//www.example.com").geturl()) 118 | self.assertEqual( 119 | 
"http://www.example.com", 120 | get_clean_url_split("http://www.example.com").geturl()) 121 | 122 | self.assertEqual( 123 | "http://www.example.com/", 124 | get_clean_url_split("www.example.com/").geturl()) 125 | self.assertEqual( 126 | "http://www.example.com/", 127 | get_clean_url_split("//www.example.com/").geturl()) 128 | self.assertEqual( 129 | "http://www.example.com/", 130 | get_clean_url_split("http://www.example.com/").geturl()) 131 | 132 | def test_get_absolute_url(self): 133 | base_url_split = get_clean_url_split( 134 | "https://www.example.com/hello/index.html") 135 | self.assertEqual( 136 | "https://www.example2.com/test.js", 137 | get_absolute_url_split( 138 | "//www.example2.com/test.js", base_url_split).geturl()) 139 | self.assertEqual( 140 | "https://www.example.com/hello2/test.html", 141 | get_absolute_url_split( 142 | "/hello2/test.html", base_url_split).geturl()) 143 | self.assertEqual( 144 | "https://www.example.com/hello/test.html", 145 | get_absolute_url_split("test.html", base_url_split).geturl()) 146 | self.assertEqual( 147 | "https://www.example.com/test.html", 148 | get_absolute_url_split("../test.html", base_url_split).geturl()) 149 | 150 | 151 | class CrawlerTest(unittest.TestCase): 152 | 153 | @classmethod 154 | def setUpClass(cls): 155 | (cls.ip, cls.port, cls.httpd, cls.httpd_thread) = start_http_server() 156 | 157 | # FIXME replace by thread synchronization on start 158 | time.sleep(0.2) 159 | 160 | @classmethod 161 | def tearDownClass(cls): 162 | cls.httpd.shutdown() 163 | 164 | def setUp(self): 165 | # We must do this because Python 2.6 does not have setUpClass 166 | # This will only be executed if setUpClass is ignored. 167 | # It will not be shutdown properly though, but this does not prevent 168 | # the unit test to run properly 169 | if not hasattr(self, 'port'): 170 | (self.ip, self.port, self.httpd, self.httpd_thread) =\ 171 | start_http_server() 172 | # FIXME replace by thread synchronization on start 173 | time.sleep(0.2) 174 | self.argv = sys.argv 175 | 176 | # Need to override root logger level (reset by something) 177 | logger = logging.getLogger() 178 | logger.setLevel(logging.CRITICAL) 179 | 180 | def tearDown(self): 181 | sys.argv = self.argv 182 | 183 | def get_url(self, test_url): 184 | return "http://{0}:{1}{2}".format(self.ip, self.port, test_url) 185 | 186 | def get_page_crawler(self, url): 187 | url = self.get_url(url) 188 | url_split = get_clean_url_split(url) 189 | input_queue = compat.Queue.Queue() 190 | output_queue = compat.Queue.Queue() 191 | 192 | worker_config = WorkerConfig( 193 | username=None, password=None, types=['a', 'img', 'link', 'script'], 194 | timeout=5, parser=PARSER_STDLIB, 195 | strict_mode=False, prefer_server_encoding=False, 196 | extra_headers=[]) 197 | 198 | worker_init = WorkerInit( 199 | worker_config=worker_config, 200 | input_queue=input_queue, output_queue=output_queue, 201 | logger=get_logger()) 202 | 203 | page_crawler = PageCrawler(worker_init) 204 | 205 | return page_crawler, url_split 206 | 207 | def test_404(self): 208 | urlopen = get_url_open() 209 | import socket 210 | url = self.get_url("/does_not_exist.html") 211 | response = open_url( 212 | urlopen, get_url_request(), url, 5, socket.timeout) 213 | 214 | self.assertEqual(404, response.status) 215 | self.assertTrue(response.exception is not None) 216 | 217 | def test_200(self): 218 | urlopen = get_url_open() 219 | import socket 220 | url = self.get_url("/index.html") 221 | response = open_url(urlopen, get_url_request(), url, 5, socket.timeout) 
222 | 223 | self.assertEqual(200, response.status) 224 | self.assertTrue(response.exception is None) 225 | 226 | def test_301(self): 227 | urlopen = get_url_open() 228 | import socket 229 | url = self.get_url("/sub") 230 | response = open_url(urlopen, get_url_request(), url, 5, socket.timeout) 231 | 232 | self.assertEqual(200, response.status) 233 | self.assertTrue(response.is_redirect) 234 | 235 | def test_crawl_page(self): 236 | page_crawler, url_split = self.get_page_crawler("/index.html") 237 | page_crawl = page_crawler._crawl_page( 238 | WorkerInput(url_split, True, 0, url_split.netloc)) 239 | 240 | self.assertEqual(200, page_crawl.status) 241 | self.assertTrue(page_crawl.is_html) 242 | self.assertFalse(page_crawl.is_timeout) 243 | self.assertFalse(page_crawl.is_redirect) 244 | self.assertTrue(page_crawl.exception is None) 245 | 246 | a_links = [link for link in page_crawl.links if link.type == 'a'] 247 | img_links = [link for link in page_crawl.links if link.type == 'img'] 248 | script_links = [link for link in page_crawl.links 249 | if link.type == 'script'] 250 | link_links = [link for link in page_crawl.links if link.type == 'link'] 251 | 252 | self.assertEqual(5, len(a_links)) 253 | self.assertEqual(1, len(img_links)) 254 | self.assertEqual(1, len(script_links)) 255 | self.assertEqual(1, len(link_links)) 256 | 257 | def test_crawl_resource(self): 258 | page_crawler, url_split = self.get_page_crawler("/sub/small_image.gif") 259 | page_crawl = page_crawler._crawl_page( 260 | WorkerInput(url_split, True, 0, url_split.netloc)) 261 | 262 | self.assertEqual(200, page_crawl.status) 263 | self.assertFalse(page_crawl.links) 264 | self.assertFalse(page_crawl.is_html) 265 | self.assertFalse(page_crawl.is_timeout) 266 | self.assertFalse(page_crawl.is_redirect) 267 | self.assertTrue(page_crawl.exception is None) 268 | 269 | def test_base_url(self): 270 | page_crawler, url_split = self.get_page_crawler("/alone.html") 271 | page_crawl = page_crawler._crawl_page( 272 | WorkerInput(url_split, True, 0, url_split.netloc)) 273 | 274 | self.assertEqual(1, len(page_crawl.links)) 275 | self.assertEqual( 276 | 'http://www.example.com/test.html', 277 | page_crawl.links[0].url_split.geturl()) 278 | 279 | def test_crawl_404(self): 280 | page_crawler, url_split = self.get_page_crawler( 281 | "/sub/small_image_bad.gif") 282 | page_crawl = page_crawler._crawl_page( 283 | WorkerInput(url_split, True, 0, url_split.netloc)) 284 | 285 | self.assertEqual(404, page_crawl.status) 286 | self.assertFalse(page_crawl.links) 287 | self.assertFalse(page_crawl.is_html) 288 | self.assertFalse(page_crawl.is_timeout) 289 | self.assertFalse(page_crawl.is_redirect) 290 | 291 | def test_page_crawler(self): 292 | page_crawler, url_split = self.get_page_crawler("/index.html") 293 | input_queue = page_crawler.input_queue 294 | output_queue = page_crawler.output_queue 295 | 296 | input_queue.put(WorkerInput(url_split, True, 0, url_split.netloc)) 297 | input_queue.put(WORK_DONE) 298 | page_crawler.crawl_page_forever() 299 | 300 | page_crawl = output_queue.get() 301 | 302 | self.assertEqual(200, page_crawl.status) 303 | self.assertTrue(len(page_crawl.links) > 0) 304 | 305 | def _run_crawler_plain( 306 | self, crawler_class, other_options=None, url="/index.html"): 307 | url = self.get_url(url) 308 | sys.argv = ['pylinkvalidator', "-m", "process", url] 309 | if not other_options: 310 | other_options = [] 311 | sys.argv.extend(other_options) 312 | config = Config() 313 | config.parse_cli_config() 314 | 315 | crawler = 
crawler_class(config, get_logger()) 316 | crawler.crawl() 317 | 318 | if config.options.multi: 319 | crawler.site.collect_multi_sites() 320 | 321 | return crawler.site 322 | 323 | def test_site_thread_crawler_plain(self): 324 | site = self._run_crawler_plain(ThreadSiteCrawler) 325 | self.assertEqual(11, len(site.pages)) 326 | self.assertEqual(1, len(site.error_pages)) 327 | 328 | def test_site_process_crawler_plain(self): 329 | if not has_multiprocessing(): 330 | return 331 | site = self._run_crawler_plain(ProcessSiteCrawler) 332 | self.assertEqual(11, len(site.pages)) 333 | self.assertEqual(1, len(site.error_pages)) 334 | 335 | def test_run_once(self): 336 | site = self._run_crawler_plain(ThreadSiteCrawler, ["--run-once"]) 337 | 338 | # 8 pages linked on the index 339 | self.assertEqual(8, len(site.pages)) 340 | self.assertEqual(0, len(site.error_pages)) 341 | 342 | def test_multi_sites(self): 343 | site = self._run_crawler_plain(ThreadSiteCrawler, ["--multi"]) 344 | self.assertEqual(11, len(site.pages)) 345 | self.assertEqual(1, len(site.error_pages)) 346 | 347 | multi_pages_for_site = list(site.multi_pages.values())[0] 348 | multi_error_pages_for_site = list(site.multi_error_pages.values())[0] 349 | self.assertEqual(11, len(multi_pages_for_site)) 350 | self.assertEqual(1, len(multi_error_pages_for_site)) 351 | 352 | def test_content_check(self): 353 | site = self._run_crawler_plain( 354 | ThreadSiteCrawler, 355 | [ 356 | "--check-absence", "tata12345", 357 | "--check-absence", "BOOM", 358 | "--check-presence", "", 359 | ]) 360 | self.assertEqual(11, len(site.pages)) 361 | self.assertEqual(1, len(site.error_pages)) 362 | 363 | site = self._run_crawler_plain( 364 | ThreadSiteCrawler, 365 | [ 366 | "--check-presence-once", 367 | "/a.html,
Hello World", 368 | "--check-presence-once", 369 | "/robots.txt,regex:^Disallow:\s*$", 370 | ]) 371 | self.assertEqual(12, len(site.pages)) 372 | self.assertEqual(1, len(site.error_pages)) 373 | 374 | site = self._run_crawler_plain( 375 | ThreadSiteCrawler, 376 | ["--check-presence-once", 377 | "/a.html,regex:Hello"]) 378 | self.assertEqual(11, len(site.pages)) 379 | self.assertEqual(1, len(site.error_pages)) 380 | 381 | site = self._run_crawler_plain( 382 | ThreadSiteCrawler, 383 | ["--check-absence-once", 384 | "/a.html,regex:Hello"]) 385 | self.assertEqual(11, len(site.pages)) 386 | self.assertEqual(2, len(site.error_pages)) 387 | 388 | def test_url_file_path(self): 389 | (_, temp_file_path) = mkstemp() 390 | url = self.get_url("/index.html") 391 | url2 = self.get_url("/robots.txt") 392 | with open(temp_file_path, "w") as temp_file: 393 | temp_file.write(url + "\n") 394 | temp_file.write(url2 + "\n") 395 | 396 | sys.argv = [ 397 | "pylinkvalidator", "-m", "process", "--url-file-path", 398 | temp_file_path] 399 | config = Config() 400 | config.parse_cli_config() 401 | 402 | crawler = ThreadSiteCrawler(config, get_logger()) 403 | crawler.crawl() 404 | 405 | site = crawler.site 406 | self.assertEqual(12, len(site.pages)) 407 | self.assertEqual(1, len(site.error_pages)) 408 | os.unlink(temp_file_path) 409 | 410 | def test_depth_0(self): 411 | site = self._run_crawler_plain( 412 | ThreadSiteCrawler, ["--depth", "0"], "/depth/root.html") 413 | # 3 pages linked on the root (root, 0, 0b) 414 | self.assertEqual(3, len(site.pages)) 415 | self.assertEqual(0, len(site.error_pages)) 416 | 417 | site = self._run_crawler_plain( 418 | ThreadSiteCrawler, ["--run-once"], "/depth/root.html") 419 | # Same as depth = 0 420 | self.assertEqual(3, len(site.pages)) 421 | self.assertEqual(0, len(site.error_pages)) 422 | 423 | site = self._run_crawler_plain( 424 | ThreadSiteCrawler, ["--depth", "1"], "/depth/root.html") 425 | # 4 pages linked on the root (root, 0, 0b, 1) 426 | self.assertEqual(4, len(site.pages)) 427 | self.assertEqual(0, len(site.error_pages)) 428 | 429 | site = self._run_crawler_plain( 430 | ThreadSiteCrawler, ["--depth", "10"], "/depth/root.html") 431 | # All 7 reachable pages are crawled, including one broken link 432 | self.assertEqual(7, len(site.pages)) 433 | self.assertEqual(1, len(site.error_pages)) 434 | 435 | def test_strict_mode(self): 436 | site = self._run_crawler_plain(ThreadSiteCrawler, ["--strict"]) 437 | 438 | # The placehold.it link is interpreted as a relative url, 439 | # so there are 12 "good" urls and 1 bad. 440 | self.assertEqual(12, len(site.pages)) 441 | 442 | # Python 3 returns an error here because of a change in urllib. 443 | # In general, strict mode should be false (the default), 444 | # which avoids these differences between Python versions. 445 | self.assertTrue(len(site.error_pages) >= 1) 446 | 447 | def test_site_gevent_crawler_plain(self): 448 | if not has_gevent(): 449 | return 450 | # TODO test gevent.
Cannot use threaded simple http server :-( 451 | self.assertTrue(True) 452 | 453 | def test_api(self): 454 | url = self.get_url("/index.html") 455 | 456 | site = api.crawl(url) 457 | self.assertEqual(11, len(site.pages)) 458 | self.assertEqual(1, len(site.error_pages)) 459 | 460 | def test_api_with_options(self): 461 | url = self.get_url("/index.html") 462 | 463 | site = api.crawl_with_options([url], {"run-once": True, "workers": 2}) 464 | self.assertEqual(8, len(site.pages)) 465 | self.assertEqual(0, len(site.error_pages)) 466 | 467 | def test_api_with_options_2(self): 468 | site = self._run_crawler_plain( 469 | ThreadSiteCrawler, 470 | ["--prefer-server-encoding", "--header", "\"XKey: XValue\"", 471 | "--header", "\"XKey2: XValue2\"", "--run-once"], "/index.html") 472 | self.assertEqual(8, len(site.pages)) 473 | self.assertEqual(0, len(site.error_pages)) 474 | 475 | def test_unicode(self): 476 | site = self._run_crawler_plain( 477 | ThreadSiteCrawler, ["--prefer-server-encoding"], "/é.html") 478 | # 2 pages: é.html plus the single page it links to 479 | self.assertEqual(2, len(site.pages)) 480 | self.assertEqual(0, len(site.error_pages)) 481 | 482 | def test_bad_tel_link(self): 483 | site = self._run_crawler_plain( 484 | ThreadSiteCrawler, ["--ignore-bad-tel-urls"], "/badtel.html") 485 | # root + one page linked. bad tel link and tel link are ignored. 486 | self.assertEqual(2, len(site.pages)) 487 | self.assertEqual(0, len(site.error_pages)) 488 | 489 | if sys.version_info[:2] > (2, 6): 490 | site = self._run_crawler_plain( 491 | ThreadSiteCrawler, [], "/badtel.html") 492 | # root + one page + one bad tel link. One correct tel link ignored 493 | self.assertEqual(3, len(site.pages)) 494 | self.assertEqual(1, len(site.error_pages)) 495 | -------------------------------------------------------------------------------- /pylinkvalidator/urlutil.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Contains URL parsing, cleaning, and encoding utilities. 4 | """ 5 | from __future__ import unicode_literals, absolute_import 6 | 7 | import re 8 | 9 | from pylinkvalidator.compat import urlparse, quote 10 | 11 | 12 | SCHEME_HTTP = "http" 13 | SCHEME_HTTPS = "https" 14 | SUPPORTED_SCHEMES = (SCHEME_HTTP, SCHEME_HTTPS) 15 | 16 | 17 | NOT_LINK = [ 18 | 'data', 19 | '#', 20 | ] 21 | 22 | 23 | def is_link(url): 24 | """Return True if the url is not base64 data or a local ref (#)""" 25 | for prefix in NOT_LINK: 26 | if url.startswith(prefix): 27 | return False 28 | return True 29 | 30 | 31 | def get_clean_url_split(url): 32 | """Returns a clean SplitResult with a scheme and a valid path 33 | 34 | :param url: The url to clean 35 | :rtype: A urlparse.SplitResult 36 | """ 37 | if not url: 38 | raise ValueError('The URL must not be empty') 39 | split_result = urlparse.urlsplit(url) 40 | 41 | if not split_result.scheme: 42 | if split_result.netloc: 43 | url = SCHEME_HTTP + ":" + url 44 | else: 45 | url = SCHEME_HTTP + "://" + url 46 | split_result = urlparse.urlsplit(url) 47 | 48 | split_result = convert_iri_to_uri(split_result) 49 | 50 | return split_result 51 | 52 | 53 | def convert_iri_to_uri(url_split): 54 | """Attempts to convert a potential IRI to a URI. 55 | 56 | IRIs may contain non-ascii characters. 57 | """ 58 | new_parts = [] 59 | for i, part in enumerate(url_split): 60 | if i == 1: 61 | # domain name 62 | new_parts.append(part.encode('idna').decode('ascii')) 63 | else: 64 | # other parts such as path or query string.
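# Illustration (added note, not in the original source): a path such as "/é.html"
# contains the UTF-8 byte pair 0xC3 0xA9 for é, so url_encode_non_ascii below
# rewrites it as "/%C3%A9.html"; a non-ascii host name is instead converted by
# the IDNA branch above.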
65 | new_parts.append(url_encode_non_ascii(part)) 66 | return urlparse.SplitResult(*new_parts) 67 | 68 | 69 | def url_encode_non_ascii(url_part): 70 | """For each byte in url_part, if the byte is outside the ascii range, quote 71 | the byte. UTF-8 characters that take more than one byte are correctly 72 | converted by this technique. 73 | 74 | We do not quote the whole url part because it might contain already quoted 75 | characters, which would then be double-quoted. 76 | 77 | The url part is encoded to utf-8 and then decoded back to ascii, which 78 | might not always work if there is mixed or bad encoding. 79 | """ 80 | return re.sub( 81 | b'[\x80-\xFF]', 82 | lambda match: quote(match.group(0)).encode("utf-8"), 83 | url_part.encode("utf-8")).decode("ascii") 84 | 85 | 86 | def get_absolute_url_split(url, base_url_split): 87 | """Returns a SplitResult containing the new URL. 88 | 89 | :param url: The url (relative or absolute). 90 | :param base_url_split: The SplitResult of the base URL. 91 | :rtype: A SplitResult 92 | """ 93 | new_url = urlparse.urljoin(base_url_split.geturl(), url) 94 | 95 | return get_clean_url_split(new_url) 96 | 97 | 98 | def is_similar_url_split(url_split_1, url_split_2): 99 | """Returns True if the two url splits share 100 | the same path and netloc. 101 | 102 | Also returns True if one of the url splits does not have a netloc and both 103 | share the same path. 104 | """ 105 | if not url_split_1.netloc or not url_split_2.netloc: 106 | return url_split_1.path == url_split_2.path 107 | else: 108 | return url_split_1.path == url_split_2.path and\ 109 | url_split_1.netloc == url_split_2.netloc 110 | 111 | 112 | def is_bad_tel_url_split(url_split): 113 | """Returns True if the URL is using a badly formed tel scheme 114 | that is not detected by Python urlparse. 115 | """ 116 | return url_split.netloc.startswith("tel:") or\ 117 | url_split.path.startswith("/tel:") 118 | 119 | 120 | def is_supported_scheme(url_split, ignore_bad_tel_urls=False): 121 | """Returns True if the URL has a supported scheme and can be crawled. 122 | """ 123 | if url_split.scheme not in SUPPORTED_SCHEMES: 124 | return False 125 | elif ignore_bad_tel_urls and is_bad_tel_url_split(url_split): 126 | # issue #16 127 | return False 128 | return True 129 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.2.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | try: 4 | from setuptools import setup 5 | except ImportError: 6 | from distutils.core import setup 7 | 8 | import sys 9 | 10 | version = __import__('pylinkvalidator').__version__ 11 | 12 | if sys.version_info[0] >= 3: 13 | requires = ['beautifulsoup4>=4.2.0'] 14 | else: 15 | requires = [] 16 | 17 | setup( 18 | name='pylinkvalidator', 19 | version=version, 20 | description='Simple crawler that detects link errors such as 404 and 500.', 21 | long_description=''' 22 | pylinkvalidator is a simple crawler that traverses a web site and reports 23 | errors (e.g., 500 and 404 errors) encountered. The crawler can try to download 24 | resources like images.
25 | ''', 26 | author='Barthelemy Dagenais', 27 | author_email='barthelemy@infobart.com', 28 | license='BSD License', 29 | url='https://github.com/bartdag/pylinkvalidator', 30 | packages=['pylinkvalidator', 'pylinkvalidator.included', 31 | 'pylinkvalidator.included.bs4', 32 | 'pylinkvalidator.included.bs4.builder'], 33 | scripts=['pylinkvalidator/bin/pylinkvalidate.py'], 34 | classifiers=[ 35 | 'Environment :: Console', 36 | 'Intended Audience :: Developers', 37 | 'License :: OSI Approved :: BSD License', 38 | 'Operating System :: OS Independent', 39 | 'Programming Language :: Python', 40 | 'Programming Language :: Python :: 2', 41 | 'Programming Language :: Python :: 2.6', 42 | 'Programming Language :: Python :: 2.7', 43 | 'Programming Language :: Python :: 3', 44 | 'Programming Language :: Python :: 3.3', 45 | 'Programming Language :: Python :: 3.4', 46 | 'Programming Language :: Python :: 3.5', 47 | 'Programming Language :: Python :: 3.6', 48 | 'Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking', 49 | 'Topic :: Utilities', 50 | ], 51 | install_requires=requires, 52 | ) 53 | --------------------------------------------------------------------------------
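
A minimal sketch of driving pylinkvalidator programmatically, based only on the api calls exercised in tests.py above (api.crawl, api.crawl_with_options, and the pages and error_pages attributes of the returned site object). The example.com URL is a placeholder, and the option keys mirror the long CLI flags, as in test_api_with_options:

    from pylinkvalidator import api

    # Crawl the site with the default options, as in test_api.
    site = api.crawl("http://www.example.com/")
    print("pages crawled: {0}".format(len(site.pages)))
    print("pages in error: {0}".format(len(site.error_pages)))

    # Check only the start page and the resources it links to, with two
    # workers, as in test_api_with_options.
    site = api.crawl_with_options(
        ["http://www.example.com/"], {"run-once": True, "workers": 2})
    if site.error_pages:
        print("broken links found: {0}".format(len(site.error_pages)))

The run-once option corresponds to the --run-once flag used throughout the test suite: the crawler checks the links of the start page but does not follow them any further.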