├── .gitignore
├── .travis.yml
├── AUTHORS.txt
├── CHANGELOG.rst
├── LICENSE.txt
├── README.rst
├── pylinkvalidator
│   ├── __init__.py
│   ├── api.py
│   ├── bin
│   │   └── pylinkvalidate.py
│   ├── compat.py
│   ├── crawler.py
│   ├── included
│   │   ├── __init__.py
│   │   └── bs4
│   │       ├── __init__.py
│   │       ├── builder
│   │       │   ├── __init__.py
│   │       │   ├── _html5lib.py
│   │       │   ├── _htmlparser.py
│   │       │   └── _lxml.py
│   │       ├── dammit.py
│   │       ├── diagnose.py
│   │       └── element.py
│   ├── models.py
│   ├── reporter.py
│   ├── testfiles
│   │   ├── a.html
│   │   ├── alone.html
│   │   ├── badtel.html
│   │   ├── c.html
│   │   ├── d.html
│   │   ├── depth
│   │   │   ├── 0.html
│   │   │   ├── 0b.html
│   │   │   ├── 1.html
│   │   │   ├── 2.html
│   │   │   ├── 3.html
│   │   │   └── root.html
│   │   ├── f.html
│   │   ├── index.html
│   │   ├── robots.txt
│   │   ├── sub
│   │   │   ├── b.html
│   │   │   ├── e.html
│   │   │   ├── small_image.gif
│   │   │   ├── style.css
│   │   │   └── test.js
│   │   ├── à.html
│   │   └── é.html
│   ├── tests.py
│   └── urlutil.py
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | pylinkvalidator.egg-info/
3 | dist/
4 | build/
5 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "2.6"
4 | - "2.7"
5 | - "3.4"
6 | - "3.6"
7 | install:
8 | - "pip install ."
9 | script: nosetests
10 | sudo: false
11 |
--------------------------------------------------------------------------------
/AUTHORS.txt:
--------------------------------------------------------------------------------
1 | Pylinkvalidator was originally created as part of pylinkchecker in 2013 by
2 | Barthelemy Dagenais while he was working at Xprima Inc. It was forked on
3 | June 24th, 2014 under the name pylinkvalidator.
4 |
5 | Here is an inevitably incomplete list of MUCH-APPRECIATED CONTRIBUTORS --
6 | people who have submitted patches, reported bugs, and generally made
7 | pylinkvalidator that much better:
8 |
9 | Arun Elias
10 | Jim Priest

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
257 | ...regex:Hello\s+World' http://example.com/``
258 |
259 | Check that robots.txt has an empty Disallow rule (a ``Disallow:`` line with no value):
260 | ``pylinkvalidate.py --check-content-once '/robots.txt,regex:^Disallow:\s*$' http://example.com/``
261 |
262 | Allow insecure content for HTTPS sites with certificate errors [SSL: CERTIFICATE_VERIFY_FAILED]
263 | ``pylinkvalidate.py --allow-insecure-content https://self-signed.example.com/``
264 |
265 |
266 | API Usage
267 | ---------
268 |
269 | To crawl a site from a single URL:
270 |
271 | .. code-block:: python
272 |
273 | from pylinkvalidator.api import crawl
274 | crawled_site = crawl("http://www.example.com/")
275 | number_of_crawled_pages = len(crawled_site.pages)
276 | number_of_errors = len(crawled_site.error_pages)
277 |
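A quick way to act on the result is to check ``is_ok``, which the ``Site``
class in ``crawler.py`` exposes along with ``error_pages`` and
``get_average_response_time()``. A minimal sketch building on the example
above:

.. code-block:: python

    from pylinkvalidator.api import crawl

    crawled_site = crawl("http://www.example.com/")
    if not crawled_site.is_ok:
        print("Found {0} pages with errors".format(
            len(crawled_site.error_pages)))
    print("Average response time: {0:.2f}s".format(
        crawled_site.get_average_response_time()))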
278 |
279 | To crawl a site and pass some configuration options (the same supported by the
280 | command line interface):
281 |
282 |
283 | .. code-block:: python
284 |
285 | from pylinkvalidator.api import crawl_with_options
286 | crawled_site = crawl_with_options(["http://www.example.com/"], {"run-once":
287 | True, "workers": 10})
288 | number_of_crawled_pages = len(crawled_site.pages)
289 | number_of_errors = len(crawled_site.error_pages)
290 |
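The pages in ``error_pages`` are keyed by URL split objects (the same objects
``crawler.py`` stores), so listing the failing URLs is straightforward. A
minimal sketch, reusing ``crawled_site`` from the example above:

.. code-block:: python

    for url_split in crawled_site.error_pages:
        print("Problem with {0}".format(url_split.geturl()))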
291 |
292 | FAQ and Troubleshooting
293 | -----------------------
294 |
295 | I cannot find pylinkvalidate.py on Windows with virtualenv
296 | This is a known problem with virtualenv on Windows: the interpreter used at
297 | the prompt is not the one from the virtualenv. Prefix pylinkvalidate.py with
298 | the full path: ``python c:\myvirtualenv\Scripts\pylinkvalidate.py``
299 |
300 | I see Exception KeyError ... module 'threading' when using --mode=green
301 | This output is generally harmless and is generated by gevent patching the
302 | Python threading module. If someone knows how to make it go away, patches are
303 | more than welcome :-)
304 |
305 |
306 | License
307 | -------
308 |
309 | This software is licensed under the `New BSD License`. See the `LICENSE` file
310 | for the full license text. It bundles the Beautiful Soup library, which is
311 | licensed under the MIT license.
312 |
--------------------------------------------------------------------------------
/pylinkvalidator/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Main pylinkvalidator package
4 | """
5 |
6 | __version__ = "0.3"
7 |
--------------------------------------------------------------------------------
/pylinkvalidator/api.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Contains a simple crawling API to use pylinkvalidator programmatically.
4 |
5 | We will do everything to keep functions in this module backward compatible
6 | across versions.
7 | """
8 | from __future__ import unicode_literals, absolute_import
9 |
10 | from pylinkvalidator.crawler import configure_logger, execute_from_config
11 | from pylinkvalidator.models import Config
12 |
13 |
14 | def crawl(url):
15 | """Crawls a URL and returns a pylinkvalidator.crawler.Site instance.
16 |
17 | :rtype: A pylinkvalidator.crawler.Site instance
18 | """
19 | config = Config()
20 | config.parse_api_config([url])
21 | logger = configure_logger(config)
22 | crawler = execute_from_config(config, logger)
23 |
24 | return crawler.site
25 |
26 |
27 | def crawl_with_options(urls, options_dict=None, logger_builder=None):
28 | """Crawls URLs with provided options and logger.
29 |
30 | :param options_dict: Must contain the long name of the command line
31 | options. (optional)
32 |
33 | :param logger_builder: Function that will be called to instantiate a
34 | logger. (optional)
35 |
36 | :rtype: A pylinkvalidator.crawler.Site instance
37 | """
38 |
39 | config = Config()
40 |
41 | config.parse_api_config(urls, options_dict)
42 |
43 | if not logger_builder:
44 | logger = configure_logger(config)
45 | else:
46 | logger = logger_builder()
47 |
48 | # TODO In the future, we will pass the logger builder and not the logger
49 | # to enable the ProcessSiteCrawler to instantiate its own custom logger.
50 | crawler = execute_from_config(config, logger)
51 |
52 | return crawler.site
53 |
--------------------------------------------------------------------------------
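A hedged usage sketch for the ``logger_builder`` parameter documented above:
any callable that returns an object with the standard ``logging.Logger``
interface should work, and the option names are the long CLI names, as the
docstring states. The option values shown are only an example.

    import logging

    from pylinkvalidator.api import crawl_with_options

    def build_logger():
        # Called once by crawl_with_options instead of configure_logger().
        logger = logging.getLogger("my_crawl")
        logger.addHandler(logging.StreamHandler())
        logger.setLevel(logging.INFO)
        return logger

    site = crawl_with_options(
        ["http://www.example.com/"],
        options_dict={"workers": 4},
        logger_builder=build_logger)
    print(len(site.pages), len(site.error_pages))
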
/pylinkvalidator/bin/pylinkvalidate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from pylinkvalidator import crawler
4 |
5 | if __name__ == "__main__":
6 | crawler.execute_from_command_line()
7 |
--------------------------------------------------------------------------------
/pylinkvalidator/compat.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # flake8: noqa
3 | """
4 | Contains the compatibility layer for python 2 & 3
5 | """
6 | from __future__ import unicode_literals, absolute_import
7 |
8 | import sys
9 |
10 | if sys.version_info[0] < 3:
11 | range = xrange
12 | import urlparse
13 | from urllib import quote
14 | import SimpleHTTPServer
15 | import SocketServer
16 | from urllib2 import HTTPError
17 | import Queue
18 | unicode = unicode
19 |
20 | def get_content_type(m):
21 | return m.gettype()
22 |
23 | def get_charset(m):
24 | return m.getparam("charset")
25 |
26 | def get_safe_str(s):
27 | return s.encode("utf-8")
28 |
29 | from StringIO import StringIO
30 | else:
31 | range = range
32 | import urllib.parse as urlparse
33 | from urllib.parse import quote
34 | import http.server as SimpleHTTPServer
35 | import socketserver as SocketServer
36 | from urllib.error import HTTPError
37 | import queue as Queue
38 | unicode = str
39 |
40 | def get_content_type(m):
41 | return m.get_content_type()
42 |
43 | def get_charset(m):
44 | return m.get_content_charset()
45 |
46 | def get_safe_str(s):
47 | return s
48 | from io import StringIO
49 |
50 | try:
51 | from logging import NullHandler
52 | except ImportError:
53 | from logging import Handler
54 |
55 | class NullHandler(Handler):
56 | def emit(self, record):
57 | pass
58 |
59 | def handle(self, record):
60 | pass
61 |
62 | def createLock(self):
63 | return None
64 |
65 |
66 | def get_url_open():
67 | # Not automatically imported to allow monkey patching.
68 | if sys.version_info[0] < 3:
69 | from urllib2 import urlopen
70 | else:
71 | from urllib.request import urlopen
72 | return urlopen
73 |
74 |
75 | def get_url_request():
76 | if sys.version_info[0] < 3:
77 | from urllib2 import Request
78 | else:
79 | from urllib.request import Request
80 | return Request
81 |
--------------------------------------------------------------------------------
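A short sketch of how the rest of the code base consumes this shim, mirroring
the imports at the top of crawler.py; every name used below is defined in
compat.py above:

    from pylinkvalidator import compat
    from pylinkvalidator.compat import get_url_open, unicode

    queue = compat.Queue.Queue()  # same queue class on Python 2 and 3
    urlopen = get_url_open()      # resolved lazily so gevent can patch it
    print(unicode("unicode on Python 2, str on Python 3"))
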
/pylinkvalidator/crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Contains the crawling logic.
4 | """
5 | from __future__ import unicode_literals, absolute_import
6 |
7 | import base64
8 | from collections import defaultdict
9 | import logging
10 | import sys
11 | import time
12 |
13 | from pylinkvalidator.included.bs4 import BeautifulSoup, UnicodeDammit
14 |
15 | import pylinkvalidator.compat as compat
16 | from pylinkvalidator.compat import (
17 | range, HTTPError, get_url_open, unicode,
18 | get_content_type, get_url_request, get_charset)
19 | from pylinkvalidator.models import (
20 | Config, WorkerInit, Response, PageCrawl,
21 | ExceptionStr, Link, SitePage, WorkerInput, TYPE_ATTRIBUTES, HTML_MIME_TYPE,
22 | MODE_THREAD, MODE_PROCESS, MODE_GREEN, WHEN_ALWAYS, UTF8Class,
23 | PageStatus, PageSource, PAGE_QUEUED, PAGE_CRAWLED, VERBOSE_QUIET,
24 | VERBOSE_NORMAL, LazyLogParam, PREFIX_ALL)
25 | from pylinkvalidator.reporter import report
26 | from pylinkvalidator.urlutil import (
27 | get_clean_url_split, get_absolute_url_split,
28 | is_link, is_similar_url_split, is_supported_scheme)
29 |
30 |
31 | WORK_DONE = '__WORK_DONE__'
32 |
33 |
34 | def get_logger(propagate=False):
35 | """Returns a logger."""
36 | root_logger = logging.getLogger()
37 |
38 | logger = logging.getLogger(__name__)
39 |
40 | handler = logging.StreamHandler()
41 |
42 | formatter = logging.Formatter(
43 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
44 | handler.setFormatter(formatter)
45 |
46 | if root_logger.level != logging.CRITICAL:
47 | logger.addHandler(handler)
48 | logger.propagate = propagate
49 | else:
50 | logger.addHandler(compat.NullHandler())
51 |
52 | return logger
53 |
54 |
55 | class SiteCrawler(object):
56 | """Main crawler/orchestrator"""
57 |
58 | def __init__(self, config, logger):
59 | self.config = config
60 | self.start_url_splits = list(config.start_url_splits)
61 | self.workers = []
62 | self.input_queue = self.build_queue(config)
63 | self.output_queue = self.build_queue(config)
64 | self.logger = logger
65 | self.site = Site(self.start_url_splits, config, self.logger)
66 |
67 | def build_logger(self):
68 | return self.logger
69 |
70 | def crawl(self):
71 | worker_init = WorkerInit(
72 | self.config.worker_config, self.input_queue,
73 | self.output_queue, self.build_logger())
74 | self.workers = self.get_workers(self.config, worker_init)
75 |
76 | queue_size = len(self.start_url_splits)
77 | for start_url_split in self.start_url_splits:
78 | self.input_queue.put(
79 | WorkerInput(
80 | start_url_split, True, 0, start_url_split.netloc,
81 | self.config.content_check),
82 | False)
83 |
84 | self.start_workers(self.workers, self.input_queue, self.output_queue)
85 |
86 | self.start_progress()
87 |
88 | while True:
89 | page_crawl = self.output_queue.get()
90 | queue_size -= 1
91 | new_worker_inputs = self.process_page_crawl(page_crawl)
92 |
93 | # We only process new pages if we did not exceed configured depth
94 | for worker_input in new_worker_inputs:
95 | queue_size += 1
96 | self.input_queue.put(worker_input, False)
97 |
98 | self.progress(page_crawl, len(self.site.pages), queue_size)
99 |
100 | if queue_size <= 0:
101 | self.stop_workers(self.workers, self.input_queue,
102 | self.output_queue)
103 | self.stop_progress()
104 | return self.site
105 |
106 | def start_progress(self):
107 | if self.config.options.progress:
108 | print("Starting crawl...")
109 |
110 | def stop_progress(self):
111 | if self.config.options.progress:
112 | print("Crawling Done...\n")
113 |
114 | def progress(self, page_crawl, done_size, queue_size):
115 | if not self.config.options.progress:
116 | return
117 |
118 | total = done_size + queue_size
119 | percent = float(done_size) / float(total) * 100.0
120 |
121 | url = ""
122 | if page_crawl.final_url_split:
123 | url = page_crawl.final_url_split.geturl()
124 | elif page_crawl.original_url_split:
125 | url = page_crawl.original_url_split.geturl()
126 |
127 | status = page_crawl.status
128 | if not status:
129 | status = "error"
130 |
131 | print("{0} - {1} ({2} of {3} - {4:.0f}%)".format(
132 | status, url, done_size, total, percent))
133 |
134 | def build_queue(self, config):
135 | """Returns an object implementing the Queue interface."""
136 | raise NotImplementedError()
137 |
138 | def get_workers(self, config, worker_init):
139 | """Returns a sequence of workers of the desired type."""
140 | raise NotImplementedError()
141 |
142 | def start_workers(self, workers, input_queue, output_queue):
143 | """Start the workers."""
144 | raise NotImplementedError()
145 |
146 | def stop_workers(self, workers, input_queue, output_queue):
147 | """Stops the workers."""
148 | for worker in workers:
149 | input_queue.put(WORK_DONE)
150 |
151 | def process_page_crawl(self, page_crawl):
152 | """Returns a sequence of SplitResult to crawl."""
153 | return self.site.add_crawled_page(page_crawl)
154 |
155 |
156 | class ThreadSiteCrawler(SiteCrawler):
157 | """Site Crawler with thread workers."""
158 |
159 | def build_queue(self, config):
160 | return compat.Queue.Queue()
161 |
162 | def get_workers(self, config, worker_init):
163 | from threading import Thread
164 | workers = []
165 | for _ in range(config.worker_size):
166 | workers.append(
167 | Thread(target=crawl_page, kwargs={'worker_init': worker_init}))
168 |
169 | return workers
170 |
171 | def start_workers(self, workers, input_queue, output_queue):
172 | for worker in workers:
173 | worker.start()
174 |
175 |
176 | class ProcessSiteCrawler(SiteCrawler):
177 | """Site Crawler with process workers."""
178 |
179 | def __init__(self, *args, **kwargs):
180 | import multiprocessing
181 | self.manager = multiprocessing.Manager()
182 | self.ProcessClass = multiprocessing.Process
183 | super(ProcessSiteCrawler, self).__init__(*args, **kwargs)
184 |
185 | def build_logger(self):
186 | """We do not want to share a logger."""
187 | return None
188 |
189 | def build_queue(self, config):
190 | return self.manager.Queue()
191 |
192 | def get_workers(self, config, worker_init):
193 | workers = []
194 | for _ in range(config.worker_size):
195 | workers.append(self.ProcessClass(
196 | target=crawl_page, kwargs={'worker_init': worker_init}))
197 |
198 | return workers
199 |
200 | def start_workers(self, workers, input_queue, output_queue):
201 | for worker in workers:
202 | worker.start()
203 |
204 |
205 | class GreenSiteCrawler(SiteCrawler):
206 | """Site Crawler with green thread workers."""
207 |
208 | def __init__(self, *args, **kwargs):
209 | from gevent import monkey, queue, Greenlet
210 | # TODO thread=false should be used to remove useless exception
211 | # But weird behavior sometimes happen when it is not patched...
212 | monkey.patch_all()
213 | self.QueueClass = queue.Queue
214 | self.GreenClass = Greenlet
215 | super(GreenSiteCrawler, self).__init__(*args, **kwargs)
216 |
217 | def build_queue(self, config):
218 | return self.QueueClass()
219 |
220 | def get_workers(self, config, worker_init):
221 | workers = []
222 | for _ in range(config.worker_size):
223 | workers.append(self.GreenClass(
224 | crawl_page, worker_init=worker_init))
225 |
226 | return workers
227 |
228 | def start_workers(self, workers, input_queue, output_queue):
229 | for worker in workers:
230 | worker.start()
231 |
232 |
233 | class PageCrawler(object):
234 | """Worker that parses a page and extracts links"""
235 |
236 | def __init__(self, worker_init):
237 | self.worker_config = worker_init.worker_config
238 | self.input_queue = worker_init.input_queue
239 | self.output_queue = worker_init.output_queue
240 | self.urlopen = get_url_open()
241 | self.request_class = get_url_request()
242 | self.logger = worker_init.logger
243 | if not self.logger:
244 | # Get a new one!
245 | self.logger = get_logger()
246 |
247 | # We do this here to allow patching by gevent
248 | import socket
249 | self.timeout_exception = socket.timeout
250 |
251 | self.auth_header = None
252 |
253 | if self.worker_config.username and self.worker_config.password:
254 | base64string = unicode(
255 | base64.encodestring(
256 | '{0}:{1}'.format(
257 | self.worker_config.username,
258 | self.worker_config.password)
259 | .encode("utf-8")), "utf-8")
260 | self.auth_header = ("Authorization",
261 | "Basic {0}".format(base64string))
262 |
263 | def crawl_page_forever(self):
264 | """Starts page crawling loop for this worker."""
265 |
266 | while True:
267 | worker_input = self.input_queue.get()
268 |
269 | if worker_input == WORK_DONE:
270 | # No more work! Pfew!
271 | return
272 | else:
273 | page_crawl = self._crawl_page(worker_input)
274 | self.output_queue.put(page_crawl)
275 |
276 | def _crawl_page(self, worker_input):
277 | page_crawl = None
278 | erroneous_content = []
279 | missing_content = []
280 | url_split_to_crawl = worker_input.url_split
281 |
282 | try:
283 | response = open_url(
284 | self.urlopen, self.request_class,
285 | url_split_to_crawl.geturl(), self.worker_config.timeout,
286 | self.timeout_exception, self.auth_header,
287 | extra_headers=self.worker_config.extra_headers,
288 | logger=self.logger)
289 |
290 | if response.exception:
291 | if response.status:
292 | # This is a http error. Good.
293 | page_crawl = PageCrawl(
294 | original_url_split=url_split_to_crawl,
295 | final_url_split=None, status=response.status,
296 | is_timeout=False, is_redirect=False, links=[],
297 | exception=None, is_html=False,
298 | depth=worker_input.depth,
299 | response_time=response.response_time,
300 | process_time=None,
301 | site_origin=worker_input.site_origin)
302 | elif response.is_timeout:
303 | # This is a timeout. No need to wrap the exception
304 | page_crawl = PageCrawl(
305 | original_url_split=url_split_to_crawl,
306 | final_url_split=None, status=None,
307 | is_timeout=True, is_redirect=False, links=[],
308 | exception=None, is_html=False,
309 | depth=worker_input.depth,
310 | response_time=response.response_time,
311 | process_time=0,
312 | site_origin=worker_input.site_origin)
313 | else:
314 | # Something bad happened when opening the url
315 | exception = ExceptionStr(
316 | unicode(type(response.exception)),
317 | unicode(response.exception))
318 | page_crawl = PageCrawl(
319 | original_url_split=url_split_to_crawl,
320 | final_url_split=None, status=None,
321 | is_timeout=False, is_redirect=False, links=[],
322 | exception=exception, is_html=False,
323 | depth=worker_input.depth,
324 | response_time=response.response_time,
325 | process_time=0,
326 | site_origin=worker_input.site_origin)
327 | else:
328 | final_url_split = get_clean_url_split(response.final_url)
329 |
330 | message = response.content.info()
331 | mime_type = get_content_type(message)
332 | if self.worker_config.prefer_server_encoding:
333 | charset = get_charset(message)
334 | else:
335 | charset = None
336 | links = []
337 |
338 | is_html = mime_type == HTML_MIME_TYPE
339 | process_time = None
340 |
341 | if is_html and worker_input.should_crawl:
342 | start = time.time()
343 | html_soup = BeautifulSoup(
344 | response.content, self.worker_config.parser,
345 | from_encoding=charset)
346 | links = self.get_links(html_soup, final_url_split)
347 | if self._has_content_to_check(worker_input):
348 | (missing_content, erroneous_content) =\
349 | self.check_content(
350 | unicode(html_soup), html_soup,
351 | url_split_to_crawl,
352 | final_url_split, worker_input.content_check)
353 | process_time = time.time() - start
354 | else:
355 | self.logger.debug(
356 | "Won't crawl %s. MIME Type: %s. Should crawl: %s",
357 | final_url_split, mime_type,
358 | worker_input.should_crawl)
359 | if self._has_content_to_check(worker_input):
360 | text_content = self.get_text_content(
361 | response.content.read(), charset)
362 | (missing_content, erroneous_content) =\
363 | self.check_content(
364 | text_content, None, url_split_to_crawl,
365 | final_url_split, worker_input.content_check)
366 |
367 | page_crawl = PageCrawl(
368 | original_url_split=url_split_to_crawl,
369 | final_url_split=final_url_split, status=response.status,
370 | is_timeout=False, is_redirect=response.is_redirect,
371 | links=links, exception=None, is_html=is_html,
372 | depth=worker_input.depth,
373 | response_time=response.response_time,
374 | process_time=process_time,
375 | site_origin=worker_input.site_origin,
376 | missing_content=missing_content,
377 | erroneous_content=erroneous_content)
378 | except Exception as exc:
379 | exception = ExceptionStr(unicode(type(exc)), unicode(exc))
380 | page_crawl = PageCrawl(
381 | original_url_split=url_split_to_crawl,
382 | final_url_split=None, status=None,
383 | is_timeout=False, is_redirect=False, links=[],
384 | exception=exception, is_html=False,
385 | depth=worker_input.depth,
386 | response_time=None,
387 | process_time=None,
388 | site_origin=worker_input.site_origin)
389 | self.logger.exception("Exception occurred while crawling a page.")
390 |
391 | return page_crawl
392 |
393 | def _has_content_to_check(self, worker_input):
394 | return worker_input.content_check and\
395 | worker_input.content_check.has_something_to_check
396 |
397 | def get_text_content(self, binary_blob, charset):
398 | """Retrieves unicode content from response binary blob.
399 | """
400 | override_encodings = []
401 | if charset:
402 | override_encodings.append(charset)
403 |
404 | return UnicodeDammit(binary_blob, override_encodings).unicode_markup
405 |
406 | def check_content(
407 | self, response_content, html_soup, original_url_split,
408 | final_url_split, content_check):
409 | """Ensures that the specified content is present (or absent).
410 | """
411 | missing_content = []
412 | erroneous_content = []
413 |
414 | if html_soup:
415 | for content, found in self.check_html_content_single(
416 | content_check.html_presence, html_soup, original_url_split,
417 | final_url_split):
418 | if not found:
419 | missing_content.append(content)
420 |
421 | if html_soup:
422 | for content, found in self.check_html_content_single(
423 | content_check.html_absence, html_soup, original_url_split,
424 | final_url_split):
425 | if found:
426 | erroneous_content.append(content)
427 |
428 | for content, found in self.check_text_content_single(
429 | content_check.text_presence, response_content,
430 | original_url_split, final_url_split):
431 | if not found:
432 | missing_content.append(content)
433 |
434 | for content, found in self.check_text_content_single(
435 | content_check.text_absence, response_content,
436 | original_url_split, final_url_split):
437 | if found:
438 | erroneous_content.append(content)
439 |
440 | return (missing_content, erroneous_content)
441 |
442 | def check_html_content_single(
443 | self, html_to_check, html_soup, original_url_split,
444 | final_url_split):
445 | """Returns a list of tuple (content, presence) indicating whether an
446 | html tag was present or not in the source.
447 | """
448 | content = []
449 |
450 | for key, html_check_list in html_to_check.items():
451 | if key == PREFIX_ALL or\
452 | is_similar_url_split(key, original_url_split) or\
453 | is_similar_url_split(key, final_url_split):
454 | # we check
455 | for html_check in html_check_list:
456 | kwargs = {}
457 | if html_check.attrs:
458 | kwargs["attrs"] = html_check.attrs
459 | if html_check.content:
460 | # XXX Use text because the included bs4 does not use
461 | # the new string parameter and text is backward
462 | # compatible.
463 | kwargs["text"] = html_check.content
464 | found = html_soup.find(
465 | html_check.tag, **kwargs) is not None
466 | content.append((str(html_check), found))
467 |
468 | return content
469 |
470 | def check_text_content_single(
471 | self, text_content_to_check, full_text, original_url_split,
472 | final_url_split):
473 | """Returns a list of tuple (content, presence) indicating whether an
474 | html tag was present or not in the source.
475 | """
476 | content = []
477 |
478 | for key, text_check_list in text_content_to_check.items():
479 | if key == PREFIX_ALL or\
480 | is_similar_url_split(key, original_url_split) or\
481 | is_similar_url_split(key, final_url_split):
482 | # we check
483 | for text_check in text_check_list:
484 | try:
485 | match = text_check.search(full_text)
486 | content.append((text_check.pattern, match is not None))
487 | except AttributeError:
488 | found = text_check in full_text
489 | content.append((text_check, found))
490 |
491 | return content
492 |
493 | def get_links(self, html_soup, original_url_split):
494 | """Gets links for desired types (e.g., a, link, img, script)
495 |
496 | :param html_soup: The page parsed by BeautifulSoup
497 | :param original_url_split: The URL of the page used to resolve relative
498 | links.
499 | :rtype: A sequence of Link objects
500 | """
501 |
502 | # This is a weird html tag that defines the base URL of a page.
503 | base_url_split = original_url_split
504 |
505 | bases = html_soup.find_all('base')
506 | if bases:
507 | base = bases[0]
508 | if 'href' in base.attrs:
509 | base_url_split = get_clean_url_split(base['href'])
510 |
511 | links = []
512 | for element_type in self.worker_config.types:
513 | if element_type not in TYPE_ATTRIBUTES:
514 | raise Exception(
515 | "Unknown element type: {0}".format(element_type))
516 | attribute = TYPE_ATTRIBUTES[element_type]
517 | element_links = html_soup.find_all(element_type)
518 | links.extend(self._get_links(
519 | element_links, attribute, base_url_split, original_url_split))
520 | return links
521 |
522 | def _get_links(self, elements, attribute, base_url_split,
523 | original_url_split):
524 | links = []
525 | for element in elements:
526 | if attribute in element.attrs:
527 | url = element[attribute]
528 |
529 | if not self.worker_config.strict_mode:
530 | url = url.strip()
531 |
532 | if not is_link(url):
533 | continue
534 | abs_url_split = get_absolute_url_split(url, base_url_split)
535 |
536 | if not is_supported_scheme(
537 | abs_url_split, self.worker_config.ignore_bad_tel_urls):
538 | continue
539 |
540 | link = Link(
541 | type=unicode(element.name), url_split=abs_url_split,
542 | original_url_split=original_url_split,
543 | source_str=unicode(element))
544 | links.append(link)
545 |
546 | return links
547 |
548 |
549 | class Site(UTF8Class):
550 | """Contains all the visited and visiting pages of a site.
551 |
552 | This class is NOT thread-safe and should only be accessed by one thread at
553 | a time!
554 | """
555 |
556 | def __init__(self, start_url_splits, config, logger=None):
557 | self.start_url_splits = start_url_splits
558 |
559 | self.pages = {}
560 | """Map of url:SitePage"""
561 |
562 | self.multi_pages = defaultdict(dict)
563 | """Map of netloc:map(url:SitePage). Only used in multi sites mode."""
564 |
565 | self.error_pages = {}
566 | """Map of url:SitePage with is_ok=False"""
567 |
568 | self.multi_error_pages = defaultdict(dict)
569 | """Map of netloc:map(url:SitePage). Only used in multi sites
570 | mode."""
571 |
572 | self.page_statuses = {}
573 | """Map of url:PageStatus (PAGE_QUEUED, PAGE_CRAWLED)"""
574 |
575 | self.config = config
576 |
577 | self.logger = logger
578 |
579 | for start_url_split in self.start_url_splits:
580 | self.page_statuses[start_url_split] = PageStatus(PAGE_QUEUED, [])
581 |
582 | def collect_multi_sites(self):
583 | """Collects page results and maps them to their respective domain in
584 | multi_pages and multi_error_pages.
585 | """
586 | for url, page in self.pages.items():
587 | self.multi_pages[page.site_origin][url] = page
588 |
589 | for url, page in self.error_pages.items():
590 | self.multi_error_pages[page.site_origin][url] = page
591 |
592 | @property
593 | def is_ok(self):
594 | """Returns True if there is no error page."""
595 | return len(self.error_pages) == 0
596 |
597 | def add_crawled_page(self, page_crawl):
598 | """Adds a crawled page. Returns a list of url split to crawl"""
599 | if page_crawl.original_url_split not in self.page_statuses:
600 | self.logger.warning("Original URL not seen before!")
601 | return []
602 |
603 | status = self.page_statuses[page_crawl.original_url_split]
604 |
605 | # Mark it as crawled
606 | self.page_statuses[page_crawl.original_url_split] = PageStatus(
607 | PAGE_CRAWLED, None)
608 |
609 | if page_crawl.original_url_split in self.pages:
610 | self.logger.warning(
611 | "Original URL already crawled! Concurrency issue!")
612 | return []
613 |
614 | final_url_split = page_crawl.final_url_split
615 | if not final_url_split:
616 | # Happens on 404/500/timeout/error
617 | final_url_split = page_crawl.original_url_split
618 |
619 | if final_url_split in self.pages:
620 | # This means that we already processed this final page.
621 | # It's a redirect. Just add a source
622 | site_page = self.pages[final_url_split]
623 | site_page.add_sources(status.sources)
624 | else:
625 | # We never crawled this page before
626 | is_local = self.config.is_local(final_url_split)
627 | site_page = SitePage(
628 | final_url_split, page_crawl.status,
629 | page_crawl.is_timeout, page_crawl.exception,
630 | page_crawl.is_html, is_local,
631 | response_time=page_crawl.response_time,
632 | process_time=page_crawl.process_time,
633 | site_origin=page_crawl.site_origin,
634 | missing_content=page_crawl.missing_content,
635 | erroneous_content=page_crawl.erroneous_content)
636 | site_page.add_sources(status.sources)
637 | self.pages[final_url_split] = site_page
638 |
639 | if not site_page.is_ok:
640 | self.error_pages[final_url_split] = site_page
641 |
642 | return self.process_links(page_crawl)
643 |
644 | def process_links(self, page_crawl):
645 | links_to_process = []
646 |
647 | source_url_split = page_crawl.original_url_split
648 | if page_crawl.final_url_split:
649 | source_url_split = page_crawl.final_url_split
650 |
651 | for link in page_crawl.links:
652 | url_split = link.url_split
653 | if not self.config.should_download(url_split):
654 | self.logger.debug(
655 | "Won't download %s. Is local? %s",
656 | url_split,
657 | LazyLogParam(lambda: self.config.is_local(url_split)))
658 | continue
659 |
660 | page_status = self.page_statuses.get(url_split, None)
661 | page_source = PageSource(source_url_split, link.source_str)
662 |
663 | if not page_status:
664 | # We never encountered this url before
665 | self.page_statuses[url_split] = PageStatus(
666 | PAGE_QUEUED, [page_source])
667 | should_crawl = self.config.should_crawl(
668 | url_split, page_crawl.depth)
669 | links_to_process.append(WorkerInput(
670 | url_split, should_crawl, page_crawl.depth + 1,
671 | page_crawl.site_origin, self.config.content_check))
672 | elif page_status.status == PAGE_CRAWLED:
673 | # Already crawled. Add source
674 | if url_split in self.pages:
675 | self.pages[url_split].add_sources([page_source])
676 | else:
677 | # TODO the final url is different. need a way to link it...
678 | pass
679 | elif page_status.status == PAGE_QUEUED:
680 | # Already queued for crawling. Add source.
681 | page_status.sources.append(page_source)
682 |
683 | return links_to_process
684 |
685 | def get_average_response_time(self):
686 | """Computes the average response time of pages that returned an HTTP
687 | code (good or bad). Exceptions such as timeout are ignored.
688 | """
689 | response_time_sum = 0
690 | total = 0
691 | for page in self.pages.values():
692 | if page.response_time is not None:
693 | response_time_sum += page.response_time
694 | total += 1
695 |
696 | if total > 0:
697 | return float(response_time_sum) / float(total)
698 | else:
699 | return 0
700 |
701 | def get_average_process_time(self):
702 | """Computes the average process (parse) time of pages that returned an HTTP
703 | code (good or bad). Exceptions are ignored.
704 | """
705 | process_time_sum = 0
706 | total = 0
707 | for page in self.pages.values():
708 | if page.process_time is not None:
709 | process_time_sum += page.process_time
710 | total += 1
711 |
712 | if total > 0:
713 | return float(process_time_sum) / float(total)
714 | else:
715 | return 0
716 |
717 | def __unicode__(self):
718 | return "Site for {0}".format(self.start_url_splits)
719 |
720 |
721 | def crawl_page(worker_init):
722 | """Safe redirection to the page crawler"""
723 | page_crawler = PageCrawler(worker_init)
724 | page_crawler.crawl_page_forever()
725 |
726 |
727 | def open_url(open_func, request_class, url, timeout, timeout_exception,
728 | auth_header=None, extra_headers=None, logger=None):
729 | """Opens a URL and returns a Response object.
730 |
731 | All parameters are required to be able to use a patched version of the
732 | Python standard library (i.e., patched by gevent)
733 |
734 |     :param open_func: URL open function, typically urllib2.urlopen
735 | :param request_class: the request class to use
736 | :param url: the url to open
737 | :param timeout: number of seconds to wait before timing out
738 | :param timeout_exception: the exception thrown by open_func if a timeout
739 | occurs
740 | :param auth_header: authentication header
741 | :param extra_headers: dict of {Header: Value}
742 | :param logger: logger used to log exceptions
743 | :rtype: A Response object
744 | """
745 | try:
746 | request = request_class(url)
747 |
748 | if auth_header:
749 | request.add_header(auth_header[0], auth_header[1])
750 |
751 | if extra_headers:
752 | for header, value in extra_headers.items():
753 | request.add_header(header, value)
754 |
755 | start = time.time()
756 | output_value = open_func(request, timeout=timeout)
757 | stop = time.time()
758 | final_url = output_value.geturl()
759 | code = output_value.getcode()
760 | response = Response(
761 | content=output_value, status=code, exception=None,
762 | original_url=url, final_url=final_url,
763 | is_redirect=final_url != url, is_timeout=False,
764 | response_time=stop-start)
765 | except HTTPError as http_error:
766 | stop = time.time()
767 | code = http_error.code
768 | response = Response(
769 | content=None, status=code, exception=http_error,
770 | original_url=url, final_url=None, is_redirect=False,
771 | is_timeout=False, response_time=stop-start)
772 | except timeout_exception as t_exception:
773 | response = Response(
774 | content=None, status=None, exception=t_exception,
775 | original_url=url, final_url=None, is_redirect=False,
776 | is_timeout=True, response_time=None)
777 | except Exception as exc:
778 | if logger:
779 | logger.warning("Exception while opening an URL", exc_info=True)
780 | response = Response(
781 | content=None, status=None, exception=exc,
782 | original_url=url, final_url=None, is_redirect=False,
783 | is_timeout=False, response_time=None)
784 |
785 | return response
786 |
787 |
788 | def execute_from_command_line():
789 | """Runs the crawler and retrieves the configuration from the command
790 | line.
791 | """
792 | try:
793 | start = time.time()
794 | config = Config()
795 | config.parse_cli_config()
796 |
797 | logger = configure_logger(config)
798 | crawler = execute_from_config(config, logger)
799 |
800 | stop = time.time()
801 |
802 | if not crawler.site.is_ok or config.options.when == WHEN_ALWAYS:
803 | report(crawler.site, config, stop - start, logger)
804 |
805 | if not crawler.site.is_ok:
806 | sys.exit(1)
807 | except Exception as e:
808 | print(e)
809 | sys.exit(1)
810 |
811 |
812 | def configure_logger(config):
813 | """Configures a logger based on the configuration."""
814 | if config.options.verbose == VERBOSE_QUIET:
815 | logging.basicConfig(level=logging.CRITICAL)
816 | elif config.options.verbose == VERBOSE_NORMAL:
817 | logging.basicConfig(level=logging.WARNING)
818 | else:
819 | logging.basicConfig(level=logging.DEBUG)
820 |
821 | logger = get_logger()
822 |
823 | return logger
824 |
825 |
826 | def execute_from_config(config, logger):
827 | """Executes a crawler given a config and logger."""
828 | if not config.start_urls:
829 | raise Exception("At least one starting URL must be supplied.")
830 |
831 | if config.options.allow_insecure_content:
832 | # Ref: https://www.python.org/dev/peps/pep-0476/#opting-out
833 | import ssl
834 | try:
835 | _create_unverified_https_context = ssl._create_unverified_context
836 | except AttributeError:
837 | # Legacy Python that doesn't verify HTTPS certificates by default
838 | pass
839 | else:
840 | # Handle target environment that doesn't support HTTPS verification
841 | ssl._create_default_https_context = _create_unverified_https_context
842 |
843 | if config.options.mode == MODE_THREAD:
844 | crawler = ThreadSiteCrawler(config, logger)
845 | elif config.options.mode == MODE_PROCESS:
846 | crawler = ProcessSiteCrawler(config, logger)
847 | elif config.options.mode == MODE_GREEN:
848 | crawler = GreenSiteCrawler(config, logger)
849 |     else:
850 |         # Unknown mode: raise here so we don't fall through with crawler unbound.
851 |         raise Exception("Invalid crawling mode supplied.")
852 |
853 | crawler.crawl()
854 |
855 | if config.options.multi:
856 | crawler.site.collect_multi_sites()
857 |
858 | return crawler
859 |
--------------------------------------------------------------------------------
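A hedged sketch of calling the ``open_url`` helper defined in crawler.py above
on its own; ``socket.timeout`` is the timeout exception the crawler itself
passes, and the compat helpers supply the opener and request class. The URL
and timeout value are placeholders.

    import socket

    from pylinkvalidator.compat import get_url_open, get_url_request
    from pylinkvalidator.crawler import open_url

    response = open_url(
        get_url_open(), get_url_request(), "http://example.com/",
        timeout=5, timeout_exception=socket.timeout)
    print(response.status, response.is_redirect, response.response_time)
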
/pylinkvalidator/included/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bartdag/pylinkvalidator/aac5934d88a9c99d0e4f40a8884ad942b6b10ea0/pylinkvalidator/included/__init__.py
--------------------------------------------------------------------------------
/pylinkvalidator/included/bs4/__init__.py:
--------------------------------------------------------------------------------
1 | """Beautiful Soup
2 | Elixir and Tonic
3 | "The Screen-Scraper's Friend"
4 | http://www.crummy.com/software/BeautifulSoup/
5 |
6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a
7 | (possibly invalid) document into a tree representation. Beautiful Soup
8 | provides methods and Pythonic idioms that make it easy to
9 | navigate, search, and modify the parse tree.
10 |
11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml
12 | and/or html5lib is installed.
13 |
14 | For more than you ever wanted to know about Beautiful Soup, see the
15 | documentation:
16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/
17 | """
18 |
19 | from __future__ import absolute_import
20 | import sys
21 |
22 | __author__ = "Leonard Richardson (leonardr@segfault.org)"
23 | __version__ = "4.2.1"
24 | __copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
25 | __license__ = "MIT"
26 |
27 |
28 | use_system_version = False
29 |
30 | try:
31 | # The system-installed version has priority providing it is not an
32 | # earlier version. The embedded bs4 only works for Python 2.
33 | import bs4
34 | if (bs4.__version__.split('.') >= __version__.split('.')) or\
35 | sys.version_info[0] >= 3:
36 | from bs4 import *
37 |
38 | # Necessary for direct import in pylinkvalidator
39 | UnicodeDammit = bs4.UnicodeDammit
40 | use_system_version = True
41 | # Make sure we copy over the version. See #17071
42 | __version__ = bs4.__version__
43 | except ImportError:
44 | if sys.version_info[0] >= 3:
45 | raise
46 |
47 | if not use_system_version:
48 |
49 | __all__ = ['BeautifulSoup']
50 |
51 | import re
52 | import warnings
53 |
54 | from .builder import builder_registry
55 | from .dammit import UnicodeDammit
56 | from .element import (
57 | CData,
58 | Comment,
59 | DEFAULT_OUTPUT_ENCODING,
60 | Declaration,
61 | Doctype,
62 | NavigableString,
63 | PageElement,
64 | ProcessingInstruction,
65 | ResultSet,
66 | SoupStrainer,
67 | Tag,
68 | )
69 |
70 | # The very first thing we do is give a useful error if someone is
71 | # running this code under Python 3 without converting it.
72 | syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
73 |
74 | class BeautifulSoup(Tag):
75 | """
76 | This class defines the basic interface called by the tree builders.
77 |
78 | These methods will be called by the parser:
79 | reset()
80 | feed(markup)
81 |
82 | The tree builder may call these methods from its feed() implementation:
83 | handle_starttag(name, attrs) # See note about return value
84 | handle_endtag(name)
85 | handle_data(data) # Appends to the current data node
86 | endData(containerClass=NavigableString) # Ends the current data node
87 |
88 | No matter how complicated the underlying parser is, you should be
89 | able to build a tree using 'start tag' events, 'end tag' events,
90 | 'data' events, and "done with data" events.
91 |
92 |     If you encounter an empty-element tag (aka a self-closing tag,
93 |     like HTML's <br> tag), call handle_starttag and then
94 |     handle_endtag.
95 | """
96 | ROOT_TAG_NAME = u'[document]'
97 |
98 | # If the end-user gives no indication which tree builder they
99 | # want, look for one with these features.
100 | DEFAULT_BUILDER_FEATURES = ['html', 'fast']
101 |
102 | # Used when determining whether a text node is all whitespace and
103 | # can be replaced with a single space. A text node that contains
104 | # fancy Unicode spaces (usually non-breaking) should be left
105 | # alone.
106 | STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
107 |
108 | def __init__(self, markup="", features=None, builder=None,
109 | parse_only=None, from_encoding=None, **kwargs):
110 | """The Soup object is initialized as the 'root tag', and the
111 | provided markup (which can be a string or a file-like object)
112 | is fed into the underlying parser."""
113 |
114 | if 'convertEntities' in kwargs:
115 | warnings.warn(
116 | "BS4 does not respect the convertEntities argument to the "
117 | "BeautifulSoup constructor. Entities are always converted "
118 | "to Unicode characters.")
119 |
120 | if 'markupMassage' in kwargs:
121 | del kwargs['markupMassage']
122 | warnings.warn(
123 | "BS4 does not respect the markupMassage argument to the "
124 | "BeautifulSoup constructor. The tree builder is responsible "
125 | "for any necessary markup massage.")
126 |
127 | if 'smartQuotesTo' in kwargs:
128 | del kwargs['smartQuotesTo']
129 | warnings.warn(
130 | "BS4 does not respect the smartQuotesTo argument to the "
131 | "BeautifulSoup constructor. Smart quotes are always converted "
132 | "to Unicode characters.")
133 |
134 | if 'selfClosingTags' in kwargs:
135 | del kwargs['selfClosingTags']
136 | warnings.warn(
137 | "BS4 does not respect the selfClosingTags argument to the "
138 | "BeautifulSoup constructor. The tree builder is responsible "
139 | "for understanding self-closing tags.")
140 |
141 | if 'isHTML' in kwargs:
142 | del kwargs['isHTML']
143 | warnings.warn(
144 | "BS4 does not respect the isHTML argument to the "
145 | "BeautifulSoup constructor. You can pass in features='html' "
146 | "or features='xml' to get a builder capable of handling "
147 | "one or the other.")
148 |
149 | def deprecated_argument(old_name, new_name):
150 | if old_name in kwargs:
151 | warnings.warn(
152 | 'The "%s" argument to the BeautifulSoup constructor '
153 | 'has been renamed to "%s."' % (old_name, new_name))
154 | value = kwargs[old_name]
155 | del kwargs[old_name]
156 | return value
157 | return None
158 |
159 | parse_only = parse_only or deprecated_argument(
160 | "parseOnlyThese", "parse_only")
161 |
162 | from_encoding = from_encoding or deprecated_argument(
163 | "fromEncoding", "from_encoding")
164 |
165 | if len(kwargs) > 0:
166 | arg = kwargs.keys().pop()
167 | raise TypeError(
168 | "__init__() got an unexpected keyword argument '%s'" % arg)
169 |
170 | if builder is None:
171 | if isinstance(features, basestring):
172 | features = [features]
173 | if features is None or len(features) == 0:
174 | features = self.DEFAULT_BUILDER_FEATURES
175 | builder_class = builder_registry.lookup(*features)
176 | if builder_class is None:
177 | raise FeatureNotFound(
178 | "Couldn't find a tree builder with the features you "
179 | "requested: %s. Do you need to install a parser library?"
180 | % ",".join(features))
181 | builder = builder_class()
182 | self.builder = builder
183 | self.is_xml = builder.is_xml
184 | self.builder.soup = self
185 |
186 | self.parse_only = parse_only
187 |
188 | self.reset()
189 |
190 | if hasattr(markup, 'read'): # It's a file-type object.
191 | markup = markup.read()
192 | (self.markup, self.original_encoding, self.declared_html_encoding,
193 | self.contains_replacement_characters) = (
194 | self.builder.prepare_markup(markup, from_encoding))
195 |
196 | try:
197 | self._feed()
198 | except StopParsing:
199 | pass
200 |
201 | # Clear out the markup and remove the builder's circular
202 | # reference to this object.
203 | self.markup = None
204 | self.builder.soup = None
205 |
206 | def _feed(self):
207 | # Convert the document to Unicode.
208 | self.builder.reset()
209 |
210 | self.builder.feed(self.markup)
211 | # Close out any unfinished strings and close all the open tags.
212 | self.endData()
213 | while self.currentTag.name != self.ROOT_TAG_NAME:
214 | self.popTag()
215 |
216 | def reset(self):
217 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
218 | self.hidden = 1
219 | self.builder.reset()
220 | self.currentData = []
221 | self.currentTag = None
222 | self.tagStack = []
223 | self.pushTag(self)
224 |
225 | def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
226 | """Create a new tag associated with this soup."""
227 | return Tag(None, self.builder, name, namespace, nsprefix, attrs)
228 |
229 | def new_string(self, s, subclass=NavigableString):
230 | """Create a new NavigableString associated with this soup."""
231 | navigable = subclass(s)
232 | navigable.setup()
233 | return navigable
234 |
235 | def insert_before(self, successor):
236 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
237 |
238 | def insert_after(self, successor):
239 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
240 |
241 | def popTag(self):
242 | tag = self.tagStack.pop()
243 | #print "Pop", tag.name
244 | if self.tagStack:
245 | self.currentTag = self.tagStack[-1]
246 | return self.currentTag
247 |
248 | def pushTag(self, tag):
249 | #print "Push", tag.name
250 | if self.currentTag:
251 | self.currentTag.contents.append(tag)
252 | self.tagStack.append(tag)
253 | self.currentTag = self.tagStack[-1]
254 |
255 | def endData(self, containerClass=NavigableString):
256 | if self.currentData:
257 | currentData = u''.join(self.currentData)
258 | if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
259 | not set([tag.name for tag in self.tagStack]).intersection(
260 | self.builder.preserve_whitespace_tags)):
261 | if '\n' in currentData:
262 | currentData = '\n'
263 | else:
264 | currentData = ' '
265 | self.currentData = []
266 | if self.parse_only and len(self.tagStack) <= 1 and \
267 | (not self.parse_only.text or \
268 | not self.parse_only.search(currentData)):
269 | return
270 | o = containerClass(currentData)
271 | self.object_was_parsed(o)
272 |
273 | def object_was_parsed(self, o, parent=None, most_recent_element=None):
274 | """Add an object to the parse tree."""
275 | parent = parent or self.currentTag
276 | most_recent_element = most_recent_element or self._most_recent_element
277 | o.setup(parent, most_recent_element)
278 | if most_recent_element is not None:
279 | most_recent_element.next_element = o
280 | self._most_recent_element = o
281 | parent.contents.append(o)
282 |
283 | def _popToTag(self, name, nsprefix=None, inclusivePop=True):
284 | """Pops the tag stack up to and including the most recent
285 | instance of the given tag. If inclusivePop is false, pops the tag
286 |         stack up to but *not* including the most recent instance of
287 | the given tag."""
288 | #print "Popping to %s" % name
289 | if name == self.ROOT_TAG_NAME:
290 | return
291 |
292 | numPops = 0
293 | mostRecentTag = None
294 |
295 | for i in range(len(self.tagStack) - 1, 0, -1):
296 | if (name == self.tagStack[i].name
297 | and nsprefix == self.tagStack[i].prefix):
298 | numPops = len(self.tagStack) - i
299 | break
300 | if not inclusivePop:
301 | numPops = numPops - 1
302 |
303 | for i in range(0, numPops):
304 | mostRecentTag = self.popTag()
305 | return mostRecentTag
306 |
307 | def handle_starttag(self, name, namespace, nsprefix, attrs):
308 | """Push a start tag on to the stack.
309 |
310 | If this method returns None, the tag was rejected by the
311 |         SoupStrainer. You should proceed as if the tag had not occurred
312 | in the document. For instance, if this was a self-closing tag,
313 | don't call handle_endtag.
314 | """
315 |
316 | # print "Start tag %s: %s" % (name, attrs)
317 | self.endData()
318 |
319 | if (self.parse_only and len(self.tagStack) <= 1
320 | and (self.parse_only.text
321 | or not self.parse_only.search_tag(name, attrs))):
322 | return None
323 |
324 | tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
325 | self.currentTag, self._most_recent_element)
326 | if tag is None:
327 | return tag
328 | if self._most_recent_element:
329 | self._most_recent_element.next_element = tag
330 | self._most_recent_element = tag
331 | self.pushTag(tag)
332 | return tag
333 |
334 | def handle_endtag(self, name, nsprefix=None):
335 | #print "End tag: " + name
336 | self.endData()
337 | self._popToTag(name, nsprefix)
338 |
339 | def handle_data(self, data):
340 | self.currentData.append(data)
341 |
342 | def decode(self, pretty_print=False,
343 | eventual_encoding=DEFAULT_OUTPUT_ENCODING,
344 | formatter="minimal"):
345 | """Returns a string or Unicode representation of this document.
346 | To get Unicode, pass None for encoding."""
347 |
348 | if self.is_xml:
349 | # Print the XML declaration
350 | encoding_part = ''
351 | if eventual_encoding != None:
352 | encoding_part = ' encoding="%s"' % eventual_encoding
353 |             prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
354 | else:
355 | prefix = u''
356 | if not pretty_print:
357 | indent_level = None
358 | else:
359 | indent_level = 0
360 | return prefix + super(BeautifulSoup, self).decode(
361 | indent_level, eventual_encoding, formatter)
362 |
363 | # Alias to make it easier to type import: 'from bs4 import _soup'
364 | _s = BeautifulSoup
365 | _soup = BeautifulSoup
366 |
367 | class BeautifulStoneSoup(BeautifulSoup):
368 | """Deprecated interface to an XML parser."""
369 |
370 | def __init__(self, *args, **kwargs):
371 | kwargs['features'] = 'xml'
372 | warnings.warn(
373 | 'The BeautifulStoneSoup class is deprecated. Instead of using '
374 | 'it, pass features="xml" into the BeautifulSoup constructor.')
375 | super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
376 |
377 |
378 | class StopParsing(Exception):
379 | pass
380 |
381 |
382 | class FeatureNotFound(ValueError):
383 | pass
384 |
385 |
386 | #By default, act as an HTML pretty-printer.
387 | if __name__ == '__main__':
388 | import sys
389 | soup = BeautifulSoup(sys.stdin)
390 | print(soup.prettify())
391 |
--------------------------------------------------------------------------------
/pylinkvalidator/included/bs4/builder/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 |
4 | if sys.version_info[0] < 3:
5 | from collections import defaultdict
6 | import itertools
7 | import sys
8 | from pylinkvalidator.included.bs4.element import (
9 | CharsetMetaAttributeValue,
10 | ContentMetaAttributeValue,
11 | whitespace_re
12 | )
13 |
14 | __all__ = [
15 | 'HTMLTreeBuilder',
16 | 'SAXTreeBuilder',
17 | 'TreeBuilder',
18 | 'TreeBuilderRegistry',
19 | ]
20 |
21 | # Some useful features for a TreeBuilder to have.
22 | FAST = 'fast'
23 | PERMISSIVE = 'permissive'
24 | STRICT = 'strict'
25 | XML = 'xml'
26 | HTML = 'html'
27 | HTML_5 = 'html5'
28 |
29 |
30 | class TreeBuilderRegistry(object):
31 |
32 | def __init__(self):
33 | self.builders_for_feature = defaultdict(list)
34 | self.builders = []
35 |
36 | def register(self, treebuilder_class):
37 | """Register a treebuilder based on its advertised features."""
38 | for feature in treebuilder_class.features:
39 | self.builders_for_feature[feature].insert(0, treebuilder_class)
40 | self.builders.insert(0, treebuilder_class)
41 |
42 | def lookup(self, *features):
43 | if len(self.builders) == 0:
44 | # There are no builders at all.
45 | return None
46 |
47 | if len(features) == 0:
48 | # They didn't ask for any features. Give them the most
49 | # recently registered builder.
50 | return self.builders[0]
51 |
52 | # Go down the list of features in order, and eliminate any builders
53 | # that don't match every feature.
54 | features = list(features)
55 | features.reverse()
56 | candidates = None
57 | candidate_set = None
58 | while len(features) > 0:
59 | feature = features.pop()
60 | we_have_the_feature = self.builders_for_feature.get(feature, [])
61 | if len(we_have_the_feature) > 0:
62 | if candidates is None:
63 | candidates = we_have_the_feature
64 | candidate_set = set(candidates)
65 | else:
66 | # Eliminate any candidates that don't have this feature.
67 | candidate_set = candidate_set.intersection(
68 | set(we_have_the_feature))
69 |
70 | # The only valid candidates are the ones in candidate_set.
71 | # Go through the original list of candidates and pick the first one
72 | # that's in candidate_set.
73 | if candidate_set is None:
74 | return None
75 | for candidate in candidates:
76 | if candidate in candidate_set:
77 | return candidate
78 | return None
79 |
80 | # The BeautifulSoup class will take feature lists from developers and use them
81 | # to look up builders in this registry.
82 | builder_registry = TreeBuilderRegistry()
83 |
84 | class TreeBuilder(object):
85 | """Turn a document into a Beautiful Soup object tree."""
86 |
87 | features = []
88 |
89 | is_xml = False
90 | preserve_whitespace_tags = set()
91 | empty_element_tags = None # A tag will be considered an empty-element
92 | # tag when and only when it has no contents.
93 |
94 | # A value for these tag/attribute combinations is a space- or
95 | # comma-separated list of CDATA, rather than a single CDATA.
96 | cdata_list_attributes = {}
97 |
98 |
99 | def __init__(self):
100 | self.soup = None
101 |
102 | def reset(self):
103 | pass
104 |
105 | def can_be_empty_element(self, tag_name):
106 | """Might a tag with this name be an empty-element tag?
107 |
108 | The final markup may or may not actually present this tag as
109 | self-closing.
110 |
111 |         For instance: an HTMLBuilder does not consider a <p> tag to be
112 |         an empty-element tag (it's not in
113 |         HTMLBuilder.empty_element_tags). This means an empty <p> tag
114 |         will be presented as "<p></p>", not "<p />".
115 |
116 | The default implementation has no opinion about which tags are
117 | empty-element tags, so a tag will be presented as an
118 | empty-element tag if and only if it has no contents.
119 | "
137 |         introduces an empty <head> tag, and html5lib
138 |         doesn't. Abstracting this away lets us write simple tests
139 |         which run HTML fragments through the parser and compare the
140 |         results against other HTML fragments.
141 | 
142 |         This method should not be used outside of tests.
143 |         """
144 |         return fragment
145 | 
146 |     def set_up_substitutions(self, tag):
147 |         return False
148 | 
149 |     def _replace_cdata_list_attribute_values(self, tag_name, attrs):
150 |         """Replaces class="foo bar" with class=["foo", "bar"]
151 | 
152 |         Modifies its input in place.
153 |         """
154 |         if self.cdata_list_attributes:
155 |             universal = self.cdata_list_attributes.get('*', [])
156 |             tag_specific = self.cdata_list_attributes.get(
157 |                 tag_name.lower(), [])
158 |             for cdata_list_attr in itertools.chain(universal, tag_specific):
159 |                 if cdata_list_attr in attrs:
160 |                     # Basically, we have a "class" attribute whose
161 |                     # value is a whitespace-separated list of CSS
162 |                     # classes. Split it into a list.
163 |                     value = attrs[cdata_list_attr]
164 |                     if isinstance(value, basestring):
165 |                         values = whitespace_re.split(value)
166 |                     else:
167 |                         # html5lib sometimes calls setAttributes twice
168 |                         # for the same tag when rearranging the parse
169 |                         # tree. On the second call the attribute value
170 |                         # here is already a list. If this happens,
171 |                         # leave the value alone rather than trying to
172 |                         # split it again.
173 |                         values = value
174 |                     attrs[cdata_list_attr] = values
175 |         return attrs
176 | 
177 | class SAXTreeBuilder(TreeBuilder):
178 |     """A Beautiful Soup treebuilder that listens for SAX events."""
179 | 
180 |     def feed(self, markup):
181 |         raise NotImplementedError()
182 | 
183 |     def close(self):
184 |         pass
185 | 
186 |     def startElement(self, name, attrs):
187 |         attrs = dict((key[1], value) for key, value in list(attrs.items()))
188 |         #print "Start %s, %r" % (name, attrs)
189 |         self.soup.handle_starttag(name, attrs)
190 | 
191 |     def endElement(self, name):
192 |         #print "End %s" % name
193 |         self.soup.handle_endtag(name)
194 | 
195 |     def startElementNS(self, nsTuple, nodeName, attrs):
196 |         # Throw away (ns, nodeName) for now.
197 |         self.startElement(nodeName, attrs)
198 | 
199 |     def endElementNS(self, nsTuple, nodeName):
200 |         # Throw away (ns, nodeName) for now.
201 |         self.endElement(nodeName)
202 |         #handler.endElementNS((ns, node.nodeName), node.nodeName)
203 | 
204 |     def startPrefixMapping(self, prefix, nodeValue):
205 |         # Ignore the prefix for now.
206 |         pass
207 | 
208 |     def endPrefixMapping(self, prefix):
209 |         # Ignore the prefix for now.
210 |         # handler.endPrefixMapping(prefix)
211 |         pass
212 | 
213 |     def characters(self, content):
214 |         self.soup.handle_data(content)
215 | 
216 |     def startDocument(self):
217 |         pass
218 | 
219 |     def endDocument(self):
220 |         pass
221 | 
222 | 
223 | class HTMLTreeBuilder(TreeBuilder):
224 |     """This TreeBuilder knows facts about HTML.
225 | 
226 |     Such as which tags are empty-element tags.
227 |     """
228 | 
229 |     preserve_whitespace_tags = set(['pre', 'textarea'])
230 |     empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
231 |                               'spacer', 'link', 'frame', 'base'])
232 | 
233 |     # The HTML standard defines these attributes as containing a
234 |     # space-separated list of values, not a single value. That is,
235 |     # class="foo bar" means that the 'class' attribute has two values,
236 |     # 'foo' and 'bar', not the single value 'foo bar'. When we
237 |     # encounter one of these attributes, we will parse its value into
238 |     # a list of values if possible. Upon output, the list will be
239 |     # converted back into a string.
240 |     cdata_list_attributes = {
241 |         "*" : ['class', 'accesskey', 'dropzone'],
242 |         "a" : ['rel', 'rev'],
243 |         "link" : ['rel', 'rev'],
244 |         "td" : ["headers"],
245 |         "th" : ["headers"],
246 |         "td" : ["headers"],
247 |         "form" : ["accept-charset"],
248 |         "object" : ["archive"],
249 | 
250 |         # These are HTML5 specific, as are *.accesskey and *.dropzone above.
251 |         "area" : ["rel"],
252 |         "icon" : ["sizes"],
253 |         "iframe" : ["sandbox"],
254 |         "output" : ["for"],
255 |         }
256 | 
257 |     def set_up_substitutions(self, tag):
258 |         # We are only interested in <meta> tags
259 |         if tag.name != 'meta':
260 |             return False
261 | 
262 |         http_equiv = tag.get('http-equiv')
263 |         content = tag.get('content')
264 |         charset = tag.get('charset')
265 | 
266 |         # We are interested in <meta> tags that say what encoding the
267 |         # document was originally in. This means HTML 5-style <meta>
268 |         # tags that provide the "charset" attribute. It also means
269 |         # HTML 4-style <meta> tags that provide the "content"
270 |         # attribute and have "http-equiv" set to "content-type".
271 |         #
272 |         # In both cases we will replace the value of the appropriate
273 |         # attribute with a standin object that can take on any
274 |         # encoding.
275 |         meta_encoding = None
276 |         if charset is not None:
277 |             # HTML 5 style:
278 |             # <meta charset="utf8">
279 |             meta_encoding = charset
280 |             tag['charset'] = CharsetMetaAttributeValue(charset)
281 | 
282 |         elif (content is not None and http_equiv is not None
283 |               and http_equiv.lower() == 'content-type'):
284 |             # HTML 4 style:
285 |             # <meta http-equiv="content-type" content="text/html; charset=utf8">
286 |             tag['content'] = ContentMetaAttributeValue(content)
287 | 
288 |         return (meta_encoding is not None)
289 | 
290 | def register_treebuilders_from(module):
291 |     """Copy TreeBuilders from the given module into this module."""
292 |     # I'm fairly sure this is not the best way to do this.
293 |     this_module = sys.modules['pylinkvalidator.included.bs4.builder']
294 |     for name in module.__all__:
295 |         obj = getattr(module, name)
296 | 
297 |         if issubclass(obj, TreeBuilder):
298 |             setattr(this_module, name, obj)
299 |             this_module.__all__.append(name)
300 |             # Register the builder while we're at it.
301 |             this_module.builder_registry.register(obj)
302 | 
303 | # Builders are registered in reverse order of priority, so that custom
304 | # builder registrations will take precedence. In general, we want lxml
305 | # to take precedence over html5lib, because it's faster. And we only
306 | # want to use HTMLParser as a last resort.
307 | from . import _htmlparser
308 | register_treebuilders_from(_htmlparser)
309 | try:
310 |     from . import _html5lib
311 |     register_treebuilders_from(_html5lib)
312 | except ImportError:
313 |     # They don't have html5lib installed.
314 |     pass
315 | try:
316 |     from . import _lxml
317 |     register_treebuilders_from(_lxml)
318 | except ImportError:
319 |     # They don't have lxml installed.
320 |     pass
321 | 
--------------------------------------------------------------------------------
/pylinkvalidator/included/bs4/builder/_html5lib.py:
--------------------------------------------------------------------------------
1 | 
2 | import sys
3 | 
4 | if sys.version_info[0] < 3:
5 |     __all__ = [
6 |         'HTML5TreeBuilder',
7 |         ]
8 | 
9 |     import warnings
10 |     from pylinkvalidator.included.bs4.builder import (
11 |         PERMISSIVE,
12 |         HTML,
13 |         HTML_5,
14 |         HTMLTreeBuilder,
15 |         )
16 |     from pylinkvalidator.included.bs4.element import NamespacedAttribute
17 |     import html5lib
18 |     from html5lib.constants import namespaces
19 |     from pylinkvalidator.included.bs4.element import (
20 |         Comment,
21 |         Doctype,
22 |         NavigableString,
23 |         Tag,
24 |         )
25 | 
26 |     class HTML5TreeBuilder(HTMLTreeBuilder):
27 |         """Use html5lib to build a tree."""
28 | 
29 |         features = ['html5lib', PERMISSIVE, HTML_5, HTML]
30 | 
31 |         def prepare_markup(self, markup, user_specified_encoding):
32 |             # Store the user-specified encoding for use later on.
33 |             self.user_specified_encoding = user_specified_encoding
34 |             return markup, None, None, False
35 | 
36 |         # These methods are defined by Beautiful Soup.
37 |         def feed(self, markup):
38 |             if self.soup.parse_only is not None:
39 |                 warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
40 |             parser = html5lib.HTMLParser(tree=self.create_treebuilder)
41 |             doc = parser.parse(markup, encoding=self.user_specified_encoding)
42 | 
43 |             # Set the character encoding detected by the tokenizer.
44 |             if isinstance(markup, unicode):
45 |                 # We need to special-case this because html5lib sets
46 |                 # charEncoding to UTF-8 if it gets Unicode input.
47 |                 doc.original_encoding = None
48 |             else:
49 |                 doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
50 | 
51 |         def create_treebuilder(self, namespaceHTMLElements):
52 |             self.underlying_builder = TreeBuilderForHtml5lib(
53 |                 self.soup, namespaceHTMLElements)
54 |             return self.underlying_builder
55 | 
56 |         def test_fragment_to_document(self, fragment):
57 |             """See `TreeBuilder`."""
58 |             return u'