├── Documentation
│   ├── License
│   └── Changelog
├── README.md
└── Source
    └── 404.py
/Documentation/License:
--------------------------------------------------------------------------------
1 |
2 | LICENSE
3 |
4 | Permission is hereby granted, free of charge, to anyone
5 | obtaining a copy of this document and accompanying files,
6 | to do whatever they want with them without any restriction,
7 | including, but not limited to, copying, modification and redistribution.
8 |
9 | NO WARRANTY OF ANY KIND IS PROVIDED.
10 |
11 |
--------------------------------------------------------------------------------
/Documentation/Changelog:
--------------------------------------------------------------------------------
1 |
2 | CHANGELOG
3 |
4 | * 2016/02/03:
5 |
6 | - Revised. Working on Python 3.5.0, beautifulsoup4 4.4.1,
7 | requests 2.9.1.
8 |
9 | * 2015/07/27:
10 |
11 | - 404 now uses 'html.parser' explicitly.
12 |
13 | * 2015/05/12:
14 |
15 | - Allow ignoring internal links.
16 |
17 | * 2015/05/10:
18 |
19 | - Avoid parsing the entire HTML and look only for link tags.
20 | This should make 404 faster and use less memory.
21 |
22 | - Show the number of errors in the final stats.
23 |
24 | * 2015/05/09:
25 |
26 | - Make a single, lazy get request instead of head/get.
27 | This is a major performance improvement.
28 |
29 |     - Also look for <img src="..."> tags when crawling.
30 |
31 | - Added links and time statistics and an option to suppress them (--quiet).
32 |
33 | - Added an option to disable redirects (--no-redirects).
34 |
35 | - Bugfix: add the root url to the link cache too.
36 |
37 | - Bugfix: check that the number of threads is positive.
38 |
39 | * 2015/05/08:
40 |
41 | - Implemented 'ignore', 'check' and 'follow' for links
42 | allowing recursive link crawling.
43 |
44 | - Print all http status codes > 400 instead of just 404.
45 |
46 | - Follow redirects.
47 | May add an option later to turn them off.
48 |
49 | - Check the headers content type when crawling
50 | to avoid doing get requests when possible.
51 |
52 | - Ignore fragments.
53 |
54 | * 2015/05/06:
55 |
56 | - First version.
57 |
58 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## About
3 |
4 | This shouldn't have happened.
5 |
6 | The thing is... I was testing a new programming language by writing
7 | a simple web crawler as an exercise. Being frustrated by multiple concurrency
8 | bugs in the stdlib I thought: "Okay, enough. I can probably write this in
9 | Python in an evening".
10 |
11 | Famous last words.
12 |
13 | A week later, the program snowballed from a toy example and it currently
14 | has the following features:
15 |
16 | * Supports SSL, redirections and custom timeouts, thanks
17 | to the excellent [requests][] library.
18 |
19 | * Lenient HTML parsing, so dubious markup should be fine, using
20 | the also excellent [beautifulsoup4][] library.
21 |
22 | * Validates both usual `<a href="...">` hyperlinks and `<img src="...">`
23 |   image links.
24 |
25 | * Can check, ignore or recursively follow both internal (same domain)
26 | and external links.
27 |
28 | * Tries to be efficient: multithreaded, ignores [fragments][], does not build
29 | a parse tree for non-link markup.
30 |
31 | * Fits in 404 lines. :)
32 |
33 | [beautifulsoup4]: http://www.crummy.com/software/BeautifulSoup/
34 | [fragments]: http://en.wikipedia.org/wiki/Fragment_identifier
35 | [requests]: http://docs.python-requests.org/en/latest/
36 |
37 | Here is an example, checking my entire blog:
38 |
39 | ```bash
40 | $ 404.py http://beluki.github.io --threads 20 --internal follow
41 | 404: http://cdimage.debian.org/debian-cd/7.8.0/i386/iso-cd/
42 | Checked 144 total links in 6.54 seconds.
43 | 46 internal, 98 external.
44 | 0 network/parsing errors, 1 link errors.
45 | ```
46 |
47 | (please be polite and don't spawn many concurrent connections to the
48 | same server; this is just a demonstration)
49 |
50 | ## Installation
51 |
52 | First, make sure you are using Python 3.3+ and have the [beautifulsoup4][]
53 | and [requests][] libraries installed. Both are available in pip.
54 |
55 | Other than that, 404 is a single Python script that you can put in your PATH.
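
For example, a typical setup on a Unix-like system might look like this
(`~/bin` is just an example; any directory in your PATH works):

```bash
# install the dependencies:
pip install beautifulsoup4 requests

# make the script executable and put it somewhere in your PATH:
chmod +x 404.py
cp 404.py ~/bin/
```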
56 |
57 | ## Command-line options
58 |
59 | 404 has some options that can be used to change the behavior:
60 |
61 | * `--external [check, ignore, follow]` sets the behavior for external (different
62 | domain) links. The default is to check them. Be careful! 'follow' may try
63 | to recursively crawl the entire internet and should only be used on an
64 | intranet.
65 |
66 | * `--internal [check, ignore, follow]` is like the above, but for internal links.
67 | The default is also 'check'.
68 |
69 | * `--newline [dos, mac, unix, system]` changes the newline format.
70 | I tend to use Unix newlines everywhere, even on Windows. The default is
71 | `system`, which uses the current platform newline format.
72 |
73 | * `--no-redirects` avoids following redirections. Links that redirect are
74 |   considered ok and keep their 3xx status code.
75 |
76 | * `--print-all` prints all the status codes/links, regardless of whether
77 |   they indicate an error. This is useful for grepping specific non-error codes
78 | such as 204 (no content).
79 |
80 | * `--quiet` avoids printing the statistics to stderr at the end.
81 | Useful for scripts.
82 |
83 | * `--threads n` uses n concurrent threads to process requests.
84 | The default is to use a single thread.
85 |
86 | * `--timeout n` waits n seconds for request responses. 10 seconds by
87 | default. Use `--timeout 0` to wait forever for the response.
88 |
89 | Some examples:
90 |
91 | ```bash
92 | # check all the reachable internal links, ignoring external links
93 | # (e.g. check that all the links a static blog generator creates are ok)
94 | 404.py url --internal follow --external ignore
95 |
96 | # check all the external links in a single page:
97 | 404.py url --internal ignore --external check
98 |
99 | # wait forever for a url to be available:
100 | 404.py url --internal ignore --external ignore --timeout 0
101 |
102 | # get all the links in a site and dump them to a txt (without status code)
103 | # (errors and statistics on stderr)
104 | 404.py url --internal follow --print-all | awk '{ print $2 }' > links.txt
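
# one more possible combination: check a single page with unix newlines,
# without following redirects and without printing statistics:
404.py url --newline unix --no-redirects --quiet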
105 | ```
106 |
107 | ## Portability
108 |
109 | Status codes/links are written to stdout, using UTF-8 and the newline
110 | format specified by `--newline`.
111 |
112 | Network or HTML parsing errors and statistics are written to stderr using
113 | the current platform newline format.
114 |
115 | The exit status is 0 on success and 1 on errors. After an error,
116 | 404 skips the current url and proceeds with the next one instead of aborting.
117 | It can be interrupted with control + c.
118 |
119 | Note that a link returning a 404 status code (or any 4xx or 5xx status) is
120 | NOT an error. Only being unable to get a status code at all due to network
121 | problems or invalid input is considered an error.
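
For instance, a script could rely on the exit status alone, something like:

```bash
# non-zero exit status means network or parsing errors, not 4xx/5xx links:
404.py http://example.com --quiet || echo "some links could not be checked"
```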
122 |
123 | 404 is tested on Windows 7 and 8 and on Debian (both x86 and x86-64)
124 | using Python 3.4+, beautifulsoup4 4.3.2+ and requests 2.6.2+. Older versions
125 | are not supported.
126 |
127 | ## Status
128 |
129 | This program is finished!
130 |
131 | 404 is feature-complete and has no known bugs. Unless issues are reported
132 | I plan no further development on it other than maintenance.
133 |
134 | ## License
135 |
136 | Like all my hobby projects, this is Free Software. See the [Documentation][]
137 | folder for more information. No warranty though.
138 |
139 | [Documentation]: Documentation
140 |
141 |
--------------------------------------------------------------------------------
/Source/404.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | 404.
6 | A simple multithreaded dead link crawler.
7 | """
8 |
9 |
10 | import os
11 | import queue
12 | import sys
13 | import time
14 | import urllib
15 |
16 | from contextlib import closing
17 | from queue import Queue
18 | from threading import Thread
19 |
20 | from argparse import ArgumentParser, RawDescriptionHelpFormatter
21 |
22 |
23 | # Information and error messages:
24 |
25 | def outln(line):
26 | """ Write 'line' to stdout, using the platform encoding and newline format. """
27 | print(line, flush = True)
28 |
29 |
30 | def errln(line):
31 | """ Write 'line' to stderr, using the platform encoding and newline format. """
32 | print('404.py: error:', line, file = sys.stderr, flush = True)
33 |
34 |
35 | # Non-builtin imports:
36 |
37 | try:
38 | import requests
39 |
40 | from bs4 import BeautifulSoup, SoupStrainer
41 | from requests import Timeout
42 |
43 | except ImportError:
44 | errln('404 requires the following modules:')
45 |     errln('beautifulsoup4 4.3.2+ - http://www.crummy.com/software/BeautifulSoup/')
46 |     errln('requests 2.7.0+ - http://docs.python-requests.org/en/latest/')
47 | sys.exit(1)
48 |
49 |
50 | # Threads and a thread pool:
51 |
52 | class Worker(Thread):
53 | """
54 | Thread that pops tasks from a '.todo' Queue, executes them, and puts
55 | the completed tasks in a '.done' Queue.
56 |
57 | A task is any object that has a run() method.
58 |     Tasks themselves are responsible for holding their own results.
59 | """
60 |
61 | def __init__(self, todo, done):
62 | super().__init__()
63 | self.todo = todo
64 | self.done = done
65 | self.daemon = True
66 | self.start()
67 |
68 | def run(self):
69 | while True:
70 | task = self.todo.get()
71 | task.run()
72 | self.done.put(task)
73 | self.todo.task_done()
74 |
75 |
76 | class ThreadPool(object):
77 | """
78 |     Maintains a list of 'todo' and 'done' tasks and a number of threads
79 | consuming the tasks. Child threads are expected to put the tasks
80 | in the 'done' queue when those are completed.
81 | """
82 |
83 | def __init__(self, threads):
84 | self.threads = threads
85 |
86 | self.todo = Queue()
87 | self.done = Queue()
88 |
89 | self.pending_tasks = 0
90 |
91 | def add_task(self, task):
92 | """
93 | Add a new task to complete.
94 | Can be called after start().
95 | """
96 | self.pending_tasks += 1
97 | self.todo.put(task)
98 |
99 | def start(self):
100 | """ Start computing tasks. """
101 | for x in range(self.threads):
102 | Worker(self.todo, self.done)
103 |
104 | def wait_for_task(self):
105 | """ Wait for one task to complete. """
106 | while True:
107 | try:
108 | return self.done.get(block = False)
109 |
110 | # give tasks processor time:
111 | except queue.Empty:
112 | time.sleep(0.1)
113 |
114 | def poll_completed_tasks(self):
115 | """ Yield the computed tasks as soon as they are finished. """
116 | while self.pending_tasks > 0:
117 | yield self.wait_for_task()
118 | self.pending_tasks -= 1
119 |
120 | # at this point, all the tasks are completed:
121 | self.todo.join()
122 |
123 |
124 | # Tasks:
125 |
126 | # A BeautifulSoup strainer that only cares about links/images:
127 | link_strainer = SoupStrainer(lambda name, attrs: name == 'a' or name == 'img')
128 |
129 |
130 | class LinkTask(object):
131 | """
132 | A task that checks one link and optionally parses
133 | the HTML to get links in the body.
134 | """
135 | def __init__(self, link, parse_links, timeout, allow_redirects):
136 | self.link = link
137 | self.parse_links = parse_links
138 | self.timeout = timeout
139 | self.allow_redirects = allow_redirects
140 |
141 | # will contain the links found in the url body when HTML and parse_links = True:
142 | self.links = []
143 |
144 |         # will hold the status code after executing run():
145 | self.status = None
146 |
147 |         # since we run in a thread with its own context,
148 |         # exception information is captured here:
149 | self.exception = None
150 |
151 | def run(self):
152 | try:
153 | with closing(requests.get(self.link,
154 | timeout = self.timeout,
155 | allow_redirects = self.allow_redirects,
156 | stream = True)) as response:
157 |
158 | self.status = response.status_code
159 |
160 | # when not looking for links, we have all the information needed:
161 | if not self.parse_links:
162 | return
163 |
164 | # when the status is a client/server error, don't look for links either:
165 | if 400 <= self.status < 600:
166 | return
167 |
168 | # when not html/xml, no links:
169 | content_type = response.headers.get('content-type', '').strip()
170 | if not content_type.startswith(('text/html', 'application/xhtml+xml')):
171 | return
172 |
173 | # parse:
174 | soup = BeautifulSoup(response.content, 'html.parser', parse_only = link_strainer, from_encoding = response.encoding)
175 |
176 |                 # <a href="..."> links:
177 | for tag in soup.find_all('a', href = True):
178 | absolute_link = urllib.parse.urljoin(self.link, tag['href'])
179 | self.links.append(absolute_link)
180 |
181 |                 # <img src="..."> links:
182 | for tag in soup.find_all('img', src = True):
183 | absolute_link = urllib.parse.urljoin(self.link, tag['src'])
184 | self.links.append(absolute_link)
185 |
186 | except:
187 | self.exception = sys.exc_info()
188 |
189 |
190 | # IO:
191 |
192 | # For portability, all output is done in bytes
193 | # to avoid Python default encoding and automatic newline conversion:
194 |
195 | def utf8_bytes(string):
196 | """ Convert 'string' to bytes using UTF-8. """
197 | return bytes(string, 'UTF-8')
198 |
199 |
200 | BYTES_NEWLINES = {
201 | 'dos' : b'\r\n',
202 | 'mac' : b'\r',
203 | 'unix' : b'\n',
204 | 'system' : utf8_bytes(os.linesep),
205 | }
206 |
207 |
208 | def binary_stdout_writeline(line, newline):
209 | """
210 | Write 'line' (as bytes) to stdout without buffering
211 | using the specified 'newline' format (as bytes).
212 | """
213 | sys.stdout.buffer.write(line)
214 | sys.stdout.buffer.write(newline)
215 | sys.stdout.flush()
216 |
217 |
218 | # Parser:
219 |
220 | def make_parser():
221 | parser = ArgumentParser(
222 | description = __doc__,
223 | formatter_class = RawDescriptionHelpFormatter,
224 | epilog = 'example: 404.py http://beluki.github.io --internal follow --threads 5',
225 | usage = '404.py url [option [options ...]]',
226 | )
227 |
228 | # positional:
229 | parser.add_argument('url',
230 | help = 'url to crawl looking for links')
231 |
232 | # optional:
233 | parser.add_argument('--external',
234 | help = 'whether to check, ignore or follow external links (default: check)',
235 | choices = ['check', 'ignore', 'follow'],
236 | default = 'check')
237 |
238 | parser.add_argument('--internal',
239 | help = 'whether to check, ignore or follow internal links (default: check)',
240 | choices = ['check', 'ignore', 'follow'],
241 | default = 'check')
242 |
243 | parser.add_argument('--newline',
244 | help = 'use a specific newline mode (default: system)',
245 | choices = ['dos', 'mac', 'unix', 'system'],
246 | default = 'system')
247 |
248 | parser.add_argument('--no-redirects',
249 | help = 'do not follow redirects, just return the status code',
250 | action = 'store_true')
251 |
252 | parser.add_argument('--print-all',
253 | help = 'print all status codes and urls instead of only errors',
254 | action = 'store_true')
255 |
256 | parser.add_argument('--quiet',
257 | help = 'do not print statistics to stderr after crawling',
258 | action = 'store_true')
259 |
260 | parser.add_argument('--threads',
261 | help = 'number of threads (default: 1)',
262 | default = 1,
263 | type = int)
264 |
265 | parser.add_argument('--timeout',
266 | help = 'seconds to wait for request responses (default: 10)',
267 | default = 10,
268 | type = int)
269 |
270 | return parser
271 |
272 |
273 | # Main program:
274 |
275 | def run(url, allow_redirects, internal, external, newline, print_all, quiet, threads, timeout):
276 | """
277 |     Set up a threadpool and start checking links.
278 | """
279 | status = 0
280 |
281 | # create the pool and a task to start at the root:
282 | pool = ThreadPool(threads)
283 | pool.add_task(LinkTask(url, True, timeout, allow_redirects))
284 | pool.start()
285 |
287 |     # link cache to avoid processing the same link twice:
287 | link_cache = set([url])
288 |
289 | # url domain:
290 | netloc = urllib.parse.urlparse(url).netloc
291 |
292 | # stats:
293 | st_total_links = 1
294 | st_total_internal = 1
295 | st_total_external = 0
296 | st_error_task = 0
297 | st_error_link = 0
298 |     st_start_time = time.time()
299 |
300 | # start checking links:
301 | for task in pool.poll_completed_tasks():
302 |
303 | # error in request:
304 | if task.exception:
305 | status = 1
306 | exc_type, exc_obj, exc_trace = task.exception
307 |
308 | # provide a concise error message for timeouts (common):
309 | if isinstance(exc_obj, Timeout):
310 | errln('{} - timeout.'.format(task.link))
311 | else:
312 | errln('{} - {}.'.format(task.link, exc_obj))
313 |
314 | st_error_task += 1
315 |
316 | else:
317 | client_or_server_error = (400 <= task.status < 600)
318 |
319 | if client_or_server_error or print_all:
320 | output = utf8_bytes('{}: {}'.format(task.status, task.link))
321 | binary_stdout_writeline(output, newline)
322 |
323 | if client_or_server_error:
324 | st_error_link += 1
325 |
326 | for link in task.links:
327 |
328 | # ignore client-side fragment:
329 | link, _ = urllib.parse.urldefrag(link)
330 |
331 | if link not in link_cache:
332 | link_cache.add(link)
333 | parsed = urllib.parse.urlparse(link)
334 |
335 | # accept http/s protocols:
336 |                 if parsed.scheme not in ('http', 'https'):
337 | continue
338 |
339 | # internal or external link?
340 | if parsed.netloc == netloc:
341 | if internal == 'ignore':
342 | continue
343 |
344 | st_total_internal += 1
345 | get_links = (internal == 'follow')
346 |
347 | else:
348 | if external == 'ignore':
349 | continue
350 |
351 | st_total_external += 1
352 | get_links = (external == 'follow')
353 |
354 | link_task = LinkTask(link, get_links, timeout, allow_redirects)
355 | pool.add_task(link_task)
356 | st_total_links += 1
357 |
358 | if not quiet:
359 |         st_end_time = time.time() - st_start_time
360 |
361 | print('Checked {} total links in {:.3} seconds.'.format(st_total_links, st_end_time), file = sys.stderr)
362 | print('{} internal, {} external.'.format(st_total_internal, st_total_external), file = sys.stderr)
363 | print('{} network/parsing errors, {} link errors.'.format(st_error_task, st_error_link), file = sys.stderr)
364 |
365 | sys.exit(status)
366 |
367 |
368 | # Entry point:
369 |
370 | def main():
371 | parser = make_parser()
372 | options = parser.parse_args()
373 |
374 | url = options.url
375 | external = options.external
376 | internal = options.internal
377 | newline = BYTES_NEWLINES[options.newline]
378 | no_redirects = options.no_redirects
379 | print_all = options.print_all
380 | quiet = options.quiet
381 | threads = options.threads
382 |
383 | # validate threads number:
384 | if threads < 1:
385 | errln('the number of threads must be positive.')
386 | sys.exit(1)
387 |
388 | # 0 means no timeout:
389 | if options.timeout > 0:
390 | timeout = options.timeout
391 | else:
392 | timeout = None
393 |
394 |     allow_redirects = not no_redirects
395 | run(url, allow_redirects, internal, external, newline, print_all, quiet, threads, timeout)
396 |
397 |
398 | if __name__ == '__main__':
399 | try:
400 | main()
401 | except KeyboardInterrupt:
402 | pass
403 |
404 |
--------------------------------------------------------------------------------