├── requirements.txt
├── config.py
├── README.md
├── .gitignore
└── broken_links_spider.py

/requirements.txt:
--------------------------------------------------------------------------------
Scrapy

--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
name = "github_help"
allowed_domains = ["help.github.com"]
start_urls = ["https://help.github.com"]

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Usage
-------------

Configure the URLs to crawl in config.py. Then run:

    scrapy runspider broken_links_spider.py -o output.json

Then inspect the 404 items recorded in output.json.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

*.json

--------------------------------------------------------------------------------
/broken_links_spider.py:
--------------------------------------------------------------------------------
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.item import Item, Field
import config


class BrokenItem(Item):
    # One item is emitted for each broken (404) page found during the crawl.
    url = Field()
    referer = Field()
    status = Field()


class BrokenLinksSpider(CrawlSpider):
    name = config.name
    allowed_domains = config.allowed_domains
    start_urls = config.start_urls
    # Scrapy normally filters out 404 responses; list the status here so
    # they are passed to the callback instead of being dropped.
    handle_httpstatus_list = [404]
    # Follow every link on every crawled page and run parse_item on each response.
    rules = (Rule(LinkExtractor(), callback='parse_item', follow=True),)

    def parse_item(self, response):
        # Report only pages that came back as 404, along with the page
        # that linked to them (taken from the Referer request header).
        if response.status == 404:
            item = BrokenItem()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['status'] = response.status

            return item
--------------------------------------------------------------------------------
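
The spider can also be launched from a plain Python script instead of the `scrapy runspider` command, which is handy when wiring the check into other tooling. The sketch below is a minimal example, not part of the repository above: the file name run_crawl.py and the FEEDS settings are illustrative assumptions, and it presumes the script sits next to broken_links_spider.py and config.py.

    # run_crawl.py (hypothetical helper, not included in this repo)
    # Runs BrokenLinksSpider in-process and writes the 404 items to
    # output.json, mirroring:
    #   scrapy runspider broken_links_spider.py -o output.json
    from scrapy.crawler import CrawlerProcess

    from broken_links_spider import BrokenLinksSpider

    process = CrawlerProcess(settings={
        # The FEEDS setting tells Scrapy where and in which format to
        # export the scraped items.
        "FEEDS": {"output.json": {"format": "json"}},
    })
    process.crawl(BrokenLinksSpider)
    process.start()  # blocks until the crawl finishes

Running `python run_crawl.py` should then produce the same output.json that the command in the README does, with one JSON object per broken link.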