├── .gitignore ├── README.md ├── images └── sample.png ├── scrapydot.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | *.swp 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Description 2 | =========== 3 | A Scrapy extension which exports a graph of links between items crawled by Scrapy 4 | in DOT file format. 5 | 6 | DOT files can be converted to a graph with Graphviz or tools like Gephi. 7 | 8 | Sample output with Gephi: 9 | 10 | 11 | 12 | 13 | Install 14 | ======= 15 | pip install "ScrapyDot" 16 | 17 | Configure your settings.py: 18 | ---------------------------- 19 | EXTENSIONS = { 20 | "scrapydot.ScrapyDot": 1000 21 | } 22 | 23 | DOT_OUTPUT_DIRECTORY = "dot" 24 | 25 | 26 | Changelog 27 | ========= 28 | 0.1 29 | 30 | * Initial version 31 | 32 | License 33 | ======= 34 | Copyright 2011 Julien Duponchelle 35 | 36 | Licensed under the Apache License, Version 2.0 (the "License"); 37 | you may not use this file except in compliance with the License. 38 | You may obtain a copy of the License at 39 | 40 | http://www.apache.org/licenses/LICENSE-2.0 41 | 42 | Unless required by applicable law or agreed to in writing, software 43 | distributed under the License is distributed on an "AS IS" BASIS, 44 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 45 | See the License for the specific language governing permissions and 46 | limitations under the License. 
47 | -------------------------------------------------------------------------------- /images/sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julien-duponchelle/scrapy-dot/545280389cb5e6b41141038684c93b910c71cdf1/images/sample.png -------------------------------------------------------------------------------- /scrapydot.py: -------------------------------------------------------------------------------- 1 | from scrapy.xlib.pydispatch import dispatcher 2 | from scrapy import signals 3 | from scrapy.conf import settings 4 | import os 5 | import os.path 6 | 7 | class ScrapyDot(object): 8 | def __init__(self): 9 | dispatcher.connect(self.request_received, signal=signals.request_received) 10 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 11 | dispatcher.connect(self.spider_opened, signal=signals.spider_opened) 12 | if not os.path.exists(settings['DOT_OUTPUT_DIRECTORY']): 13 | os.makedirs(settings['DOT_OUTPUT_DIRECTORY']) 14 | self.output = {} 15 | 16 | def spider_opened(self, spider): 17 | self.output[spider.name] = open("%s/%s.dot" % (settings['DOT_OUTPUT_DIRECTORY'], spider.name), 'w+') 18 | self.output[spider.name].write("digraph %s {\n" % (spider.name)) 19 | 20 | 21 | def request_received(self, request, spider): 22 | if request.headers.has_key('Referer'): 23 | out = "\"%s\" -> \"%s\";\n" % (request.headers['Referer'], request.url) 24 | self.output[spider.name].write(out) 25 | 26 | def spider_closed(self,spider): 27 | self.output[spider.name].write("}\n") 28 | self.output[spider.name].close() 29 | 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup(name='ScrapyDot', 4 | version='0.1', 5 | license='Apache License, Version 2.0', 6 | description='Export a graph of link between crawled items in 
dot file format.', 7 | author='Julien Duponchelle', 8 | author_email='julien@duponchelle.info', 9 | url='http://github.com/noplay/scrapy-dot', 10 | keywords="scrapy dot graphviz", 11 | py_modules=['scrapydot'], 12 | platforms = ['Any'], 13 | install_requires = ['scrapy'], 14 | classifiers = [ 'Development Status :: 4 - Beta', 15 | 'Environment :: No Input/Output (Daemon)', 16 | 'License :: OSI Approved :: Apache Software License', 17 | 'Operating System :: OS Independent', 18 | 'Programming Language :: Python'] 19 | ) 20 | 21 | --------------------------------------------------------------------------------