├── README.rst
├── scaws
│   ├── __init__.py
│   ├── statscol.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_spiderqueue.py
│   │   └── test_utils.py
│   └── utils.py
└── setup.py

/README.rst:
--------------------------------------------------------------------------------
=====
scaws
=====

This project contains some components and extensions for using Scrapy on Amazon
AWS.

Requirements
============

* Scrapy 0.13 or above
* boto 1.8 or above

Install
=======

Download and run: ``python setup.py install``

Available components
====================

SimpleDB stats collector
------------------------

Module: ``scaws.statscol``

.. class:: SimpledbStatsCollector

    A stats collector which persists stats to `Amazon SimpleDB`_, using one
    SimpleDB item per scraping run (i.e. it keeps a history of all scraping
    runs). The data is persisted to the SimpleDB domain specified by the
    `STATS_SDB_DOMAIN`_ setting. The domain will be created if it doesn't
    exist.

    In addition to the existing stats keys, the following keys are added at
    persistence time:

    * ``spider``: the spider name (so you can use it later for querying stats
      for that spider)
    * ``timestamp``: the timestamp when the stats were persisted

    Both ``spider`` and ``timestamp`` are used to generate the SimpleDB item
    name, in order to avoid overwriting stats of previous scraping runs.

    As `required by SimpleDB`_, datetimes are stored in ISO 8601 format and
    numbers are zero-padded to 16 digits. Negative numbers are not currently
    supported.

    This stats collector requires the `boto`_ library.

.. _Amazon SimpleDB: http://aws.amazon.com/simpledb/
.. _required by SimpleDB: http://docs.amazonwebservices.com/AmazonSimpleDB/2009-04-15/DeveloperGuide/ZeroPadding.html
.. _boto: http://code.google.com/p/boto/

This stats collector can be configured through the following settings:

* `STATS_SDB_DOMAIN`_
* `STATS_SDB_ASYNC`_

.. _STATS_SDB_DOMAIN:

STATS_SDB_DOMAIN
~~~~~~~~~~~~~~~~

Default: ``'scrapy_stats'``

A string containing the SimpleDB domain to use for collecting the stats.

.. _STATS_SDB_ASYNC:

STATS_SDB_ASYNC
~~~~~~~~~~~~~~~

Default: ``False``

If ``True``, communication with SimpleDB is performed asynchronously. If
``False``, blocking IO is used instead. Blocking IO is the default because
asynchronous communication can result in the stats not being persisted if the
Scrapy engine is shut down in the middle of a run (for example, when you run
only one spider in a process and then exit).
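
Example configuration
~~~~~~~~~~~~~~~~~~~~~

A minimal sketch of enabling the collector from a project's ``settings.py``.
It assumes ``STATS_CLASS``, the standard Scrapy setting that selects the stats
collector class; the domain name and credentials below are placeholders::

    # Hypothetical settings.py excerpt -- adjust names and keys to your project
    STATS_CLASS = 'scaws.statscol.SimpledbStatsCollector'

    STATS_SDB_DOMAIN = 'my_scrapy_stats'   # created automatically if missing
    STATS_SDB_ASYNC = False                # see the caveat above before enabling

    # Read by the collector (and by other AWS-aware Scrapy components)
    AWS_ACCESS_KEY_ID = '<your access key>'
    AWS_SECRET_ACCESS_KEY = '<your secret key>'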
--------------------------------------------------------------------------------
/scaws/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/scaws/67bd72e1bc56f1cc8505c30515f02a8bd41511ee/scaws/__init__.py
--------------------------------------------------------------------------------
/scaws/statscol.py:
--------------------------------------------------------------------------------
"""
A Stats collector for persisting stats to Amazon SimpleDB.

Requires the boto library: http://code.google.com/p/boto/
"""

from datetime import datetime

import boto
from twisted.internet import threads

from scrapy.statscol import StatsCollector
from scrapy import log

from scaws.utils import to_sdb_value

class SimpledbStatsCollector(StatsCollector):

    def __init__(self, crawler):
        super(SimpledbStatsCollector, self).__init__(crawler)
        self._sdbdomain = crawler.settings['STATS_SDB_DOMAIN']
        self._access_key = crawler.settings['AWS_ACCESS_KEY_ID']
        self._secret_key = crawler.settings['AWS_SECRET_ACCESS_KEY']
        self._async = crawler.settings.getbool('STATS_SDB_ASYNC')
        self.connect_sdb = boto.connect_sdb
        self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).create_domain(self._sdbdomain)

    def _persist_stats(self, stats, spider=None):
        if spider is None:  # only store spider-specific stats
            return
        if not self._sdbdomain:
            return
        if self._async:
            dfd = threads.deferToThread(self._persist_to_sdb, spider, stats.copy())
            dfd.addErrback(log.err, 'Error uploading stats to SimpleDB',
                           spider=spider)
        else:
            self._persist_to_sdb(spider, stats)

    def _persist_to_sdb(self, spider, stats):
        ts = self._get_timestamp(spider).isoformat()
        sdb_item_id = "%s_%s" % (spider.name, ts)
        sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems())
        sdb_item['spider'] = spider.name
        sdb_item['timestamp'] = self._to_sdb_value(ts)
        self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).put_attributes(self._sdbdomain, sdb_item_id, sdb_item)

    def _get_timestamp(self, spider):
        return datetime.utcnow()

    def _to_sdb_value(self, obj, key=None):
        try:
            return to_sdb_value(obj)
        except TypeError:
            raise TypeError("%s unsupported type %r used in key %r" %
                            (type(self).__name__, type(obj).__name__, key))
--------------------------------------------------------------------------------
/scaws/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/scaws/67bd72e1bc56f1cc8505c30515f02a8bd41511ee/scaws/tests/__init__.py
--------------------------------------------------------------------------------
/scaws/tests/test_spiderqueue.py:
--------------------------------------------------------------------------------
from twisted.trial import unittest
from zope.interface.verify import verifyObject

from scrapyd.interfaces import ISpiderQueue
from scrapy.utils.test import assert_aws_environ
from scaws.spiderqueue import SQSSpiderQueue

class SQSSpiderQueueTest(unittest.TestCase):

    def setUp(self):
        assert_aws_environ()

    def test_interface(self):
        verifyObject(ISpiderQueue, SQSSpiderQueue())

# XXX: testing SQS queue operations is hard because there are long delays
# for the operations to complete
--------------------------------------------------------------------------------
/scaws/tests/test_utils.py:
--------------------------------------------------------------------------------
import unittest
from datetime import datetime

from scaws.utils import to_sdb_value

class UtilsTest(unittest.TestCase):

    def test_to_sdb_value(self):
        self.assertEqual(to_sdb_value(123), u'0000000000000123')
        self.assertEqual(to_sdb_value(123L), u'0000000000000123')
        self.assertEqual(to_sdb_value(True), u'1')
        self.assertEqual(to_sdb_value(False), u'0')
        self.assertEqual(to_sdb_value(None), u'')
        self.assertEqual(to_sdb_value(datetime(2009, 01, 01, 10, 10, 10)),
                         u'2009-01-01T10:10:10')
        self.assertEqual(to_sdb_value('test'), 'test')
        self.assertEqual(to_sdb_value(u'test'), u'test')
        self.assertRaises(TypeError, to_sdb_value, object())

if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/scaws/utils.py:
--------------------------------------------------------------------------------
"""Helper functions"""

from datetime import datetime

def to_sdb_value(obj):
    """Convert the given object to a proper value to store in Amazon SimpleDB"""
    if isinstance(obj, bool):
        return u'%d' % obj
    elif isinstance(obj, (int, long)):
        return "%016d" % obj
    elif isinstance(obj, datetime):
        return obj.isoformat()
    elif isinstance(obj, basestring):
        return obj
    elif obj is None:
        return u''
    else:
        raise TypeError("Unsupported type: %s" % type(obj).__name__)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(name='scaws',
      version='0.1',
      license='BSD',
      description='Scrapy extensions for Amazon AWS',
      author='Scrapinghub',
      author_email='info@scrapinghub.com',
      url='http://github.com/scrapinghub/scaws',
      keywords="scrapy amazon aws ec2",
      packages=['scaws'],
      platforms=['Any'],
      install_requires=['Scrapy', 'boto'],
      classifiers=['Development Status :: 4 - Beta',
                   'License :: OSI Approved :: BSD License',
                   'Operating System :: OS Independent',
                   'Programming Language :: Python'],
      )
--------------------------------------------------------------------------------
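
For illustration (not part of the repository): a minimal sketch of reading the
persisted stats back out of SimpleDB with boto. The domain name matches the
``STATS_SDB_DOMAIN`` default; the spider name and stats keys are hypothetical,
and credentials are assumed to come from the environment or the boto config::

    # Illustrative only -- spider name and stats keys below are placeholders.
    import boto

    conn = boto.connect_sdb()  # picks up AWS credentials from the environment
    domain = conn.get_domain('scrapy_stats')

    # Each run is stored as one item named "<spider>_<ISO timestamp>", so a
    # prefix match on itemName() returns the full history of a single spider.
    query = "select * from `scrapy_stats` where itemName() like 'myspider_%'"
    for item in domain.select(query):
        print item.name, item.get('finish_reason'), item.get('item_scraped_count')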