├── README.rst
├── scaws
│   ├── __init__.py
│   ├── statscol.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_spiderqueue.py
│   │   └── test_utils.py
│   └── utils.py
└── setup.py

/README.rst:
--------------------------------------------------------------------------------
=====
scaws
=====

This project contains some components and extensions for using Scrapy on Amazon
AWS.

Requirements
============

* Scrapy 0.13 or above
* boto 1.8 or above

Install
=======

Download and run: ``python setup.py install``

Available components
====================

SimpleDB stats collector
------------------------

Module: ``scaws.statscol``

.. class:: SimpledbStatsCollector

    A stats collector which persists stats to `Amazon SimpleDB`_, using one
    SimpleDB item per scraping run (i.e. it keeps a history of all scraping
    runs). The data is persisted to the SimpleDB domain specified by the
    `STATS_SDB_DOMAIN`_ setting. The domain will be created if it doesn't
    exist.

    In addition to the existing stats keys, the following keys are added at
    persistence time:

    * ``spider``: the spider name (so you can use it later for querying stats
      for that spider)
    * ``timestamp``: the timestamp when the stats were persisted

    Both ``spider`` and ``timestamp`` are used to generate the SimpleDB item
    name, in order to avoid overwriting stats of previous scraping runs.

    As `required by SimpleDB`_, datetimes are stored in ISO 8601 format and
    numbers are zero-padded to 16 digits. Negative numbers are not currently
    supported.

    This stats collector requires the `boto`_ library.

.. _Amazon SimpleDB: http://aws.amazon.com/simpledb/
.. _required by SimpleDB: http://docs.amazonwebservices.com/AmazonSimpleDB/2009-04-15/DeveloperGuide/ZeroPadding.html
.. _boto: http://code.google.com/p/boto/

This stats collector can be configured through the following settings:

* `STATS_SDB_DOMAIN`_
* `STATS_SDB_ASYNC`_

.. _STATS_SDB_DOMAIN:

STATS_SDB_DOMAIN
~~~~~~~~~~~~~~~~

Default: ``'scrapy_stats'``

A string containing the SimpleDB domain to use for collecting the stats.

.. _STATS_SDB_ASYNC:

STATS_SDB_ASYNC
~~~~~~~~~~~~~~~

Default: ``False``

If ``True``, communication with SimpleDB is performed asynchronously. If
``False``, blocking IO is used instead. Blocking IO is the default because
asynchronous communication can result in the stats not being persisted if the
Scrapy engine is shut down in the middle of a run (for example, when you run
only one spider in a process and then exit).
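
Example configuration
~~~~~~~~~~~~~~~~~~~~~

A minimal sketch of enabling the collector from a project's ``settings.py``.
It assumes ``STATS_CLASS``, the standard Scrapy setting that selects the stats
collector class; the domain name and credentials below are placeholders::

    # Hypothetical settings.py excerpt -- adjust names and keys to your project
    STATS_CLASS = 'scaws.statscol.SimpledbStatsCollector'

    STATS_SDB_DOMAIN = 'my_scrapy_stats'   # created automatically if missing
    STATS_SDB_ASYNC = False                # see the caveat above before enabling

    # Read by the collector (and by other AWS-aware Scrapy components)
    AWS_ACCESS_KEY_ID = '<your access key>'
    AWS_SECRET_ACCESS_KEY = '<your secret key>'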
--------------------------------------------------------------------------------
/scaws/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/scaws/67bd72e1bc56f1cc8505c30515f02a8bd41511ee/scaws/__init__.py
--------------------------------------------------------------------------------
/scaws/statscol.py:
--------------------------------------------------------------------------------
"""
A Stats collector for persisting stats to Amazon SimpleDB.

Requires the boto library: http://code.google.com/p/boto/
"""

from datetime import datetime

import boto
from twisted.internet import threads

from scrapy.statscol import StatsCollector
from scrapy import log

from scaws.utils import to_sdb_value

class SimpledbStatsCollector(StatsCollector):

    def __init__(self, crawler):
        super(SimpledbStatsCollector, self).__init__(crawler)
        self._sdbdomain = crawler.settings['STATS_SDB_DOMAIN']
        self._access_key = crawler.settings['AWS_ACCESS_KEY_ID']
        self._secret_key = crawler.settings['AWS_SECRET_ACCESS_KEY']
        self._async = crawler.settings.getbool('STATS_SDB_ASYNC')
        self.connect_sdb = boto.connect_sdb
        self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).create_domain(self._sdbdomain)

    def _persist_stats(self, stats, spider=None):
        if spider is None:  # only store spider-specific stats
            return
        if not self._sdbdomain:
            return
        if self._async:
            dfd = threads.deferToThread(self._persist_to_sdb, spider, stats.copy())
            dfd.addErrback(log.err, 'Error uploading stats to SimpleDB',
                           spider=spider)
        else:
            self._persist_to_sdb(spider, stats)

    def _persist_to_sdb(self, spider, stats):
        ts = self._get_timestamp(spider).isoformat()
        sdb_item_id = "%s_%s" % (spider.name, ts)
        sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems())
        sdb_item['spider'] = spider.name
        sdb_item['timestamp'] = self._to_sdb_value(ts)
        self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).put_attributes(self._sdbdomain, sdb_item_id, sdb_item)

    def _get_timestamp(self, spider):
        return datetime.utcnow()

    def _to_sdb_value(self, obj, key=None):
        try:
            return to_sdb_value(obj)
        except TypeError:
            raise TypeError("%s unsupported type %r used in key %r" %
                            (type(self).__name__, type(obj).__name__, key))
--------------------------------------------------------------------------------
/scaws/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/scaws/67bd72e1bc56f1cc8505c30515f02a8bd41511ee/scaws/tests/__init__.py
--------------------------------------------------------------------------------
/scaws/tests/test_spiderqueue.py:
--------------------------------------------------------------------------------
from twisted.trial import unittest
from zope.interface.verify import verifyObject

from scrapyd.interfaces import ISpiderQueue
from scrapy.utils.test import assert_aws_environ
from scaws.spiderqueue import SQSSpiderQueue

class SQSSpiderQueueTest(unittest.TestCase):

    def setUp(self):
        assert_aws_environ()

    def test_interface(self):
        verifyObject(ISpiderQueue, SQSSpiderQueue())

# XXX: testing SQS queue operations is hard because there are long delays
# for the operations to complete
--------------------------------------------------------------------------------
/scaws/tests/test_utils.py:
--------------------------------------------------------------------------------
import unittest
from datetime import datetime

from scaws.utils import to_sdb_value

class UtilsTest(unittest.TestCase):

    def test_to_sdb_value(self):
        self.assertEqual(to_sdb_value(123), u'0000000000000123')
        self.assertEqual(to_sdb_value(123L), u'0000000000000123')
        self.assertEqual(to_sdb_value(True), u'1')
        self.assertEqual(to_sdb_value(False), u'0')
        self.assertEqual(to_sdb_value(None), u'')
        self.assertEqual(to_sdb_value(datetime(2009, 01, 01, 10, 10, 10)),
                         u'2009-01-01T10:10:10')
        self.assertEqual(to_sdb_value('test'), 'test')
        self.assertEqual(to_sdb_value(u'test'), u'test')
        self.assertRaises(TypeError, to_sdb_value, object())

if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/scaws/utils.py:
--------------------------------------------------------------------------------
"""Helper functions"""

from datetime import datetime

def to_sdb_value(obj):
    """Convert the given object to a proper value to store in Amazon SimpleDB"""
    if isinstance(obj, bool):
        return u'%d' % obj
    elif isinstance(obj, (int, long)):
        return "%016d" % obj
    elif isinstance(obj, datetime):
        return obj.isoformat()
    elif isinstance(obj, basestring):
        return obj
    elif obj is None:
        return u''
    else:
        raise TypeError("Unsupported type: %s" % type(obj).__name__)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(name='scaws',
      version='0.1',
      license='BSD',
      description='Scrapy extensions for Amazon AWS',
      author='Scrapinghub',
      author_email='info@scrapinghub.com',
      url='http://github.com/scrapinghub/scaws',
      keywords="scrapy amazon aws ec2",
      packages=['scaws'],
      platforms=['Any'],
      install_requires=['Scrapy', 'boto'],
      classifiers=['Development Status :: 4 - Beta',
                   'License :: OSI Approved :: BSD License',
                   'Operating System :: OS Independent',
                   'Programming Language :: Python'],
      )
--------------------------------------------------------------------------------
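
For illustration (not part of the repository): a minimal sketch of reading the
persisted stats back out of SimpleDB with boto. The domain name matches the
``STATS_SDB_DOMAIN`` default; the spider name and stats keys are hypothetical,
and credentials are assumed to come from the environment or the boto config::

    # Illustrative only -- spider name and stats keys below are placeholders.
    import boto

    conn = boto.connect_sdb()  # picks up AWS credentials from the environment
    domain = conn.get_domain('scrapy_stats')

    # Each run is stored as one item named "<spider>_<ISO timestamp>", so a
    # prefix match on itemName() returns the full history of a single spider.
    query = "select * from `scrapy_stats` where itemName() like 'myspider_%'"
    for item in domain.select(query):
        print item.name, item.get('finish_reason'), item.get('item_scraped_count')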