├── tests
│   ├── __init__.py
│   └── tests.py
├── .gitignore
├── timak
│   ├── __init__.py
│   └── timelines.py
├── LICENSE
├── setup.py
└── README.rst
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | /dist
3 | /timak.egg-info
4 |
--------------------------------------------------------------------------------
/timak/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | import pkg_resources
4 |
5 | from .timelines import Timeline
6 |
7 |
8 | VERSION = tuple(map(int, pkg_resources.get_distribution('timak').version.split('.')))
9 | __version__ = VERSION
10 |
--------------------------------------------------------------------------------
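
A note on the version tuple in timak/__init__.py above: it is read from the
installed distribution's metadata via pkg_resources, so the package must be
installed (for example with "python setup.py develop") before it can be
imported. A minimal sketch of what a consumer sees, assuming the 0.1.1
release defined in setup.py is the one installed:

    import timak

    print timak.VERSION      # (0, 1, 1), parsed from the installed metadata
    print timak.__version__  # the same tuple; __init__.py aliases it
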
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2011 Brett Hoerner
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 |
4 | version = "0.1.1"
5 |
6 |
7 | setup(name='timak',
8 | version=version,
9 | description='Timelines (activity streams) backed by Riak',
10 | author='Brett Hoerner',
11 | author_email='brett@bretthoerner.com',
12 | url='http://github.com/bretthoerner/timak',
13 | packages=find_packages(),
14 | test_suite='unittest2.collector',
15 | install_requires=['riak'],
16 | tests_require=['unittest2'],
17 | classifiers=[
18 | "Intended Audience :: Developers",
19 | "Intended Audience :: System Administrators",
20 | "Operating System :: OS Independent",
21 | "Topic :: Software Development"
22 | ],
23 | license="Apache License (2.0)",
24 | )
25 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =====
2 | timak
3 | =====
4 |
5 | timak is a Python library for storing timelines (activity streams) in Riak. It is very alpha and rough around the edges.
6 |
7 | It is loosely based on my understanding of Yammer's Streamie.
8 |
9 | Example
10 | -------
11 |
12 | Timelines are unique sets of objects (unique by the ID you provide) ordered by a score, usually a datetime (which you also provide). They are bounded, so items fall off the end when a (user-defined) capacity is reached.
13 |
14 | >>> from datetime import datetime
15 | >>> import riak
16 | >>> from timak.timelines import Timeline
17 |
18 | >>> conn = riak.RiakClient()
19 |
20 | >>> tl = Timeline(connection=conn, max_items=3)
21 |
22 | >>> # tl.add(key, unique_id, score)
23 | >>> tl.add("brett:tweets", 1, datetime(2011, 1, 1))
24 | [1]
25 | >>> tl.add("brett:tweets", 2, datetime(2011, 1, 2))
26 | [2, 1]
27 | >>> tl.add("brett:tweets", 3, datetime(2011, 1, 3))
28 | [3, 2, 1]
29 | >>> tl.add("brett:tweets", 4, datetime(2011, 1, 4))
30 | [4, 3, 2]
31 | >>> tl.delete("brett:tweets", 2, datetime(2011, 1, 2))
32 | [4, 3]
33 |
34 | If you provide a ``datetime.datetime`` value as the score, Timak will automatically convert it to a sortable numeric score.
35 |
36 | As you can see, the default order is descending by the score you provide, and object IDs are returned by default. You can also provide an ``obj_data`` argument (it must be JSON-serializable), which will be returned in place of the ID.
37 |
38 | >>> tl.add("brett:tweets", 5, datetime(2011, 1, 5), obj_data={'body': 'Hello world, this is my first tweet'})
39 | [{'body': 'Hello world, this is my first tweet'}, 4, 3]
40 |
41 | Why?
42 | ----
43 |
44 | I needed *highly available*, *linearly scalable* timelines where readers and writers *don't block* one another. Because Riak is a Dynamo-based system, multiple writers can update a single value and I can merge the conflicts on a later read. I can also add a machine to the cluster for more throughput, and since it's simply fetching denormalized timelines by key, it should be extremely fast.
45 |
46 | So what? I could write this in...
47 | ---------------------------------
48 |
49 | PostgreSQL or MySQL
50 | ```````````````````
51 |
52 | This would be a very simple table in an RDBMS. It could even be boundless (though without some PL/SQL hackery, large ``OFFSET`` values are very expensive). You'd be hitting large indexes instead of fetching values directly by key. The biggest problem is that it would all have to fit on a single system, unless you manually sharded the data (and re-sharded if you ever outgrew that size). Plus you'd have to handle availability with read slaves and failover.
53 |
54 | MongoDB
55 | ```````
56 |
57 | The only possible difference I see from the RDBMSs above is that you could use Mongo's "auto-sharding." If that's your thing, and you trust it, then I wish you the best of luck.
58 |
59 | Redis
60 | `````
61 |
62 | You can fake timelines in Redis using a list or sorted set. As with an RDBMS, you have to handle all of the sharding yourself, re-shard on growth, and use slaves and failover for availability. On top of that, and even more critical for my use case: all of your timelines would have to fit in RAM. If you have that problem and that kind of money, please send me some.
63 |
64 | Cassandra
65 | `````````
66 |
67 | Probably another great fit. You could even store much longer timelines, though I'm not sure what the equivalent of a ``SELECT`` with ``OFFSET`` would cost over the columns in a Cassandra row.
68 |
69 | TODO
70 | ----
71 |
72 | 1. Add a better API with cursors (last seen ``obj_date``?) for pagination.
73 | 2. Built-in Django support for updates on ``post_save`` and ``post_delete``.
74 | 3. Compress values.
75 |
--------------------------------------------------------------------------------
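
Supplementing the README's score discussion: scores are not required to be
datetimes, anything mutually comparable works (the test suite exercises plain
integers in test_non_timestamp_scores). A minimal sketch, assuming a Riak
node on the default local port; the key name is made up for illustration:

    import riak
    from timak.timelines import Timeline

    conn = riak.RiakClient()
    tl = Timeline(connection=conn, max_items=3)

    # Plain integers as scores; the default order is descending by score.
    tl.add("demo:leaderboard", "bronze", 1)
    tl.add("demo:leaderboard", "silver", 2)
    print tl.add("demo:leaderboard", "gold", 3)  # ['gold', 'silver', 'bronze']
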
/tests/tests.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | import unittest2
3 |
4 | import riak
5 | from timak.timelines import Timeline
6 |
7 |
8 | class TimakTest(unittest2.TestCase):
9 | def setUp(self):
10 | self.key = self.bucket = "test-timak"
11 | self.c1 = riak.RiakClient()
12 | self.c2 = riak.RiakClient()
13 |
14 | self.b1 = self.c1.bucket(self.bucket)
15 | self.b2 = self.c2.bucket(self.bucket)
16 |
17 | self.c1.bucket(self.bucket).set_allow_multiples(True)
18 |
19 | def tearDown(self):
20 | # resolve conflicts / clear data between tests
21 | riak.RiakClient().bucket(self.bucket).get(self.key).delete()
22 |
23 | def test_allow_multiples(self):
24 | """
25 | Verify that sibling objects are created on a conflict.
26 | """
27 | o1 = self.b1.get(self.key)
28 | o2 = self.b2.get(self.key)
29 |
30 | o1.set_data("object-1")
31 | o1.store()
32 | o2.set_data("object-2")
33 | o2.store()
34 |
35 | conflicted = self.b1.get(self.key)
36 | siblings = filter(bool, (s.get_data() for s in conflicted.get_siblings()))
37 | self.assertEqual(len(siblings), 2)
38 |
39 | def test_max_items(self):
40 | """
41 | Verify items > max_items are removed.
42 | """
43 | timeline = Timeline(connection=self.c1, bucket=self.bucket, max_items=3)
44 | now = datetime.utcnow()
45 |
46 | timeline.add(self.key, 1, now)
47 | timeline.add(self.key, 2, now)
48 | timeline.add(self.key, 3, now)
49 | self.assertEqual(len(timeline.get(self.key)), 3)
50 |
51 | timeline.add(self.key, 4, now)
52 | self.assertEqual(len(timeline.get(self.key)), 3)
53 |
54 | def test_delete(self):
55 | timeline = Timeline(connection=self.c1, bucket=self.bucket, max_items=3)
56 | now = datetime.utcnow()
57 |
58 | timeline.add(self.key, 1, now)
59 | self.assertEqual(len(timeline.get(self.key)), 1)
60 |
61 | timeline.delete(self.key, 1, now)
62 | self.assertEqual(len(timeline.get(self.key)), 0)
63 |
64 | def test_multi_writers(self):
65 | now = datetime.utcnow()
66 |
67 | t1 = Timeline(connection=self.c1, bucket=self.bucket, max_items=10)
68 | t2 = Timeline(connection=self.c2, bucket=self.bucket, max_items=10)
69 |
70 | t1.add(self.key, 1, now)
71 | t2.add(self.key, 2, now + timedelta(minutes=1))
72 |
73 | self.assertEqual(t1.get(self.key), [2, 1])
74 |
75 | def test_timestamp_scores(self):
76 | timeline = Timeline(connection=self.c1, bucket=self.bucket, max_items=3)
77 | now = datetime.utcnow()
78 |
79 | timeline.add(self.key, 1, now)
80 | timeline.add(self.key, 2, now + timedelta(seconds=1))
81 | timeline.add(self.key, 3, now + timedelta(seconds=2))
82 | results = timeline.get(self.key)
83 | self.assertEqual(len(results), 3)
84 |         self.assertEqual(results[0], 3)
85 |         self.assertEqual(results[1], 2)
86 |         self.assertEqual(results[2], 1)
87 |
88 | def test_non_timestamp_scores(self):
89 | timeline = Timeline(connection=self.c1, bucket=self.bucket, max_items=3)
90 |
91 | timeline.add(self.key, 1, 3)
92 | timeline.add(self.key, 2, 2)
93 | timeline.add(self.key, 3, 1)
94 | results = timeline.get(self.key)
95 | self.assertEqual(len(results), 3)
96 |         self.assertEqual(results[0], 1)
97 |         self.assertEqual(results[1], 2)
98 |         self.assertEqual(results[2], 3)
99 |
100 | def test_score_scoping(self):
101 | timeline = Timeline(connection=self.c1, bucket=self.bucket, max_items=3)
102 |
103 | timeline.add(self.key, 1, 3)
104 | timeline.add(self.key, 2, 2)
105 | timeline.add(self.key, 3, 1)
106 | results = timeline.get(self.key)
107 | self.assertEqual(len(results), 3)
108 |         self.assertEqual(results[0], 1)
109 |         self.assertEqual(results[1], 2)
110 |         self.assertEqual(results[2], 3)
111 |
112 | timeline.add(self.key, 4, 0)
113 | results = timeline.get(self.key)
114 | self.assertEqual(len(results), 3)
115 |         self.assertEqual(results[0], 1)
116 |         self.assertEqual(results[1], 2)
117 |         self.assertEqual(results[2], 3)
118 |
119 | timeline.add(self.key, 5, 5)
120 | results = timeline.get(self.key)
121 | self.assertEqual(len(results), 3)
122 |         self.assertEqual(results[0], 5)
123 |         self.assertEqual(results[1], 1)
124 |         self.assertEqual(results[2], 2)
125 |
--------------------------------------------------------------------------------
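
On running the suite above: setup.py declares test_suite='unittest2.collector',
and the tests assume a reachable local Riak node (setUp enables allow_mult on
the test bucket). A sketch of two equivalent entry points; the module path
assumes the repository root is on sys.path:

    # via setuptools, as declared in setup.py:
    #   python setup.py test
    # or directly through unittest2:
    import unittest2
    unittest2.main(module='tests.tests')
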
/timak/timelines.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 |
4 | class Timeline(object):
5 | def __init__(self, connection=None, bucket="timelines", order='desc',
6 | max_items=1000):
7 | self.connection = connection
8 | self.bucket = bucket
9 | self.order = order
10 | self.max_items = max_items
11 |
12 | def get_connection(self):
13 | return self.connection
14 |
15 | def get_bucket(self):
16 | return self.connection.bucket(self.bucket)
17 |
18 | def _datetime_to_js(self, dt):
19 |         return int(dt.strftime("%s") + dt.strftime("%f")[:3])  # epoch millis; "%s" is a platform-specific strftime extension
20 |
21 | def _merge_two(self, obj1, obj2):
22 | """
23 | Merges two data dictionaries, respecting the one with the most recent
24 | modified time per item.
25 | """
26 | for uniq_ident in obj2.keys():
27 | if (uniq_ident not in obj1) \
28 | or (obj1[uniq_ident]['modified'] \
29 | < obj2[uniq_ident]['modified']):
30 | obj1[uniq_ident] = obj2[uniq_ident]
31 |
32 |         return obj1
33 |
34 | def _list_to_dict(self, l):
35 | if not l: return {}
36 | d = {}
37 | for o in l:
38 | d[o['id']] = o
39 | return d
40 |
41 | def _dict_to_list(self, d):
42 | if not d: return []
43 | l = d.values()
44 | reverse = self.order == 'desc'
45 | l.sort(key=lambda x: x['score'], reverse=reverse)
46 | return l
47 |
48 | def _list_to_data(self, l):
49 | """
50 | Coerces a list of timeline objects into the data the user cares about.
51 | """
52 | return [o.get('data', None) or o.get('id')
53 | for o in l
54 | if not o.get('deleted', False)]
55 |
56 | def _get_obj_and_data(self, key, write_merged=True):
57 | """
58 | Returns RiakObject with proper vclock set and dictionary of merged entries.
59 |
60 | NOTE: The data on the object itself should not be used, the object is
61 | returned only so it can be used later for updates.
62 | """
63 | bucket = self.get_bucket()
64 |
65 | obj = bucket.get(key)
66 | data = [self._list_to_dict(o.get_data()) for o
67 | in obj.get_siblings()
68 | if o.get_data() is not None]
69 |
70 | obj_data = obj.get_data()
71 | if obj_data is not None:
72 | data.append(self._list_to_dict(obj_data))
73 |
74 |         # if we have no data or only 1 sibling we can safely return
75 | # it without merging
76 | if len(data) == 0:
77 | return obj, {}
78 | elif len(data) == 1:
79 | return obj, data[0]
80 |
81 | resolved_data = reduce(self._merge_two, data)
82 | # NOTE: is this really the only way to fix a conflict in the
83 | # python riak library?
84 | try:
85 | obj._vclock = obj.get_sibling(0).vclock()
86 | except IndexError:
87 | pass
88 | else:
89 | if write_merged:
90 | obj.set_data(self._dict_to_list(resolved_data)[:self.max_items])
91 | obj.store()
92 |
93 | return obj, resolved_data
94 |
95 | def get(self, key, raw=False):
96 | """
97 | Returns timeline as list.
98 | """
99 | # TODO: Optimize this so we don't have to coerce
100 | # list->dict->list for the common case.
101 | result = self._dict_to_list(self._get_obj_and_data(key)[1])
102 | if raw:
103 | return result
104 | return self._list_to_data(result)
105 |
106 |     def _make_op(action):  # class-body factory that builds the add/delete methods below
107 | assert action in ('add', 'delete')
108 | def _op(self, key, uniq_ident, obj_score, obj_data=None, raw=False):
109 | now = self._datetime_to_js(datetime.datetime.utcnow())
110 | obj, data = self._get_obj_and_data(key, write_merged=False)
111 |
112 | if isinstance(obj_score, datetime.datetime):
113 | obj_score = self._datetime_to_js(obj_score)
114 |
115 | new_item = {'id': uniq_ident,
116 | 'score': obj_score,
117 | 'modified': now}
118 | if obj_data:
119 | new_item['data'] = obj_data
120 | if action == 'delete':
121 | new_item['deleted'] = True
122 |
123 | existing = data.get(uniq_ident, None)
124 | if existing:
125 |             if existing['modified'] < now:  # last write wins: overwrite only if this op is newer
126 | data[uniq_ident] = new_item
127 | else:
128 | data[uniq_ident] = new_item
129 |
130 | timeline = self._dict_to_list(data)[:self.max_items]
131 | obj.set_data(timeline)
132 | obj.store()
133 | if raw:
134 | return timeline
135 | return self._list_to_data(timeline)
136 | return _op
137 |
138 | add = _make_op("add")
139 | delete = _make_op("delete")
140 |
--------------------------------------------------------------------------------
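
The conflict handling described in the README's "Why?" section reduces to
Timeline._merge_two above: a per-item last-write-wins merge across sibling
values, keyed on each entry's "modified" timestamp. A minimal sketch with
hand-built sibling dictionaries in the shape _make_op produces; no Riak
connection is needed to exercise the pure merge logic:

    from timak.timelines import Timeline

    tl = Timeline()  # connection unused by _merge_two

    sibling_a = {1: {'id': 1, 'score': 100, 'modified': 5}}
    sibling_b = {1: {'id': 1, 'score': 100, 'modified': 7, 'deleted': True},
                 2: {'id': 2, 'score': 200, 'modified': 6}}

    merged = tl._merge_two(sibling_a, sibling_b)
    assert merged[1]['deleted']  # item 1: the newer (deleted) entry wins
    assert 2 in merged           # item 2: unioned in from the other sibling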