├── tests
│   ├── __init__.py
│   └── tests.py
├── .gitignore
├── timak
│   ├── __init__.py
│   └── timelines.py
├── LICENSE
├── setup.py
└── README.rst
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
/dist
/timak.egg-info
--------------------------------------------------------------------------------
/timak/__init__.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import

import pkg_resources

from .timelines import Timeline


VERSION = tuple(map(int, pkg_resources.get_distribution('timak').version.split('.')))
__version__ = VERSION
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2011 Brett Hoerner

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages


version = "0.1.1"


setup(name='timak',
      version=version,
      description='Timelines (activity streams) backed by Riak',
      author='Brett Hoerner',
      author_email='brett@bretthoerner.com',
      url='http://github.com/bretthoerner/timak',
      packages=find_packages(),
      test_suite='unittest2.collector',
      install_requires=['riak'],
      tests_require=['unittest2'],
      classifiers=[
          "Intended Audience :: Developers",
          "Intended Audience :: System Administrators",
          "Operating System :: OS Independent",
          "Topic :: Software Development"
      ],
      license="Apache License (2.0)",
      )
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
=====
timak
=====

timak is a Python library for storing timelines (activity streams) in Riak.
It is very alpha and rough around the edges.

It is loosely based on my understanding of Yammer's Streamie.

Example
-------

Timelines are unique sets of objects (unique by the ID you provide) ordered
by a score (typically a ``datetime``) that you also provide. They are
bounded, so the lowest-ranked items fall off the end when a (user-defined)
capacity is reached.

>>> from datetime import datetime
>>> import riak
>>> from timak.timelines import Timeline

>>> conn = riak.RiakClient()

>>> tl = Timeline(connection=conn, max_items=3)

>>> # tl.add(key, unique_id, score)
>>> tl.add("brett:tweets", 1, datetime(2011, 1, 1))
[1]
>>> tl.add("brett:tweets", 2, datetime(2011, 1, 2))
[2, 1]
>>> tl.add("brett:tweets", 3, datetime(2011, 1, 3))
[3, 2, 1]
>>> tl.add("brett:tweets", 4, datetime(2011, 1, 4))
[4, 3, 2]
>>> tl.delete("brett:tweets", 2, datetime(2011, 1, 2))
[4, 3]

If you provide a ``datetime.datetime`` as the score, Timak will
automatically convert it to a sortable numeric value.

As you can see, the default order is descending by the score you provide,
and object IDs are returned by default. You can also provide an
``obj_data`` argument (it must be JSON-serializable), which will be
returned instead.

>>> tl.add("brett:tweets", 5, datetime(2011, 1, 5), obj_data={'body': 'Hello world, this is my first tweet'})
[{'body': 'Hello world, this is my first tweet'}, 4, 3]

Why?
----

I needed *highly available*, *linearly scalable* timelines where readers and
writers *don't block* one another. Because Riak is a Dynamo-based system,
multiple writers can update a single value and I can merge the conflicts on
a later read. I can also add a machine to the cluster for more throughput,
and since a read is simply fetching a denormalized timeline by key, it
should be very fast.

So what? I could write this in...
---------------------------------

PostgreSQL or MySQL
```````````````````

This would be a very simple table in an RDBMS. It could even be boundless
(though without some PL/SQL hackery, large ``OFFSET`` values are very
expensive). You'd be hitting large indexes instead of fetching values
directly by key. The biggest problem is that it all has to fit on a single
system, unless you manually shard the data (and re-shard if you ever
outgrow that size). Plus you'd have to deal with availability using read
slaves and failover.

MongoDB
```````

The only possible difference I see from the RDBMSs above is that you could
use Mongo's "auto-sharding." If that's your thing, and you trust it, then I
wish you the best of luck. You may want to read this.

Redis
`````

You can fake timelines in Redis using a list or a sorted set. Like an
RDBMS, you have to handle all of the sharding yourself, re-shard on growth,
and use slaves and failover for availability. On top of that, and even more
critical for my use case: all of your timelines would have to fit in RAM.
If you have this problem and that kind of money, please send me some.

Cassandra
`````````

Probably another great fit. You could even store much longer timelines,
though I'm not sure what the cost is of doing a ``SELECT`` with ``OFFSET``
equivalent on the columns in a Cassandra row.

TODO
----

1. Add a better API with cursors (last seen ``obj_date``?) for pagination.
2. Built-in Django support for updates on ``post_save`` and ``post_delete``.
3. Compress values.
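
Concurrent writers
------------------

Because conflicting sibling values are merged on read, two clients can
update the same timeline without coordinating. A minimal sketch (mirroring
``test_multi_writers`` in ``tests/tests.py``, and assuming a local Riak
node with ``allow_mult`` enabled on the bucket):

>>> t1 = Timeline(connection=riak.RiakClient(), max_items=10)
>>> t2 = Timeline(connection=riak.RiakClient(), max_items=10)
>>> t1.add("alice:tweets", 1, datetime(2011, 1, 1))
[1]
>>> t2.add("alice:tweets", 2, datetime(2011, 1, 2))
[2, 1]
>>> t1.get("alice:tweets")
[2, 1]

If you need the stored metadata rather than just IDs (or ``obj_data``),
both ``get`` and ``add`` accept ``raw=True`` and return the full item
dictionaries, including each item's score and last-modified time.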
--------------------------------------------------------------------------------
/tests/tests.py:
--------------------------------------------------------------------------------
from datetime import datetime, timedelta
import unittest2

import riak
from timak.timelines import Timeline


class TimakTest(unittest2.TestCase):
    def setUp(self):
        self.key = self.bucket = "test-timak"
        self.c1 = riak.RiakClient()
        self.c2 = riak.RiakClient()

        self.b1 = self.c1.bucket(self.bucket)
        self.b2 = self.c2.bucket(self.bucket)

        self.c1.bucket(self.bucket).set_allow_multiples(True)

    def tearDown(self):
        # Resolve conflicts / clear data between tests.
        riak.RiakClient().bucket(self.bucket).get(self.key).delete()

    def test_allow_multiples(self):
        """
        Verify that sibling objects are created on a conflict.
        """
        o1 = self.b1.get(self.key)
        o2 = self.b2.get(self.key)

        o1.set_data("object-1")
        o1.store()
        o2.set_data("object-2")
        o2.store()

        conflicted = self.b1.get(self.key)
        siblings = filter(bool, (s.get_data() for s in conflicted.get_siblings()))
        self.assertEqual(len(siblings), 2)

    def test_max_items(self):
        """
        Verify items > max_items are removed.
        """
        timeline = Timeline(connection=self.c1, bucket=self.bucket, max_items=3)
        now = datetime.utcnow()

        timeline.add(self.key, 1, now)
        timeline.add(self.key, 2, now)
        timeline.add(self.key, 3, now)
        self.assertEqual(len(timeline.get(self.key)), 3)

        timeline.add(self.key, 4, now)
        self.assertEqual(len(timeline.get(self.key)), 3)

    def test_delete(self):
        timeline = Timeline(connection=self.c1, bucket=self.bucket, max_items=3)
        now = datetime.utcnow()

        timeline.add(self.key, 1, now)
        self.assertEqual(len(timeline.get(self.key)), 1)

        timeline.delete(self.key, 1, now)
        self.assertEqual(len(timeline.get(self.key)), 0)

    def test_multi_writers(self):
        now = datetime.utcnow()

        t1 = Timeline(connection=self.c1, bucket=self.bucket, max_items=10)
        t2 = Timeline(connection=self.c2, bucket=self.bucket, max_items=10)

        t1.add(self.key, 1, now)
        t2.add(self.key, 2, now + timedelta(minutes=1))

        self.assertEqual(t1.get(self.key), [2, 1])

    def test_timestamp_scores(self):
        timeline = Timeline(connection=self.c1, bucket=self.bucket, max_items=3)
        now = datetime.utcnow()

        timeline.add(self.key, 1, now)
        timeline.add(self.key, 2, now + timedelta(seconds=1))
        timeline.add(self.key, 3, now + timedelta(seconds=2))
        results = timeline.get(self.key)
        self.assertEqual(len(results), 3)
        self.assertEqual(results[0], 3)
        self.assertEqual(results[1], 2)
        self.assertEqual(results[2], 1)

    def test_non_timestamp_scores(self):
        timeline = Timeline(connection=self.c1, bucket=self.bucket, max_items=3)

        timeline.add(self.key, 1, 3)
        timeline.add(self.key, 2, 2)
        timeline.add(self.key, 3, 1)
        results = timeline.get(self.key)
        self.assertEqual(len(results), 3)
        self.assertEqual(results[0], 1)
        self.assertEqual(results[1], 2)
        self.assertEqual(results[2], 3)

    def test_score_scoping(self):
        timeline = Timeline(connection=self.c1, bucket=self.bucket, max_items=3)

        timeline.add(self.key, 1, 3)
        timeline.add(self.key, 2, 2)
        timeline.add(self.key, 3, 1)
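        # With order='desc' (the default), the highest score sorts first:
        # item 1 (score 3), then item 2 (score 2), then item 3 (score 1).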
        results = timeline.get(self.key)
        self.assertEqual(len(results), 3)
        self.assertEqual(results[0], 1)
        self.assertEqual(results[1], 2)
        self.assertEqual(results[2], 3)

        # An item scored below everything already in a full timeline
        # never makes it in.
        timeline.add(self.key, 4, 0)
        results = timeline.get(self.key)
        self.assertEqual(len(results), 3)
        self.assertEqual(results[0], 1)
        self.assertEqual(results[1], 2)
        self.assertEqual(results[2], 3)

        # An item with the highest score pushes the lowest one out.
        timeline.add(self.key, 5, 5)
        results = timeline.get(self.key)
        self.assertEqual(len(results), 3)
        self.assertEqual(results[0], 5)
        self.assertEqual(results[1], 1)
        self.assertEqual(results[2], 2)
--------------------------------------------------------------------------------
/timak/timelines.py:
--------------------------------------------------------------------------------
import datetime


class Timeline(object):
    def __init__(self, connection=None, bucket="timelines", order='desc',
                 max_items=1000):
        self.connection = connection
        self.bucket = bucket
        self.order = order
        self.max_items = max_items

    def get_connection(self):
        return self.connection

    def get_bucket(self):
        return self.connection.bucket(self.bucket)

    def _datetime_to_js(self, dt):
        # Milliseconds since the epoch as an int, i.e. a JavaScript-style
        # timestamp. Note that strftime("%s") is a glibc extension and is
        # not portable.
        return int(dt.strftime("%s") + dt.strftime("%f")[:3])

    def _merge_two(self, obj1, obj2):
        """
        Merges two data dictionaries, keeping whichever version of each
        item has the most recent modified time.
        """
        for uniq_ident in obj2.keys():
            if (uniq_ident not in obj1) \
                    or (obj1[uniq_ident]['modified']
                        < obj2[uniq_ident]['modified']):
                obj1[uniq_ident] = obj2[uniq_ident]

        return obj1

    def _list_to_dict(self, l):
        if not l:
            return {}
        d = {}
        for o in l:
            d[o['id']] = o
        return d

    def _dict_to_list(self, d):
        if not d:
            return []
        l = d.values()
        reverse = self.order == 'desc'
        l.sort(key=lambda x: x['score'], reverse=reverse)
        return l

    def _list_to_data(self, l):
        """
        Coerces a list of timeline items into the data the user cares
        about, skipping tombstoned (deleted) items.
        """
        return [o.get('data', None) or o.get('id')
                for o in l
                if not o.get('deleted', False)]

    def _get_obj_and_data(self, key, write_merged=True):
        """
        Returns a RiakObject with the proper vclock set and a dictionary
        of merged entries.

        NOTE: The data on the object itself should not be used; the object
        is returned only so it can be used later for updates.
        """
        bucket = self.get_bucket()

        obj = bucket.get(key)
        data = [self._list_to_dict(o.get_data()) for o
                in obj.get_siblings()
                if o.get_data() is not None]

        obj_data = obj.get_data()
        if obj_data is not None:
            data.append(self._list_to_dict(obj_data))

        # If we have no data or only one sibling we can safely return
        # it without merging.
        if len(data) == 0:
            return obj, {}
        elif len(data) == 1:
            return obj, data[0]

        resolved_data = reduce(self._merge_two, data)
        # NOTE: is this really the only way to fix a conflict in the
        # Python riak library?
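        # Adopt one sibling's vclock so that the subsequent store()
        # resolves the conflict instead of creating another sibling.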
        try:
            obj._vclock = obj.get_sibling(0).vclock()
        except IndexError:
            pass
        else:
            if write_merged:
                obj.set_data(self._dict_to_list(resolved_data)[:self.max_items])
                obj.store()

        return obj, resolved_data

    def get(self, key, raw=False):
        """
        Returns the timeline as a list.
        """
        # TODO: Optimize this so we don't have to coerce
        # list->dict->list for the common case.
        result = self._dict_to_list(self._get_obj_and_data(key)[1])
        if raw:
            return result
        return self._list_to_data(result)

    def _make_op(action):
        # Builds the shared implementation behind add() and delete().
        # Deletes are tombstones: the item is kept with deleted=True so
        # the merge logic can propagate the deletion to siblings.
        assert action in ('add', 'delete')

        def _op(self, key, uniq_ident, obj_score, obj_data=None, raw=False):
            now = self._datetime_to_js(datetime.datetime.utcnow())
            obj, data = self._get_obj_and_data(key, write_merged=False)

            if isinstance(obj_score, datetime.datetime):
                obj_score = self._datetime_to_js(obj_score)

            new_item = {'id': uniq_ident,
                        'score': obj_score,
                        'modified': now}
            if obj_data:
                new_item['data'] = obj_data
            if action == 'delete':
                new_item['deleted'] = True

            # Last write wins per item: only replace an existing entry
            # if this update is newer.
            existing = data.get(uniq_ident, None)
            if not existing or existing['modified'] < now:
                data[uniq_ident] = new_item

            timeline = self._dict_to_list(data)[:self.max_items]
            obj.set_data(timeline)
            obj.store()
            if raw:
                return timeline
            return self._list_to_data(timeline)
        return _op

    add = _make_op("add")
    delete = _make_op("delete")
--------------------------------------------------------------------------------
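
A note on scores: ``_datetime_to_js`` above encodes a ``datetime`` as integer
milliseconds since the epoch, but its ``strftime("%s")`` trick is
glibc-specific. A portable equivalent for naive UTC datetimes (a sketch for
illustration, not part of the library) might look like:

    import calendar
    import datetime

    def datetime_to_ms(dt):
        # Integer milliseconds since the Unix epoch for a naive UTC
        # datetime, matching the shape of Timeline._datetime_to_js
        # without relying on the non-portable strftime("%s").
        return calendar.timegm(dt.utctimetuple()) * 1000 + dt.microsecond // 1000

    print(datetime_to_ms(datetime.datetime(2011, 1, 1)))  # 1293840000000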