├── .dir-locals.el
├── .gitignore
├── LICENSE
├── README.md
├── examples
│   ├── blog
│   │   ├── main.py
│   │   ├── static
│   │   │   └── blog.css
│   │   ├── tags.yaml
│   │   └── templates
│   │       ├── base.html
│   │       ├── main.html
│   │       └── post.html
│   └── mysqlbench
│       ├── bench.yaml
│       ├── mysqlbench.py
│       └── plot.py
├── schemaless
│   ├── __init__.py
│   ├── batch.py
│   ├── column.py
│   ├── datastore.py
│   ├── guid.py
│   ├── index.py
│   ├── log.py
│   └── orm
│       ├── __init__.py
│       ├── column.py
│       ├── converters.py
│       ├── document.py
│       ├── index.py
│       ├── session.py
│       └── util.py
├── setup.py
└── tests
    ├── tables.sql
    └── tests.py

--------------------------------------------------------------------------------
/.dir-locals.el:
--------------------------------------------------------------------------------
1 | ((nil . ((indent-tabs-mode . nil)
2 |          (tab-width . 4)
3 |          (fill-column . 80)))
4 |  (sh-mode . ((tab-width . 2)
5 |              (indent-tabs-mode . nil)))
6 |  (python-mode . ((tab-width . 4)
7 |                  (indent-tabs-mode . nil)
8 |                  (python-indent . 4))))
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[co]
2 | *.sw[nop]
3 | *~
4 | build
5 | \#*#
6 | \.\#*
7 | *.csv
8 | *.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010, Evan Klitzke
2 |
3 | Permission to use, copy, modify, and/or distribute this software for any
4 | purpose with or without fee is hereby granted, provided that the above
5 | copyright notice and this permission notice appear in all copies.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **Disclaimer: this is alpha-quality code, and the API is not yet stable**
2 |
3 | Introduction
4 | ============
5 |
6 | Schemaless is a Python module that implements the pattern described by Bret
7 | Taylor in his post
8 | [How FriendFeed uses MySQL to store schema-less data](http://bret.appspot.com/entry/how-friendfeed-uses-mysql). There
9 | are a couple of other Python modules out there that do this already. Here's how
10 | schemaless is different:
11 |
12 | * Only MySQL is supported. That said, I'd love to add SQLite support in the
13 |   future.
14 | * Sharding isn't yet supported. Should be pretty straightforward to implement,
15 |   though.
16 | * There's an optional "ORM" (which isn't really relational) implemented as
17 |   `schemaless.orm`. The "ORM" really is optional, and the interface described
18 |   by FriendFeed is fully usable and decoupled from the session/object stuff.
19 | * The ORM is designed to be mostly declarative and easy to use. That means that
20 |   you can say, "I have a document type `User`, and please can I have an
21 |   index on `(user_id)`, and I'd also like an index on `(first_name, last_name)`
22 |   please."
The ORM will then create the necessary index tables and 23 | automatically update them when you add new users; it will also know how to 24 | pick the most specific index, given an arbitrary query. 25 | 26 | Basic Usage 27 | =========== 28 | 29 | The code exported under the `schemaless` module exactly mimics the behavior and 30 | interface described by FriendFeed. 31 | 32 | Example 33 | ------- 34 | 35 | Consider the following MySQL database schema: 36 | 37 | CREATE TABLE entities ( 38 | added_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, 39 | id BINARY(16) NOT NULL, 40 | updated TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, 41 | body MEDIUMBLOB, 42 | UNIQUE KEY (id), 43 | KEY (updated) 44 | ) ENGINE=InnoDB; 45 | 46 | CREATE TABLE index_user_id ( 47 | entity_id BINARY(16) NOT NULL UNIQUE, 48 | user_id CHAR(32) NOT NULL, 49 | PRIMARY KEY (user_id, entity_id) 50 | ) ENGINE=InnoDB; 51 | 52 | CREATE TABLE index_user_name ( 53 | entity_id BINARY(16) NOT NULL UNIQUE, 54 | first_name VARCHAR(255) NOT NULL, 55 | last_name VARCHAR(255) NOT NULL, 56 | PRIMARY KEY (first_name, last_name, entity_id) 57 | ) ENGINE=InnoDB; 58 | 59 | CREATE TABLE index_foo ( 60 | entity_id BINARY(16) NOT NULL UNIQUE, 61 | bar INTEGER NOT NULL, 62 | PRIMARY KEY (bar, entity_id) 63 | ) ENGINE=InnoDB; 64 | 65 | The meaning of all of these tables should be clear to you if you've read Bret's 66 | blog post. The following code is a simple example of the interface that 67 | Schemaless provides: 68 | 69 | import schemaless 70 | from schemaless import c 71 | 72 | ds = schemaless.DataStore(mysql_shards=['localhost:3306'], user='foo', password='foo', database='foo') 73 | 74 | # declare which indexes are available 75 | user = ds.define_index('index_user_id', ['user_id']) 76 | user_name = ds.define_index('index_user_name', ['first_name', 'last_name']) 77 | foo = ds.define_index('index_foo', ['bar']) 78 | 79 | # automatically knows that index entries should be created in index_user_id and 80 | # index_user_name, based on the keys in the row given 81 | row = ds.put({'first_name': 'evan', 'last_name': 'klitzke', 'user_id': schemaless.guid()}) 82 | 83 | # query based on user_id, using the index defined by 'index_user_id' 84 | print user.query(c.user_id == row.user_id) 85 | 86 | # query based on first/last name, using the index defined by 'index_user_name' 87 | print user_name.query(c.first_name == 'evan', c.last_name == 'klitzke') 88 | 89 | ORM Layer 90 | ========= 91 | 92 | There's an optional ORM layer, exported via the module `schemaless.orm`. When 93 | you use the ORM layer you can use indexes declaratively, and Schemaless can 94 | automatically pick the correct index to use based on your query. The ORM layer 95 | also knows how to do queries when a full index isn't available (e.g. if you add 96 | a query restriction that isn't fully covered by an index). 97 | 98 | Example 99 | ------- 100 | 101 | The best way to get a feel for the ORM is to look at the example in 102 | `examples/blog/main.py`. This is the implementation of a trivial "blog" 103 | application that uses Schemaless and Tornado. It's only about a hundred lines of 104 | code, and shows a few different working parts interacting together. 105 | 106 | Adding Indexes 107 | ============== 108 | 109 | There's a class called `IndexUpdater` exported by the `schemaless` module that 110 | provides a basic template for batches that add/update/prune indexes. 
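To give a concrete feel for the shape of such a batch, here is the example from
the module docstring of `schemaless/batch.py`, lightly condensed (the shard,
user, and password settings are placeholders):

    import schemaless

    class AddUserIdIndex(schemaless.IndexUpdater):

        def initialize(self):
            super(AddUserIdIndex, self).initialize()
            self.datastore = schemaless.DataStore(mysql_shards=['localhost:3306'],
                                                  user='test', password='test',
                                                  database='test')
            self.conn = self.datastore.connection

        def process_row(self, row, entity):
            if entity.get('user_id'):
                self.conn.execute('INSERT IGNORE INTO index_user_id (entity_id, user_id) VALUES (%s, %s)',
                                  schemaless.to_raw(entity.id), entity.user_id)

    if __name__ == '__main__':
        AddUserIdIndex().start()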
It's
111 | easiest to understand how `IndexUpdater` works by reading the source in
112 | `schemaless/batch.py`; the module docstring there walks through the full
113 | version of this example.
--------------------------------------------------------------------------------
/examples/blog/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import optparse
3 | import datetime
4 |
5 | import tornado.web
6 | import tornado.ioloop
7 | import tornado.httpserver
8 |
9 | import schemaless
10 | from schemaless import c
11 | from schemaless import orm
12 |
13 | dirname = os.path.dirname(__file__)
14 |
15 | ##############
16 | # ORM Things
17 | ##############
18 |
19 | datastore = schemaless.DataStore(mysql_shards=['localhost:3306'], user='test', password='test', database='test')
20 | session = orm.Session(datastore)
21 | Base = orm.make_base(session, tags_file=os.path.join(dirname, 'tags.yaml'))
22 |
23 | class Post(Base):
24 |     _columns = [
25 |         orm.String('title', 255, required=True),
26 |         orm.Text('content', required=True),
27 |         orm.DateTime('time_created', default=datetime.datetime.now)
28 |     ]
29 |
30 |     _indexes = [['time_created']]
31 |
32 |     @classmethod
33 |     def new_post(cls, title, content):
34 |         return cls(title=title, content=content).save()
35 |
36 |     @property
37 |     def comments(self):
38 |         """Get all the comments for this post, ordered by time created."""
39 |         if not hasattr(self, '_comments'):
40 |             comments = Comment.query(c.post_id == self.id)
41 |             self._comments = sorted(comments, key=lambda c: c.time_created)
42 |         return self._comments
43 |
44 | class Comment(Base):
45 |     _columns = [
46 |         orm.Guid('post_id', required=True),
47 |         orm.String('author', 255),
48 |         orm.Text('content', required=True),
49 |         orm.DateTime('time_created', default=datetime.datetime.now)
50 |     ]
51 |
52 |     _indexes = [['post_id']]
53 |
54 |     @classmethod
55 |     def reply(cls, post_id, author, content):
56 |         return cls(post_id=post_id, author=author, content=content).save()
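# With the declarations above, the ORM metaclass (schemaless/orm/document.py)
# creates any missing index tables at class-definition time, one per entry in
# _indexes, named index_<tag>_<md5 of the columns> (the tags 1 and 2 come from
# tags.yaml), and routes each query to the most specific usable index.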
57 |
58 | ##############
59 | # Tornado Things
60 | ##############
61 |
62 | class MainHandler(tornado.web.RequestHandler):
63 |
64 |     def get(self):
65 |         posts = sorted(Post.all(), key=lambda x: x.time_created, reverse=True)
66 |         self.render('main.html', title='Blog', posts=posts)
67 |
68 | class PostHandler(tornado.web.RequestHandler):
69 |
70 |     def get(self):
71 |         self.render('post.html', title='New Post')
72 |
73 |     def post(self):
74 |         title = self.get_argument('title')
75 |         content = self.get_argument('content')
76 |         Post.new_post(title, content)
77 |         self.redirect('/')
78 |
79 | class CommentHandler(tornado.web.RequestHandler):
80 |
81 |     def post(self):
82 |         post_id = self.get_argument('post_id')
83 |         author = self.get_argument('author')
84 |         content = self.get_argument('content')
85 |         Comment.reply(post_id, author, content)
86 |         self.redirect('/')
87 |
88 | settings = {
89 |     'static_path': os.path.join(dirname, 'static'),
90 |     'template_path': os.path.join(dirname, 'templates'),
91 |     'cookie_secret': '61oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=',
92 |     'xsrf_cookies': True,
93 | }
94 |
95 | application = tornado.web.Application([
96 |     ('/', MainHandler),
97 |     ('/post', PostHandler),
98 |     ('/comment', CommentHandler)], **settings)
99 |
100 | if __name__ == '__main__':
101 |     parser = optparse.OptionParser()
102 |     parser.add_option('-p', '--port', type='int', default=8888, help='which port to listen on')
103 |     parser.add_option('-c', '--clear', action='store_true', default=False, help='clear all tables when starting')
104 |     opts, args = parser.parse_args()
105 |
106 |     if opts.clear:
107 |         tables = set()
108 |         for d in datastore.connection.query('SHOW TABLES'):
109 |             tables |= set(d.values())
110 |         for t in tables:
111 |             datastore.connection.execute('DELETE FROM %s' % t)
112 |
113 |     http_server = tornado.httpserver.HTTPServer(application)
114 |     http_server.listen(opts.port)
115 |     print 'blog waiting at http://localhost:%d' % opts.port
116 |     tornado.ioloop.IOLoop.instance().start()
--------------------------------------------------------------------------------
/examples/blog/static/blog.css:
--------------------------------------------------------------------------------
1 | h1 { font-weight: bold; font-size: 150%; }
2 | h2 { font-weight: bold; font-size: 120%; }
3 | h3 { font-weight: bold; font-size: 110%; }
4 |
5 | #bd { margin: 1em; max-width: 600px; }
6 |
7 | fieldset { border: 1px solid black; padding: 0.5em; }
8 | legend { font-style: italic; }
9 | .datetime { font-style: italic; }
10 | .comment_box { padding: 0.5em; }
11 | .secret { color: #777; }
12 | a { color: inherit; }
--------------------------------------------------------------------------------
/examples/blog/tags.yaml:
--------------------------------------------------------------------------------
1 | Post: 1
2 | Comment: 2
--------------------------------------------------------------------------------
/examples/blog/templates/base.html:
--------------------------------------------------------------------------------
1 | <html>
2 | <head>
3 |   <title>{{escape(title)}}</title>
4 |   <link rel="stylesheet" type="text/css" href="/static/blog.css">
5 | </head>
6 | <body>
7 | <div id="bd">
8 |   {% block body %}{% end %}
9 | </div>
10 | </body>
11 | </html>
--------------------------------------------------------------------------------
/examples/blog/templates/main.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block body %}

4 | <h1>Schemaless Blog</h1>
5 | <a href="/post">click here to write a new post</a>
6 | {% for post in posts %}
7 | <h2>{{escape(post.title)}}</h2>
8 | <div class="datetime">{{post.time_created.strftime('%Y-%m-%d %H:%M')}}</div>
9 | <div>
10 |   {{post.content}} {% comment unescaped!!! %}
11 | </div>
12 | {% if post.comments %}
13 | <h3>Comments</h3>
14 | {% for comment in post.comments %}
15 | <div class="comment_box">
16 |   <span class="secret">{{escape(comment.author)}} ({{comment.time_created.strftime('%Y-%m-%d %H:%M')}}):</span>
17 |   <div>
18 |     {{escape(comment.content)}}
19 |   </div>
20 | </div>
21 | {% end %}
22 | {% end %}
23 | <form action="/comment" method="post">
24 | <fieldset>
25 |   <legend>Leave a Comment</legend>
26 |   {{xsrf_form_html()}}
27 |   <input type="hidden" name="post_id" value="{{post.id}}">
28 |   <div>name: <input type="text" name="author"></div>
29 |   <div><textarea name="content" rows="4" cols="40"></textarea></div>
30 |   <div><input type="submit" value="post comment"></div>
31 | </fieldset>
32 | </form>
33 |
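<!-- the comment form above posts to CommentHandler in main.py, which calls Comment.reply and redirects home -->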
34 | {% end %}
35 | {% end %}
--------------------------------------------------------------------------------
/examples/blog/templates/post.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block body %}
4 | <a href="/">return to blog</a>
5 | <form action="/post" method="post">
6 | <fieldset>
7 |   <legend>new post</legend>
8 |   {{xsrf_form_html()}}
9 |
10 |   <div>title: <input type="text" name="title"></div>
11 |   <div><textarea name="content" rows="10" cols="60"></textarea></div>
12 |
13 |   <div><input type="submit" value="new post"></div>
14 | </fieldset></form>
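<!-- posts to PostHandler in main.py, which calls Post.new_post and redirects to / -->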
15 | {% end %}
16 |
--------------------------------------------------------------------------------
/examples/mysqlbench/bench.yaml:
--------------------------------------------------------------------------------
1 | user: test
2 | passwd: test
3 | db: test
--------------------------------------------------------------------------------
/examples/mysqlbench/mysqlbench.py:
--------------------------------------------------------------------------------
1 | """MySQL benchmark tool, for comparing different table schemas.
2 |
3 | Some notes:
4 |  * run like: python mysqlbench.py my_config_file.yaml
5 |  * there's a few different options to change things up, invoke with -h to see
6 |    them
7 |  * you should run this with a total of at least a quarter million rows or
8 |    so (default is one million) to ensure that you see the slowdown from MySQL
9 |    checking uniqueness constraints; i.e., make sure you see a slowdown as
10 |    iterations increase for the schemas that create tables with a unique uuid
11 |    column
12 |  * there's some overhead from having to generate uuids (which is done by reading
13 |    16 bytes from /dev/urandom); IME the benchmark is still very much
14 |    MySQL-bound, but if you're concerned you can pre-allocate an array of uuids
15 |    in the bench() function, at the cost of using gobs of memory
16 |  * if you're on a machine without dedicated hardware (e.g. a VPS), you'll
17 |    probably see interesting things with transaction times fluctuating wildly as
18 |    your instance gets access to hardware
19 |
20 | An example yaml config file (ignore the lines starting with ---):
21 |
22 | --- start yaml file ---
23 | user: test
24 | passwd: test
25 | db: test
26 | --- end yaml file ---
27 |
28 | """
29 |
30 | import os
31 | import csv
32 | import math
33 | import time
34 | import yaml
35 | import optparse
36 | import MySQLdb
37 |
38 | OVERALL_TIMES = []
39 |
40 | def drop_test_entities(conn):
41 |     c = conn.cursor()
42 |     c.execute('SELECT table_name FROM information_schema.tables WHERE table_schema = DATABASE() AND table_name = %s', ('test_entities',))
43 |     if c.fetchone():
44 |         c.execute('DROP TABLE test_entities')
45 |
46 | def create_table(conn, lines, data):
47 |     q = []
48 |     q.append('CREATE TABLE test_entities (')
49 |     q.extend(['    ' + l for l in lines])
50 |     q.append(') ENGINE=InnoDB')
51 |
52 |     query = '\n'.join(q)
53 |     c = conn.cursor()
54 |     print query
55 |     c.execute(query)
56 |     if data:
57 |         c.execute('ALTER TABLE test_entities ADD COLUMN payload MEDIUMBLOB')
58 |         print 'ALTER TABLE test_entities ADD COLUMN payload MEDIUMBLOB'
59 |
60 | def increment_worker(c, data):
61 |     os.urandom(16) # ensure that this has the same overhead as uuid_worker;
62 |                    # comment out if you don't like this fairness
63 |     if data:
64 |         c.execute('INSERT INTO test_entities (added_id, payload) VALUES (NULL, %s)', (data,))
65 |     else:
66 |         c.execute('INSERT INTO test_entities (added_id) VALUES (NULL)')
67 |
68 | def uuid_worker(c, data):
69 |     if data:
70 |         c.execute('INSERT INTO test_entities (id, payload) VALUES (%s, %s)', (os.urandom(16), data))
71 |     else:
72 |         c.execute('INSERT INTO test_entities (id) VALUES (%s)', (os.urandom(16),))
73 |
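# bench() drops and recreates test_entities with the given schema lines, then
# times opts.num_iterations transactions of opts.batch_size inserts each,
# printing the per-transaction times plus average/median/stddev at the end.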
74 | def bench(name, opts, conn, data, schema, worker=uuid_worker):
75 |     drop_test_entities(conn)
76 |     print name
77 |     print '=' * len(name)
78 |     create_table(conn, schema, data=data)
79 |     if opts.sleep:
80 |         time.sleep(opts.sleep)
81 |     times = []
82 |     c = conn.cursor()
83 |     for x in xrange(opts.num_iterations):
84 |         if not opts.autocommit:
85 |             c.execute('SET TRANSACTION ISOLATION LEVEL REPEATABLE READ')
86 |         ts = time.time()
87 |         for y in xrange(opts.batch_size):
88 |             worker(c, data)
89 |         conn.commit()
90 |         elapsed = time.time() - ts
91 |         times.append(elapsed)
92 |         print '% 4d %f' % (x + 1, elapsed)
93 |     OVERALL_TIMES.append((name, times))
94 |     sorted_times = sorted(times)
95 |     total = sum(times)
96 |     avg = total / len(times)
97 |     if len(times) % 2 == 0:
98 |         idx = len(times) / 2
99 |         med = (sorted_times[idx - 1] + sorted_times[idx]) / 2
100 |     else:
101 |         med = sorted_times[len(sorted_times) / 2]
102 |     dev = math.sqrt(sum((x - avg)**2 for x in times) / len(times))
103 |     print
104 |     print 'average = %1.3f' % (avg,)
105 |     print 'median = %1.3f' % (med,)
106 |     print 'std dev = %1.3f' % (dev,)
107 |     print
108 |     return times
109 |
110 | def main(opts, args):
111 |     start = time.time()
112 |     cfg = yaml.load(open(args[0]).read())
113 |     conn = MySQLdb.connect(**cfg)
114 |
115 |     opsys, host, kernel, dt, arch = os.uname()
116 |     print '%s %s' % (opsys, kernel)
117 |     print 'MySQL ' + conn.get_server_info()
118 |     print
119 |     print 'running %d iterations of %d inserts per txn (%d rows total)' % (opts.num_iterations, opts.batch_size, opts.num_iterations * opts.batch_size)
120 |     if opts.autocommit:
121 |         conn.cursor().execute('SET autocommit = 1')
122 |         print 'autocommit is ON'
123 |     else:
124 |         conn.cursor().execute('SET autocommit = 0')
125 |         print 'autocommit is OFF'
126 |     print
127 |
128 |     data = os.urandom(opts.data) if opts.data else None
129 |     bench('just auto_increment', opts, conn, data,
130 |           ['added_id INTEGER NOT NULL AUTO_INCREMENT,',
131 |            'PRIMARY KEY (added_id)'], increment_worker)
132 |
133 |     bench('auto_increment, key', opts, conn, data,
134 |           ['added_id INTEGER NOT NULL AUTO_INCREMENT,',
135 |            'id BINARY(16) NOT NULL,',
136 |            'PRIMARY KEY (added_id),',
137 |            'KEY (id)'])
138 |
139 |     bench('auto_increment, unique key', opts, conn, data,
140 |           ['added_id INTEGER NOT NULL AUTO_INCREMENT,',
141 |            'id BINARY(16) NOT NULL,',
142 |            'PRIMARY KEY (added_id),',
143 |            'UNIQUE KEY (id)'])
144 |
145 |     bench('w/o auto-increment, key', opts, conn, data,
146 |           ['id BINARY(16) NOT NULL,',
147 |            'KEY (id)'])
148 |
149 |     bench('w/o auto-increment, unique key', opts, conn, data,
150 |           ['id BINARY(16) NOT NULL,',
151 |            'UNIQUE KEY (id)'])
152 |
153 |     bench('w/o auto-increment, primary key', opts, conn, data,
154 |           ['id BINARY(16) NOT NULL,',
155 |            'PRIMARY KEY (id)'])
156 |
157 |     drop_test_entities(conn)
158 |     if opts.csv:
159 |         writer = csv.writer(open(opts.csv, 'w'))
160 |         names = ['cumulative'] + [name for name, _ in OVERALL_TIMES]
161 |         writer.writerow(names)
162 |         writer.writerow([0 for x in xrange(len(OVERALL_TIMES) + 1)])
163 |         for x in xrange(opts.num_iterations):
164 |             tot = (x + 1) * opts.batch_size
165 |             writer.writerow([tot] + [t[x] for _, t in OVERALL_TIMES])
166 |         print 'csv output is in %r' % (opts.csv,)
167 |     print 'total time was %1.3f seconds' % (time.time() - start)
168 |
169 | if __name__ == '__main__':
170 |     parser = optparse.OptionParser()
171 |     parser.add_option('-a', '--autocommit', action='store_true', default=False, help='Enable auto-commit')
172 |     parser.add_option('-b', '--batch-size', type='int', default=10000, help='How many rows to insert per txn')
173 |     parser.add_option('-c', '--csv', default=None, help='Store benchmark output in the specified CSV file')
174 |     parser.add_option('-d', '--data', type='int', default=0, help='Add a data column, with this size')
175 |     parser.add_option('-n', '--num-iterations', type='int', default=100, help='How many
iterations to run') 176 | parser.add_option('-s', '--sleep', type='int', default=10, help='How long to sleep between tests') 177 | opts, args = parser.parse_args() 178 | if len(args) != 1: 179 | parser.error('must pass exactly one argument, the path to the mysql config file') 180 | main(opts, args) 181 | -------------------------------------------------------------------------------- /examples/mysqlbench/plot.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import optparse 4 | 5 | import matplotlib 6 | from matplotlib import pyplot 7 | 8 | pyplot.rcParams.update({ 9 | 'backend': 'cairo', 10 | 'axes.labelsize': 10, 11 | 'legend.fontsize': 10, 12 | 'xtick.labelsize': 8, 13 | 'ytick.labelsize': 8, 14 | 'font.sans-serif': ['Droid Sans']}) 15 | 16 | def main(csv_name, opts): 17 | reader = iter(csv.reader(open(csv_name))) 18 | names = reader.next() 19 | data = dict((n, []) for n in names) 20 | for row in reader: 21 | for name, val in zip(names, row): 22 | data[name].append(float(val)) 23 | 24 | for name in names[1:]: 25 | xs, ys = [], [] 26 | for x in xrange(len(data[name])): 27 | xs.append(data['cumulative'][x]) 28 | ys.append(data[name][x]) 29 | pyplot.plot(xs, ys, label=name) 30 | #pyplot.scatter(xs, ys, label=name) 31 | pyplot.xlabel('cumulative # of records inserted') 32 | pyplot.ylabel('seconds per 10k inserts') 33 | pyplot.legend(loc=2) 34 | if opts.title: 35 | pyplot.title(opts.title) 36 | 37 | pyplot.savefig(opts.output, format='png', dpi=120) 38 | 39 | if __name__ == '__main__': 40 | parser = optparse.OptionParser() 41 | parser.add_option('-t', '--title', default=None, help='the title to use') 42 | parser.add_option('-o', '--output', default='graph.png', help='what file to output to') 43 | opts, args = parser.parse_args() 44 | if len(args) != 1: 45 | parser.error('must specify an input file') 46 | main(args[0], opts) 47 | -------------------------------------------------------------------------------- /schemaless/__init__.py: -------------------------------------------------------------------------------- 1 | from schemaless.guid import * 2 | from schemaless.column import Entity, c 3 | from schemaless.index import Index 4 | from schemaless.datastore import DataStore 5 | from schemaless.batch import IndexUpdater, main 6 | -------------------------------------------------------------------------------- /schemaless/batch.py: -------------------------------------------------------------------------------- 1 | """Module to assist in writing index updating batches. 2 | 3 | Here's an example of a really simple batch that adds a user_id index. 
4 | 5 | ---------------------------------------------- 6 | 7 | import schemaless 8 | 9 | class AddUserIdIndex(schemaless.IndexUpdater): 10 | 11 | def initialize(self): 12 | super(AddUserIdIndex, self).initialize() 13 | self.datastore = schemaless.DataStore(mysql_shards=['localhost:3306'], 14 | user='test', password='test', database='test') 15 | self.conn = self.datastore.connection 16 | 17 | def process_row(self, row, entity): 18 | if entity.get('user_id'): 19 | self.conn.execute('INSERT IGNORE INTO index_user_id (entity_id, user_id) VALUES (%s, %s)', 20 | schemaless.to_raw(entity.id), entity.user_id) 21 | 22 | if __name__ == '__main__': 23 | AddUserIdIndex().start() 24 | """ 25 | import time 26 | import logging 27 | import optparse 28 | 29 | from schemaless.column import Entity 30 | from schemaless.log import ClassLogger 31 | 32 | class IndexUpdater(object): 33 | """Class that implements a simple batch for updating indexes. This is meant 34 | to be a base class which is subclassed by the user, with the subclass doing 35 | whatever work is actually required to add the new index. Note that this 36 | batch is also appropriate for deleting data, or doing any other operation 37 | which requires iterating over a database table. 38 | 39 | At the very minimum you must implement your own process_row method. 40 | """ 41 | 42 | log = ClassLogger() 43 | use_zlib = True 44 | 45 | def __init__(self): 46 | self.parser = optparse.OptionParser() 47 | self.parser.add_option('--start-added-id', dest='start_added_id', type='int', default=0, help='Which added_id to start at') 48 | self.parser.add_option('--batch-size', dest='batch_size', type='int', default=100, help='How many rows to process at a time') 49 | 50 | def initialize(self): 51 | self.rows_processed = 0 52 | self.start_run = time.time() 53 | self.last_id_processed = self.opts.start_added_id 54 | self.configure_logging() 55 | 56 | def configure_logging(self): 57 | logging.basicConfig(level=logging.DEBUG) 58 | 59 | def start(self): 60 | self.opts, self.args = self.parser.parse_args() 61 | self.initialize() 62 | self.run() 63 | 64 | def row_iterator(self): 65 | conn = self.datastore.connection 66 | conn.execute('SET AUTOCOMMIT=1') 67 | next_row = self.opts.start_added_id 68 | while True: 69 | rows = conn.query('SELECT * FROM entities WHERE added_id >= %s ORDER BY added_id ASC LIMIT %s', next_row, self.opts.batch_size) 70 | if rows: 71 | for row in rows: 72 | yield row 73 | next_row = row['added_id'] + 1 74 | else: 75 | break 76 | 77 | def process_row(self, row, entity): 78 | """Every subclass must implement this method at a minimum. The function 79 | takes two arguments, the raw row returned by MySQL, and an entity object 80 | representing the BLOB data stored in the row. 
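The entity argument behaves like a dict with attribute access (see
        schemaless.column.Entity), so entity.get('user_id') and entity.user_id
        refer to the same value.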
81 |         """
82 |         raise NotImplementedError
83 |
84 |     def run(self):
85 |         rows_processed = 0
86 |         self.log.info('starting run loop')
87 |         try:
88 |             for row in self.row_iterator():
89 |                 entity = Entity.from_row(row, use_zlib=self.use_zlib)
90 |                 self.process_row(row, entity)
91 |                 self.rows_processed += 1
92 |                 self.last_id_processed = row['added_id']
93 |         except:
94 |             self.log.exception('exception during run loop!')
95 |         finally:
96 |             elapsed_time = time.time() - self.start_run
97 |             self.log.info('finished run loop, elapsed time = %1.2f seconds, processed %d rows, last added_id was %d' % (elapsed_time, self.rows_processed, self.last_id_processed))
98 |
99 | def main(batch_cls):
100 |     batch_instance = batch_cls()
101 |     batch_instance.start()
--------------------------------------------------------------------------------
/schemaless/column.py:
--------------------------------------------------------------------------------
1 | import simplejson
2 | import zlib
3 | from schemaless.guid import guid
4 |
5 | class Entity(dict):
6 |
7 |     @classmethod
8 |     def new(cls):
9 |         return Entity(id=guid())
10 |
11 |     @classmethod
12 |     def from_row(cls, row, use_zlib=False):
13 |         body = row['body']
14 |         if use_zlib:
15 |             body = zlib.decompress(body)
16 |         d = simplejson.loads(body)
17 |         d['id'] = row['id'].encode('hex')
18 |         d['updated'] = row['updated']
19 |         return cls(d)
20 |
21 |     # hasattr() works via __getattr__ below, which raises AttributeError for
22 |     # missing keys; no special __hasattr__ protocol exists or is needed
23 |     def __getattr__(self, name):
24 |         try:
25 |             return self[name]
26 |         except KeyError:
27 |             raise AttributeError(name)
28 |
29 |     def __setattr__(self, name, val):
30 |         self[name] = val
31 |
32 |     def __str__(self):
33 |         return str(dict(self.items()))
34 |
35 | class Column(object):
36 |
37 |     def __init__(self, name):
38 |         self.name = name
39 |
40 |     def to_string(self):
41 |         return self.name
42 |
43 |     __str__ = to_string
44 |
45 |     def __lt__(self, val):
46 |         return ColumnExpression(self.name, ColumnExpression.OP_LT, val)
47 |
48 |     def __le__(self, val):
49 |         return ColumnExpression(self.name, ColumnExpression.OP_LE, val)
50 |
51 |     def __eq__(self, val):
52 |         return ColumnExpression(self.name, ColumnExpression.OP_EQ, val)
53 |
54 |     def __ne__(self, val):
55 |         return ColumnExpression(self.name, ColumnExpression.OP_NE, val)
56 |
57 |     def __gt__(self, val):
58 |         return ColumnExpression(self.name, ColumnExpression.OP_GT, val)
59 |
60 |     def __ge__(self, val):
61 |         return ColumnExpression(self.name, ColumnExpression.OP_GE, val)
62 |
63 |     def in_(self, vals):
64 |         return ColumnExpression(self.name, ColumnExpression.OP_IN, vals)
65 |
66 | class ColumnExpression(object):
67 |
68 |     OP_LT = 1
69 |     OP_LE = 2
70 |     OP_EQ = 3
71 |     OP_NE = 4
72 |     OP_GT = 5
73 |     OP_GE = 6
74 |     OP_IN = 7
75 |
76 |     def __init__(self, name, op, rhs):
77 |         self.name = name
78 |         self.op = op
79 |         self.rhs = rhs
80 |
81 |     def build(self):
82 |         if self.op == self.OP_LT:
83 |             return self.name + ' < %s', [self.rhs]
84 |         elif self.op == self.OP_LE:
85 |             return self.name + ' <= %s', [self.rhs]
86 |         elif self.op == self.OP_EQ:
87 |             if self.rhs is None:
88 |                 return '%s IS NULL' % self.name, []
89 |             else:
90 |                 return (self.name + ' = %s'), [self.rhs]
91 |         elif self.op == self.OP_NE:
92 |             if self.rhs is None:
93 |                 return '%s IS NOT NULL' % self.name, []
94 |             else:
95 |                 return (self.name + ' != %s'), [self.rhs]
96 |         elif self.op == self.OP_GT:
97 |             return (self.name + ' > %s'), [self.rhs]
98 |         elif self.op == self.OP_GE:
99 |             return (self.name + ' >= %s'), [self.rhs]
100 |         elif self.op == self.OP_IN:
101 |             sql = self.name + ' IN ('
+ ', '.join('%s' for x in self.rhs) + ')' 102 | return sql, self.rhs 103 | else: 104 | raise ValueError('Unknown operator') 105 | 106 | def check(self, val): 107 | val = val[self.name] 108 | if self.op == self.OP_LT: 109 | return val < self.rhs 110 | elif self.op == self.OP_LE: 111 | return val <= self.rhs 112 | elif self.op == self.OP_EQ: 113 | return val == self.rhs 114 | elif self.op == self.OP_NE: 115 | return val != self.rhs 116 | elif self.op == self.OP_GT: 117 | return val > self.rhs 118 | elif self.op == self.OP_GE: 119 | return val >= self.rhs 120 | elif self.op == self.OP_IN: 121 | return val in self.rhs 122 | else: 123 | raise ValueError('Unknown operator') 124 | 125 | def __str__(self): 126 | return '%s(name=%r, op=%d, rhs=%r)' % (self.__class__.__name__, self.name, self.op, self.rhs) 127 | __repr__ = __str__ 128 | 129 | class ColumnBuilder(object): 130 | 131 | def __init__(self): 132 | self._columns = {} 133 | 134 | def __getattr__(self, name): 135 | if name not in self._columns: 136 | self._columns[name] = Column(name) 137 | return self._columns[name] 138 | 139 | c = ColumnBuilder() 140 | -------------------------------------------------------------------------------- /schemaless/datastore.py: -------------------------------------------------------------------------------- 1 | import time 2 | import simplejson 3 | import zlib 4 | 5 | import tornado.database 6 | 7 | from schemaless.column import Entity 8 | from schemaless.index import Index 9 | from schemaless.guid import raw_guid 10 | from schemaless.log import ClassLogger 11 | 12 | class DataStore(object): 13 | 14 | log = ClassLogger() 15 | 16 | def __init__(self, mysql_shards=[], user=None, database=None, password=None, use_zlib=True, indexes=[], create_entities=True): 17 | if not mysql_shards: 18 | raise ValueError('Must specify at least one MySQL shard') 19 | if len(mysql_shards) > 1: 20 | raise NotImplementedError 21 | self.use_zlib = use_zlib 22 | self.indexes = [Index('entities', ['tag'])] 23 | self.connection = tornado.database.Connection(host=mysql_shards[0], user=user, password=password, database=database) 24 | if create_entities and not self.check_table_exists('entities'): 25 | self.create_entities_table() 26 | 27 | @property 28 | def tag_index(self): 29 | return self.indexes[0] 30 | 31 | def define_index(self, table, properties=[], match_on={}, shard_on=None): 32 | idx = Index(table=table, properties=properties, match_on=match_on, shard_on=shard_on, connection=self.connection, use_zlib=self.use_zlib) 33 | self.indexes.append(idx) 34 | return idx 35 | 36 | def _find_indexes(self, entity, include_entities=False): 37 | """Find all of the indexes that may index an entity, based on the keys 38 | in the entity. 
39 | """ 40 | keys = frozenset(entity.keys()) 41 | for idx in self.indexes: 42 | if idx.matches(entity, keys): 43 | if idx.table != 'entities': 44 | yield idx 45 | elif include_entities: 46 | yield idx 47 | 48 | def put(self, entity, tag=None): 49 | is_update = False 50 | entity['updated'] = time.time() 51 | entity_id = None 52 | 53 | entity_copy = entity.copy() 54 | 55 | # get the entity_id (or create a new one) 56 | entity_id = entity_copy.pop('id', None) 57 | if entity_id is None: 58 | entity_id = raw_guid() 59 | else: 60 | is_update = True 61 | if len(entity_id) != 16: 62 | entity_id = entity_id.decode('hex') 63 | body = simplejson.dumps(entity_copy) 64 | if self.use_zlib: 65 | body = zlib.compress(body, 1) 66 | 67 | if is_update: 68 | self._put_update(entity_id, entity_copy, body) 69 | return entity 70 | else: 71 | return self._put_new(entity_id, entity_copy, tag, body) 72 | 73 | def _insert_index(self, index, entity_id, entity): 74 | pnames = ['entity_id'] 75 | vals = [entity_id] 76 | for p in index.properties: 77 | pnames.append(p) 78 | vals.append(entity[p]) 79 | 80 | q = 'INSERT INTO %s (%s) VALUES (' % (index.table, ', '.join(pnames)) 81 | q += ', '.join('%s' for x in pnames) 82 | q += ')' 83 | try: 84 | self.connection.execute(q, *vals) 85 | except tornado.database.OperationalError: 86 | self.log.exception('query = %s, vals = %s' % (q, vals)) 87 | raise 88 | 89 | def _update_index(self, index, entity_id, entity): 90 | row = self.connection.get('SELECT * FROM %s WHERE entity_id = %%s' % (index.table,), entity_id) 91 | if row: 92 | vals = [] 93 | q = 'UPDATE %s SET ' % index.table 94 | qs = [] 95 | for p in index.properties: 96 | qs.append('%s = %%s' % p) 97 | vals.append(entity[p]) 98 | q += ', '.join(qs) 99 | q += ' WHERE entity_id = %s' 100 | vals.append(entity_id) 101 | self.connection.execute(q, *vals) 102 | else: 103 | self._insert_index(index, entity_id, entity) 104 | 105 | def _put_new(self, entity_id, entity, tag, body): 106 | pk = self.connection.execute('INSERT INTO entities (id, updated, tag, body) VALUES (%s, FROM_UNIXTIME(%s), %s, %s)', entity_id, int(entity['updated']), tag, body) 107 | for idx in self._find_indexes(entity): 108 | self._insert_index(idx, entity_id, entity) 109 | return self.by_id(entity_id) 110 | 111 | def _put_update(self, entity_id, entity, body): 112 | self.connection.execute('UPDATE entities SET updated = CURRENT_TIMESTAMP, body = %s WHERE id = %s', body, entity_id) 113 | for idx in self._find_indexes(entity): 114 | self._update_index(idx, entity_id, entity) 115 | 116 | def delete(self, entity=None, id=None): 117 | if entity is None and id is None: 118 | raise ValueError('Must provide delete with an entity and an id') 119 | if entity and 'id' not in entity: 120 | raise ValueError('Cannot provide an entity without an id') 121 | if not entity: 122 | entity = self.by_id(id) 123 | if not entity: 124 | return 0 125 | entity_id = entity['id'].decode('hex') 126 | 127 | def _delete(table_name): 128 | col = 'id' if table_name == 'entities' else 'entity_id' 129 | return int(bool(self.connection.execute('DELETE FROM %s WHERE %s = %%s' % (table_name, col), entity_id))) 130 | 131 | deleted = 0 132 | seen_entities = False 133 | for idx in self._find_indexes(entity): 134 | if idx.table == 'entities': 135 | seen_entities = True 136 | deleted += _delete(idx.table) 137 | if not seen_entities: 138 | deleted += _delete('entities') 139 | return deleted 140 | 141 | def by_id(self, id): 142 | if len(id) == 32: 143 | id = id.decode('hex') 144 | row = 
self.connection.get('SELECT * FROM entities WHERE id = %s', id)
145 |         return Entity.from_row(row, use_zlib=self.use_zlib) if row else None
146 |
147 |     def check_table_exists(self, table_name):
148 |         row = self.connection.get('SELECT COUNT(*) AS tbl_count FROM information_schema.tables WHERE table_schema = DATABASE() AND table_name = %s', table_name)
149 |         return bool(row['tbl_count'])
150 |
151 |     def create_entities_table(self):
152 |         self.connection.execute("""
153 | CREATE TABLE entities (
154 |     added_id INTEGER NOT NULL AUTO_INCREMENT,
155 |     id BINARY(16) NOT NULL,
156 |     updated TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
157 |     tag MEDIUMINT,
158 |     body MEDIUMBLOB NOT NULL,
159 |     PRIMARY KEY (added_id),
160 |     UNIQUE KEY (id),
161 |     KEY (updated)
162 | ) ENGINE=InnoDB""")
--------------------------------------------------------------------------------
/schemaless/guid.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | GUID_SIZE = 16
4 |
5 | def raw_guid(size=GUID_SIZE):
6 |     return os.urandom(size)
7 |
8 | def guid(size=GUID_SIZE):
9 |     return raw_guid(size=size).encode('hex')
10 |
11 | def to_raw(s):
12 |     return s.decode('hex')
13 |
14 | def to_str(r):
15 |     return r.encode('hex')
--------------------------------------------------------------------------------
/schemaless/index.py:
--------------------------------------------------------------------------------
1 | from schemaless.column import ColumnExpression, Entity
2 |
3 | class Order(object):
4 |
5 |     def __init__(self, name, asc=False, desc=False):
6 |         self.name = name
7 |         self.order = 'ASC' if asc else 'DESC'
8 |
9 | def reduce_args(*exprs, **kwargs):
10 |     limit = kwargs.pop('limit', None)
11 |     order_by = kwargs.pop('order_by', None)
12 |     asc = kwargs.pop('asc', False)
13 |     desc = kwargs.pop('desc', False)
14 |     if asc and desc:
15 |         raise ValueError('Cannot specify both asc=True and desc=True')
16 |     if order_by:
17 |         if not (asc or desc):
18 |             asc = True
19 |         order_by = Order(order_by, asc=asc, desc=desc)
20 |
21 |     exprs = list(exprs)
22 |     for k, v in kwargs.iteritems():
23 |         exprs.append(ColumnExpression(k, ColumnExpression.OP_EQ, v))
24 |
25 |     # if it's just an order_by, check for the order_by column not null
26 |     #if order_by and not exprs:
27 |     #    exprs.append(ColumnExpression(order_by.name, ColumnExpression.OP_NE, None))
28 |
29 |     if not (order_by or exprs):
30 |         raise ValueError('Must provide args/kwargs for a WHERE clause')
31 |     return exprs, order_by, limit
32 |
33 | class Index(object):
34 |
35 |     def __init__(self, table, properties=[], match_on={}, shard_on=None, connection=None, use_zlib=True):
36 |         if shard_on is not None:
37 |             raise NotImplementedError
38 |         bad_properties = [p for p in properties if ',' in p]
39 |         if bad_properties:
40 |             raise ValueError('Bad property name(s): %r' % (bad_properties,))
41 |
42 |         self.table = table
43 |         self.properties = frozenset(properties)
44 |         self.match_on = match_on
45 |         self.connection = connection
46 |         self.use_zlib = use_zlib
47 |
48 |     def __str__(self):
49 |         return '%s(table=%s, properties=%s, match_on=%s)' % (self.__class__.__name__, self.table, self.properties, self.match_on)
50 |     __repr__ = __str__
51 |
52 |     def __cmp__(self, other):
53 |         return cmp(self.table, other.table)
54 |
55 |     def matches(self, entity, keys):
56 |         if not (self.properties <= keys):
57 |             return False
58 |         for k, v in self.match_on.iteritems():
59 |             if entity.get(k) != v:
60 |                 return False
61 |         return True
62 |
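    # For ordinary index tables, _do_query runs the WHERE clause against the
    # index table to collect entity_ids, then fetches the matching rows from
    # entities and inflates them into Entity objects; for the special
    # 'entities' index the query runs against the entities table directly.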
63 |     def _query(self, *exprs, **kwargs):
64 |         exprs, order_by, limit = reduce_args(*exprs, **kwargs)
65 |         return self._do_query(exprs, order_by, limit)
66 |
67 |     def _do_query(self, exprs, order_by, limit):
68 |         values = []
69 |         where_clause = []
70 |         for e in exprs:
71 |             if e.name not in self.properties:
72 |                 raise ValueError('This index has no column named %r' % (e.name,))
73 |             expr_string, vals = e.build()
74 |             where_clause.append(expr_string)
75 |             values.extend(vals)
76 |
77 |         if self.table == 'entities':
78 |             # XXX: this is a bit hacky
79 |             q = 'SELECT * FROM entities WHERE ' + ' AND '.join(where_clause)
80 |             if order_by:
81 |                 q += ' ORDER BY %s %s' % (order_by.name, order_by.order)
82 |             if limit:
83 |                 q += ' LIMIT %d' % (limit,)
84 |             entity_rows = self.connection.query(q, *values)
85 |         else:
86 |             q = 'SELECT entity_id FROM %s' % self.table
87 |             if where_clause:
88 |                 q += ' WHERE ' + ' AND '.join(where_clause)
89 |             if order_by:
90 |                 q += ' ORDER BY %s %s' % (order_by.name, order_by.order)
91 |             if limit:
92 |                 q += ' LIMIT %d' % (limit,)
93 |
94 |             rows = self.connection.query(q, *values)
95 |             if rows:
96 |                 entity_ids = [r['entity_id'] for r in rows]
97 |                 q = 'SELECT * FROM entities WHERE id IN ('
98 |                 q += ', '.join('%s' for x in rows)
99 |                 q += ')'
100 |                 entity_rows = self.connection.query(q, *entity_ids)
101 |             else:
102 |                 return []
103 |
104 |         if not order_by:
105 |             #sorted_entities = sorted(entity_rows, key=lambda x: x['updated'], reverse=True)
106 |             sorted_entities = sorted(entity_rows, key=lambda x: x['updated'])
107 |         else:
108 |             # XXX: this is O(n^2), bad
109 |             sorted_entities = []
110 |             for row_id in (row['entity_id'] for row in rows):
111 |                 for e in entity_rows:
112 |                     if e['id'] == row_id:
113 |                         sorted_entities.append(e)
114 |                         break
115 |                 else:
116 |                     assert False
117 |
118 |         return [Entity.from_row(row, use_zlib=self.use_zlib) for row in sorted_entities]
119 |
120 |     def get(self, *exprs, **kwargs):
121 |         kwargs['limit'] = 1
122 |         rows = self._query(*exprs, **kwargs)
123 |         if len(rows) == 0:
124 |             return None
125 |         elif len(rows) == 1:
126 |             return rows[0]
127 |         else:
128 |             assert False
129 |
130 |     def query(self, *exprs, **kwargs):
131 |         return self._query(*exprs, **kwargs)
132 |
133 |     def all(self):
134 |         # bypass reduce_args/_query, which require a WHERE clause; an
135 |         # unrestricted SELECT over the index table is what we want here
136 |         return self._do_query([], None, None)
--------------------------------------------------------------------------------
/schemaless/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | class ClassLogger(object):
5 |
6 |     def __get__(self, obj, obj_type=None):
7 |         object_class = obj_type or obj.__class__
8 |         return logging.getLogger(object_class.__module__ + '.'
+ object_class.__name__) 9 | 10 | formatter = logging.Formatter('%(asctime)s :: %(name)s (%(levelname)s) :: %(message)s') 11 | logger = logging.getLogger('schemaless') 12 | -------------------------------------------------------------------------------- /schemaless/orm/__init__.py: -------------------------------------------------------------------------------- 1 | from session import Session 2 | from index import Index 3 | from column import * 4 | from document import make_base 5 | import converters 6 | -------------------------------------------------------------------------------- /schemaless/orm/column.py: -------------------------------------------------------------------------------- 1 | import schemaless.orm.converters 2 | 3 | DEFAULT_NONCE = Ellipsis 4 | 5 | class Column(object): 6 | 7 | def __init__(self, name, default=DEFAULT_NONCE, required=False, convert=None): 8 | self.name = name 9 | self.default = default 10 | self.required = required 11 | self.convert = convert 12 | 13 | def to_string(self): 14 | return 'COLUMN' 15 | 16 | def __str__(self): 17 | s = '`%s` %s' % (self.name, self.to_string()) 18 | if self.required: 19 | s += ' NOT NULL' 20 | return s 21 | 22 | class Char(Column): 23 | 24 | def __init__(self, name, length, **kwargs): 25 | super(Char, self).__init__(name, **kwargs) 26 | self.length = length 27 | 28 | def to_string(self): 29 | return 'CHAR(%d)' % (self.length,) 30 | 31 | class Binary(Column): 32 | 33 | def __init__(self, name, length, **kwargs): 34 | super(Binary, self).__init__(name, **kwargs) 35 | self.length = length 36 | 37 | def to_string(self): 38 | return 'BINARY(%d)' % (self.length,) 39 | 40 | class String(Column): 41 | def __init__(self, name, length, **kwargs): 42 | super(String, self).__init__(name, **kwargs) 43 | self.length = length 44 | 45 | def to_string(self): 46 | return 'VARCHAR(%d)' % (self.length,) 47 | 48 | class Text(Column): 49 | 50 | def to_string(self): 51 | return 'TEXT' 52 | 53 | class DateTime(Column): 54 | 55 | def __init__(self, name, **kwargs): 56 | if not kwargs.get('convert'): 57 | kwargs['convert'] = schemaless.orm.converters.DateTimeConverter 58 | super(DateTime, self).__init__(name, **kwargs) 59 | 60 | def to_string(self): 61 | return 'INTEGER UNSIGNED' 62 | 63 | class Guid(Char): 64 | 65 | def __init__(self, name, **kwargs): 66 | super(Guid, self).__init__(name, 32, **kwargs) 67 | UUID = GUID = Guid 68 | 69 | class Bool(Column): 70 | 71 | def __init__(self, name, **kwargs): 72 | if not kwargs.get('convert'): 73 | kwargs['convert'] = schemaless.orm.converters.BooleanConverter 74 | super(Bool, self).__init__(name, **kwargs) 75 | 76 | def to_string(self): 77 | return 'BOOL' 78 | 79 | Bit = Boolean = Bool 80 | -------------------------------------------------------------------------------- /schemaless/orm/converters.py: -------------------------------------------------------------------------------- 1 | import time 2 | import datetime 3 | 4 | class Converter(object): 5 | 6 | @classmethod 7 | def to_db(cls, obj): 8 | raise NotImplementedError 9 | 10 | @classmethod 11 | def from_db(cls, val): 12 | raise NotImplementedError 13 | 14 | class DateTimeConverter(Converter): 15 | 16 | @classmethod 17 | def to_db(cls, obj): 18 | return time.mktime(obj.timetuple()) if obj else None 19 | 20 | @classmethod 21 | def from_db(cls, val): 22 | return datetime.datetime.fromtimestamp(val) if val else None 23 | 24 | class BooleanConverter(Converter): 25 | 26 | @classmethod 27 | def to_db(cls, obj): 28 | return 1 if obj else 0 29 | 30 | @classmethod 
31 |     def from_db(cls, val):
32 |         return bool(val)
--------------------------------------------------------------------------------
/schemaless/orm/document.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | from collections import defaultdict
3 | from index import IndexCollection
4 | from schemaless.index import reduce_args
5 | from schemaless.log import ClassLogger
6 | from schemaless.orm.util import is_type_list
7 | from schemaless.orm.index import Index
8 | from schemaless.orm.column import Column, DEFAULT_NONCE
9 | from schemaless import c
10 |
11 | def _collect_fields(x):
12 |     return set((k, v) for k, v in x.__dict__.iteritems() if k != 'tag' and not k.startswith('_') and not callable(v))
13 |
14 | def make_base(session, meta_base=type, base_cls=object, tags_file=None, tags_db=None):
15 |     """Create a base class for ORM documents.
16 |
17 |     meta_base -- the base class for the metaclass
18 |     base_cls -- the base class for the document class
19 |     tags_file -- the path of a YAML file containing tag declarations
20 |     tags_db -- an explicit mapping (as a dict) of tag declarations
21 |     """
22 |
23 |     # tags that have been registered
24 |     tags = set()
25 |
26 |     tags_db = tags_db or {}
27 |     assert len(set(tags_db.values())) == len(tags_db), 'tag values must be unique'
28 |
29 |     if not tags_db and tags_file is not None:
30 |         yaml_cfg = yaml.load(open(tags_file, 'r').read())
31 |         tags_db.update(yaml_cfg)
32 |
33 |     class metacls(meta_base):
34 |
35 |         def __new__(mcs, name, bases, cls_dict):
36 |
37 |             if 'tag' not in cls_dict and name in tags_db:
38 |                 cls_dict['tag'] = tags_db[name]
39 |
40 |             if 'tag' in cls_dict:
41 |                 if cls_dict['tag'] in tags:
42 |                     raise TypeError('Tag %r has already been defined' % (cls_dict['tag'],))
43 |                 tags.add(cls_dict['tag'])
44 |
45 |             s = set()
46 |             for b in bases:
47 |                 s |= set(getattr(b, '_columns', set()))
48 |             s |= set(cls_dict.get('_columns', set()))
49 |             for x in s:
50 |                 if not isinstance(x, Column):
51 |                     raise TypeError('Got unexpected %r instead of Column' % (x,))
52 |
53 |             cls_dict['_columns'] = s
54 |             cls_dict['_column_map'] = dict((c.name, c) for c in s)
55 |             cls_dict['_column_names'] = frozenset(c.name for c in s)
56 |             cls_dict['_required_columns'] = frozenset(c.name for c in s if c.required)
57 |
58 |             if '_abstract' not in cls_dict:
59 |                 cls_dict.setdefault('_indexes', [])
60 |                 tag_index = Index('entities', ['tag'])
61 |                 tag_index.underlying = session.datastore.tag_index
62 |                 indexes = [tag_index]
63 |                 for idx in cls_dict.get('_indexes', []):
64 |                     if isinstance(idx, Index):
65 |                         indexes.append(idx)
66 |                     elif cls_dict.get('tag') and is_type_list(basestring, idx):
67 |                         cols = [cls_dict['_column_map'][name] for name in idx]
68 |                         indexes.append(Index.automatic(cls_dict['tag'], cols, session.datastore, declare=False))
69 |                     else:
70 |                         raise ValueError("Sorry, I don't know how to make an index for %s from %r" % (name, idx))
71 |                 cls_dict['_indexes'] = indexes
72 |                 cls_dict['_schemaless_index_collection'] = IndexCollection(indexes)
73 |                 for idx in indexes:
74 |                     idx.declare(session.datastore, tag=cls_dict['tag'])
75 |
76 |             cls_dict['_session'] = session
77 |             return meta_base.__new__(mcs, name, bases, cls_dict)
78 |
79 |     class Document(base_cls):
80 |
81 |         __metaclass__ = metacls
82 |
83 |         _abstract = True
84 |         _columns = [Column('tag')]
85 |         _indexes = []
86 |         _id_field = None
87 |
88 |         log = ClassLogger()
89 |
90 |         def __init__(self, from_dict=None, is_dirty=True, **kwargs):
91 |
92 |             if base_cls is not object:
93 |                 super(Document,
self).__init__() 94 | 95 | if from_dict is None: 96 | from_dict = kwargs 97 | 98 | if hasattr(self, 'tag') and 'tag' in from_dict: 99 | if getattr(self, 'tag') != from_dict['tag']: 100 | raise TypeError('Inconsistent tag') 101 | 102 | # FIXME: ought to grab other attributes off the class dict as well 103 | self.__dict__['_schemaless_collected_fields'] = set(['tag']) 104 | self.__dict__['_schemaless_id'] = from_dict.get('id', None) 105 | 106 | for k, v in from_dict.iteritems(): 107 | if k in self._column_names: 108 | self.__dict__[k] = v 109 | self._schemaless_collected_fields.add(k) 110 | 111 | # Add default values 112 | dict_keys = from_dict.keys() 113 | for c in self._columns: 114 | if c.default != DEFAULT_NONCE and c.name not in dict_keys: 115 | if callable(c.default): 116 | v = c.default() 117 | else: 118 | v = c.default 119 | self.__dict__[c.name] = v 120 | self._schemaless_collected_fields.add(c.name) 121 | 122 | self._schemaless_dirty = is_dirty 123 | if self._schemaless_dirty and self._saveable(): 124 | self._session.dirty_documents.add(self) 125 | 126 | def _saveable(self): 127 | return self._schemaless_collected_fields >= self._required_columns 128 | 129 | def __setattr__(self, k, v): 130 | if k in self._column_names: 131 | self._schemaless_collected_fields.add(k) 132 | self._schemaless_dirty = True 133 | if self not in self._session.dirty_documents and self._saveable(): 134 | self._session.dirty_documents.add(self) 135 | super(Document, self).__setattr__(k, v) 136 | 137 | def __delattr__(self, k): 138 | try: 139 | self._schemaless_collected_fields.remove(k) 140 | except KeyError: 141 | pass 142 | super(Document, self).__delattr__(k) 143 | 144 | @property 145 | def is_dirty(self): 146 | return self._schemaless_dirty 147 | 148 | @classmethod 149 | def from_datastore(cls, d): 150 | if d['tag'] != cls.tag: 151 | raise ValueError('Expected item with tag %d, instead got item with tag %d' % (cls.tag, d['tag'])) 152 | missing = cls._required_columns - set(d.keys()) 153 | if missing: 154 | raise ValueError('Missing from %s the following keys: %s' % (d, ', '.join(k for k in sorted(missing)))) 155 | for k, v in d.iteritems(): 156 | c = cls._column_map.get(k) 157 | if c and c.convert: 158 | d[k] = c.convert.from_db(v) 159 | 160 | obj = cls(d, is_dirty=False) 161 | obj.updated = d['updated'] 162 | return obj 163 | 164 | def to_dict(self): 165 | d = {'id': self.id} 166 | for f in self._column_names: 167 | if f in self._required_columns: 168 | val = getattr(self, f) 169 | elif hasattr(self, f): 170 | val = getattr(self, f) 171 | else: 172 | continue 173 | if self._column_map[f].convert: 174 | val = self._column_map[f].convert.to_db(val) 175 | d[f] = val 176 | return d 177 | 178 | @property 179 | def id(self): 180 | return getattr(self, '_schemaless_id', None) 181 | 182 | def save(self, clear_session=True): 183 | if not self._saveable(): 184 | missing = self._required_columns - self._schemaless_collected_fields 185 | raise ValueError('This object is not yet saveable, missing: %s' % (', '.join(str(k) for k in missing),)) 186 | if self._schemaless_dirty: 187 | obj = self._session.datastore.put(self.to_dict(), self.tag) 188 | self.updated = obj['updated'] 189 | self._schemaless_id = obj['id'] 190 | self._schemaless_dirty = False 191 | if clear_session and self in self._session.dirty_documents: 192 | self._session.dirty_documents.remove(self) 193 | return self 194 | 195 | def delete(self, clear_session=True): 196 | if not self._saveable(): 197 | raise ValueError('This object is not yet saveable') 
198 | if not hasattr(self, '_schemaless_id'): 199 | raise ValueError('This object has no entity id (or has not been persisted)') 200 | self._session.datastore.delete(id=self._schemaless_id) 201 | if clear_session and self in self._session.dirty_documents: 202 | self._session.dirty_documents.remove(self) 203 | 204 | @classmethod 205 | def _query(cls, *exprs, **kwargs): 206 | exprs, order_by, limit = reduce_args(*exprs, **kwargs) 207 | columns = set(e.name for e in exprs) 208 | if order_by: 209 | columns.add(order_by.name) 210 | idx = cls._schemaless_index_collection.best_index(columns) 211 | cls._last_index_used = idx 212 | using = idx.field_set & columns 213 | 214 | if not using: 215 | raise ValueError('cannot do this query, no indexes can be used') 216 | 217 | query_exprs = [e for e in exprs if e.name in using] 218 | result = idx.underlying._do_query(query_exprs, order_by, limit) 219 | retained_result = [] 220 | for x in result: 221 | if all(e.check(x) for e in exprs): 222 | retained_result.append(cls.from_datastore(x)) 223 | return retained_result 224 | 225 | @classmethod 226 | def get(cls, *exprs, **kwargs): 227 | kwargs['limit'] = 1 228 | result = cls._query(*exprs, **kwargs) 229 | if len(result) == 0: 230 | return None 231 | elif len(result) == 1: 232 | return result[0] 233 | else: 234 | raise ValueError('Got more than one result') 235 | 236 | @classmethod 237 | def query(cls, *exprs, **kwargs): 238 | return cls._query(*exprs, **kwargs) 239 | 240 | @classmethod 241 | def all(cls): 242 | return cls._query(c.tag == cls.tag) 243 | 244 | @classmethod 245 | def by_id(cls, id): 246 | entity = cls._session.datastore.by_id(id) 247 | if not entity: 248 | return entity 249 | if entity.tag != cls.tag: 250 | raise ValueError('Entity had tag %r, our class has tag %r' % (entity.tag, cls.tag)) 251 | return cls.from_datastore(entity) 252 | 253 | def __eq__(self, other): 254 | return self.__class__ is type(other) and _collect_fields(self) == _collect_fields(other) 255 | 256 | return Document 257 | -------------------------------------------------------------------------------- /schemaless/orm/index.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import schemaless.index 3 | from schemaless.log import ClassLogger 4 | 5 | class Index(object): 6 | 7 | log = ClassLogger() 8 | 9 | def __init__(self, table_name, fields): 10 | self.table_name = table_name 11 | self.fields = fields 12 | self.field_set = frozenset(fields) 13 | self.underlying = None 14 | 15 | @classmethod 16 | def automatic(cls, tag, fields, datastore, declare=True): 17 | """This is an "internal" method for declaratively creating 18 | indexes. Arguments are like this: 19 | 20 | tag -- the tag of the document that this is being created for 21 | fields -- a list of typed Column objects like [Binary('foo', 16), VarChar('email', 255)] 22 | datastore -- a handle to the datastore 23 | 24 | A unique table name will be created using the tag and an md5 of the 25 | field names. The table will be created, if necessary. 
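For example, tag 3 with a single business_id column yields a table
        named index_00003_850f22a7c399fd1483275d62703d49de, as in the sample
        CREATE TABLE shown in the comment below.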
26 |         """
27 |
28 |         field_string = ', '.join('`%s`' % (f.name,) for f in fields)
29 |         field_hash = hashlib.md5(field_string).hexdigest()
30 |         table_name = 'index_%05d_%s' % (tag, field_hash)
31 |
32 |         if not datastore.check_table_exists(table_name):
33 |             cls.log.info('Creating %s' % (table_name,))
34 |             sql = ['CREATE TABLE %s (' % (table_name,)]
35 |             for f in fields:
36 |                 sql.append('  %s,' % (f,))
37 |             sql.append('  `entity_id` BINARY(16) NOT NULL,')
38 |             sql.append('  UNIQUE KEY (`entity_id`),')
39 |             sql.append('  PRIMARY KEY (%s, `entity_id`)' % (field_string,))
40 |             sql.append(') ENGINE=InnoDB')
41 |             sql = '\n'.join(sql)
42 |
43 |             # by this point, sql will contain a query like:
44 |             #
45 |             # CREATE TABLE index_00003_850f22a7c399fd1483275d62703d49de (
46 |             #   `business_id` BINARY(16) NOT NULL,
47 |             #   `entity_id` BINARY(16) NOT NULL,
48 |             #   UNIQUE KEY (`entity_id`),
49 |             #   PRIMARY KEY (`business_id`, `entity_id`)
50 |             # ) ENGINE=InnoDB
51 |             #
52 |             # XXX: no support for unique columns yet
53 |
54 |             # create the table
55 |             datastore.connection.execute(sql)
56 |
57 |         obj = cls(table_name, [f.name for f in fields])
58 |         if declare:
59 |             obj.declare(datastore, tag=tag)
60 |         return obj
61 |
62 |     def declare(self, datastore, tag=None):
63 |         match_on = {}
64 |         if tag is not None:
65 |             match_on = {'tag': tag}
66 |         self.underlying = datastore.define_index(self.table_name, self.fields, match_on=match_on)
67 |         return self.underlying
68 |
69 |     def __str__(self):
70 |         if self.underlying is None:
71 |             return '%s(%r, %s)' % (self.__class__.__name__, self.table_name, self.fields)
72 |         else:
73 |             return '%s(%s)' % (self.__class__.__name__, self.underlying)
74 |     __repr__ = __str__
75 |
76 | class IndexCollection(object):
77 |
78 |     log = ClassLogger()
79 |
80 |     def __init__(self, indexes):
81 |         self.indexes = indexes
82 |         self.answer_cache = {}
83 |
84 |     def best_index(self, fields):
85 |         """Given some collection of fields (e.g. ['user_id', 'first_name',
86 |         'last_name']) try to determine which index in the collection will match
87 |         the most fields.
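For example, given indexes on ('user_id',) and ('first_name',
        'last_name'), a query on first_name, last_name, and city picks the
        two-column name index; restrictions not covered by the chosen index
        are re-checked in Python by the caller (see Document._query).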
88 |         """
89 |         fields = frozenset(fields)
90 |         if fields in self.answer_cache:
91 |             return self.answer_cache[fields]
92 | 
93 |         # find the index that covers as many of the queried columns as
94 |         # possible, breaking ties in favor of the index with the fewest fields
95 |         best = (-1, 0, None)
96 |         for idx in self.indexes:
97 |             common = len(fields & idx.field_set)
98 |             val = (common, -len(idx.field_set), idx)
99 |             if val > best:
100 |                 best = val
101 | 
102 |         best = best[-1]
103 |         self.log.debug('from %s chose %s as best index for %s' % (self.indexes, best, fields))
104 |         self.answer_cache[fields] = best
105 |         return best
106 | 
--------------------------------------------------------------------------------
/schemaless/orm/session.py:
--------------------------------------------------------------------------------
1 | class Session(object):
2 | 
3 |     def __init__(self, datastore):
4 |         self.datastore = datastore
5 |         self.dirty_documents = set()
6 | 
7 |     def save(self):
8 |         for d in self.dirty_documents:
9 |             d.save(clear_session=False)
10 |         self.dirty_documents.clear()
11 | 
--------------------------------------------------------------------------------
/schemaless/orm/util.py:
--------------------------------------------------------------------------------
1 | def is_type_list(t, xs):
2 |     try:
3 |         return all(isinstance(x, t) for x in xs)
4 |     except TypeError:
5 |         return False
6 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | 
3 | setup(name='Schemaless',
4 |       version='0.2.1',
5 |       description='Schema-less MySQL pattern',
6 |       author='Evan Klitzke',
7 |       author_email='evan@eklitzke.org',
8 |       packages=['schemaless', 'schemaless.orm']
9 | )
10 | 
--------------------------------------------------------------------------------
/tests/tables.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS `index_birthdate` (
2 |   `entity_id` binary(16) NOT NULL,
3 |   `birthdate` varchar(64) NOT NULL DEFAULT '',
4 |   PRIMARY KEY (`birthdate`,`entity_id`)
5 | ) ENGINE=InnoDB;
6 | 
7 | CREATE TABLE IF NOT EXISTS `index_foo` (
8 |   `entity_id` binary(16) NOT NULL,
9 |   `bar` int(11) NOT NULL,
10 |   PRIMARY KEY (`bar`,`entity_id`),
11 |   UNIQUE KEY `entity_id` (`entity_id`)
12 | ) ENGINE=InnoDB;
13 | 
14 | CREATE TABLE IF NOT EXISTS `index_user_id` (
15 |   `entity_id` binary(16) NOT NULL,
16 |   `user_id` char(32) NOT NULL,
17 |   PRIMARY KEY (`user_id`,`entity_id`),
18 |   UNIQUE KEY `entity_id` (`entity_id`)
19 | ) ENGINE=InnoDB;
20 | 
21 | CREATE TABLE IF NOT EXISTS `index_user_name` (
22 |   `entity_id` binary(16) NOT NULL,
23 |   `first_name` varchar(255) NOT NULL,
24 |   `last_name` varchar(255) NOT NULL,
25 |   PRIMARY KEY (`first_name`,`last_name`,`entity_id`),
26 |   UNIQUE KEY `entity_id` (`entity_id`)
27 | ) ENGINE=InnoDB;
28 | 
29 | CREATE TABLE IF NOT EXISTS `index_todo_user_id` (
30 |   `entity_id` binary(16) NOT NULL,
31 |   `user_id` char(32) NOT NULL,
32 |   PRIMARY KEY (`user_id`,`entity_id`),
33 |   UNIQUE KEY `entity_id` (`entity_id`)
34 | ) ENGINE=InnoDB;
35 | 
--------------------------------------------------------------------------------
/tests/tests.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | import unittest
4 | 
5 | import schemaless
6 | from schemaless import orm
7 | from schemaless import c
8 | 
9 | class TestBase(unittest.TestCase):
10 |
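    # Note: these tests assume a MySQL server on localhost:3306 with a
    # 'test' database reachable as user 'test' / password 'test', and the
    # index tables from tests/tables.sql already created.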
11 |     def clear_tables(self, datastore):
12 |         tables = set()
13 |         for d in datastore.connection.query('SHOW TABLES'):
14 |             for v in d.itervalues():
15 |                 tables.add(v)
16 |         for tbl in tables:
17 |             datastore.connection.execute('DELETE FROM %s' % (tbl,))
18 | 
19 |     def assert_equal(self, a, b):
20 |         return self.assertEqual(a, b)
21 | 
22 |     def assert_len(self, a, b):
23 |         return self.assertEqual(a, len(b))
24 | 
25 |     def assert_used_index(self, document_cls, name):
26 |         index = getattr(document_cls, '_last_index_used', None)
27 |         if index is None:
28 |             self.fail('Expected to use index %s but no index was used' % (name,))
29 |         if index.table_name != name:
30 |             self.fail('Expected to use index %s but actually used %s' % (name, index.table_name))
31 | 
32 | class SchemalessTestCase(TestBase):
33 | 
34 |     def setUp(self):
35 |         super(SchemalessTestCase, self).setUp()
36 |         self.ds = schemaless.DataStore(mysql_shards=['localhost:3306'], user='test', password='test', database='test')
37 |         self.user = self.ds.define_index('index_user_id', ['user_id'])
38 |         self.user_name = self.ds.define_index('index_user_name', ['first_name', 'last_name'])
39 |         self.foo = self.ds.define_index('index_foo', ['bar'], {'m': 'right'})
40 |         self.clear_tables(self.ds)
41 | 
42 |         self.entity = self.ds.put({'user_id': schemaless.guid(), 'first_name': 'evan', 'last_name': 'klitzke'})
43 | 
44 |     def test_query(self):
45 |         self.assert_len(1, self.user.query(c.user_id == self.entity.user_id))
46 |         self.assert_len(1, self.user_name.query(c.first_name == 'evan', c.last_name == 'klitzke'))
47 | 
48 |         new_entity = self.ds.put({'user_id': schemaless.guid(), 'first_name': 'george'})
49 |         self.assert_len(1, self.user.query(c.user_id == new_entity.user_id))
50 |         self.assert_len(0, self.user_name.query(c.first_name == 'george'))  # didn't have a full index
51 | 
52 |     def test_delete_by_entity(self):
53 |         self.ds.delete(self.entity)
54 |         self.assert_len(0, self.user.query(c.user_id == self.entity.user_id))
55 | 
56 |     def test_delete_by_entity_id(self):
57 |         self.ds.delete(id=self.entity.id)
58 |         self.assert_len(0, self.user.query(c.user_id == self.entity.user_id))
59 | 
60 |     def test_match_on(self):
61 |         entity_one = self.ds.put({'foo_id': schemaless.guid(), 'bar': 1, 'm': 'left'})
62 |         entity_two = self.ds.put({'foo_id': schemaless.guid(), 'bar': 1, 'm': 'right'})  # only this should match
63 | 
64 |         rows = self.foo.query(c.bar == 1)
65 |         self.assert_len(1, rows)
66 |         self.assert_equal(rows[0].foo_id, entity_two.foo_id)
67 | 
68 |     def test_in_queries(self):
69 |         user_ids = [self.entity.user_id]
70 |         user_ids.append(self.ds.put({'user_id': schemaless.guid()}).user_id)
71 | 
72 |         rows = self.user.query(c.user_id.in_(user_ids))
73 |         self.assert_len(2, rows)
74 |         self.assert_equal(set(user_ids), set(row['user_id'] for row in rows))
75 | 
76 | class ORMTestCase(TestBase):
77 |     def setUp(self):
78 |         datastore = schemaless.DataStore(mysql_shards=['localhost:3306'], user='test', password='test', database='test')
79 |         self.clear_tables(datastore)
80 | 
81 |         tags_db = {
82 |             'User': 1,
83 |             'ToDo': 2,
84 |             'Business': 3}
85 | 
86 |         self.session = orm.Session(datastore)
87 |         self.base_class = orm.make_base(self.session, tags_db=tags_db)
88 | 
89 |     @property
90 |     def connection(self):
91 |         return self.session.datastore.connection
92 | 
93 |     def get_index_count(self, index_name):
94 |         row = self.connection.get('SELECT COUNT(*) AS count FROM %s' % (index_name,))
95 |         return row['count']
96 | 
97 | class SchemalessORMTestCase(ORMTestCase):
98 | 
99 |     def setUp(self):
100 |         super(SchemalessORMTestCase, self).setUp()
101 | 
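        # Declarative document definition: _columns lists the document's
        # fields (with optional defaults and converters) and _indexes lists
        # the index tables the ORM can use to query it.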
102 |         class User(self.base_class):
103 |             _columns = [orm.Column('user_id', required=True),
104 |                         orm.Column('first_name', required=True),
105 |                         orm.Column('last_name', required=True),
106 |                         orm.Column('birthdate'),
107 |                         orm.Column('time_created', default=datetime.datetime.now, convert=schemaless.orm.converters.DateTimeConverter)]
108 |             _indexes = [orm.Index('index_user_id', ['user_id']),
109 |                         orm.Index('index_birthdate', ['birthdate']),
110 |                         orm.Index('index_user_name', ['first_name', 'last_name'])]
111 | 
112 |         self.User = User
113 | 
114 |     def test_create_object_save_delete(self):
115 |         # create a new, empty object
116 |         u = self.User()
117 |         assert not u._saveable()
118 |         assert u.is_dirty
119 | 
120 |         # populate some, but not all of the fields; the object should be dirty,
121 |         # but not saveable
122 |         u.user_id = schemaless.guid()
123 |         u.first_name = 'evan'
124 |         assert not u._saveable()
125 |         assert u.is_dirty
126 |         user = self.User.get(c.user_id == u.user_id)
127 |         assert not user
128 | 
129 |         # finish populating the fields, check that the object is saveable
130 |         u.last_name = 'klitzke'
131 |         assert u._saveable()
132 |         assert u.is_dirty
133 | 
134 |         # persist the object, check that it made it to the datastore
135 |         u.save()
136 |         assert u._saveable()
137 |         assert not u.is_dirty
138 |         user = self.User.get(c.user_id == u.user_id)
139 |         assert user
140 | 
141 |         # delete the object, check that it's deleted from the datastore
142 |         u.delete()
143 |         assert u._saveable()
144 |         assert not u.is_dirty
145 |         user = self.User.get(c.user_id == u.user_id)
146 |         assert not user
147 | 
148 |     def test_in_query(self):
149 |         user_ids = []
150 |         users = []
151 |         for x in range(5):
152 |             u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
153 |             user_ids.append(u.user_id)
154 |             users.append(u)
155 |         self.session.save()
156 | 
157 |         fetched_users = self.User.query(c.user_id.in_(user_ids[:3]))
158 |         self.assert_used_index(self.User, 'index_user_id')
159 |         self.assert_equal(set(user_ids[:3]), set(u.user_id for u in fetched_users))
160 | 
161 |     def test_name_query(self):
162 |         u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
163 |         u.save()
164 |         v = self.User.get(c.first_name == 'foo', c.last_name == 'bar')
165 |         self.assert_used_index(self.User, 'index_user_name')
166 |         self.assert_equal(u.user_id, v.user_id)
167 | 
168 |     def test_update(self):
169 |         u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
170 |         u.save()
171 |         v = self.User.get(c.first_name == 'foo', c.last_name == 'bar')
172 |         self.assert_used_index(self.User, 'index_user_name')
173 |         self.assert_equal(u.id, v.id)
174 |         self.assert_equal(u.user_id, v.user_id)
175 | 
176 |         u.first_name = 'baz'
177 |         u.save()
178 |         v = self.User.get(c.first_name == 'foo', c.last_name == 'bar')
179 |         self.assert_used_index(self.User, 'index_user_name')
180 |         self.assert_equal(None, v)
181 |         v = self.User.get(c.first_name == 'baz', c.last_name == 'bar')
182 |         self.assert_used_index(self.User, 'index_user_name')
183 |         self.assert_equal(u.id, v.id)
184 |         self.assert_equal(u.user_id, v.user_id)
185 | 
186 |     def test_double_delete(self):
187 |         u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
188 |         u.save()
189 |         u.delete()
190 |         u.delete()
191 | 
192 |     def test_update_preserves_id(self):
193 |         u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
194 |         u.save()
195 |         orig_id = u.id
196 | 
197 |         u.first_name = 'baz'
198 |         u.save()
199 |         self.assert_equal(orig_id, u.id)
200 | 
201 |     def test_converter(self):
202 |         u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
203 |         u.save()
204 |         self.assert_(isinstance(u.time_created, datetime.datetime))
205 | 
206 |         v = self.User.get(c.user_id == u.user_id)
207 |         self.assert_(isinstance(v.time_created, datetime.datetime))
208 | 
209 |     def test_index_update(self):
210 |         u = self.User(user_id=schemaless.guid(), first_name='evan', last_name='klitzke')
211 |         u.save()
212 | 
213 |         self.assert_equal(self.get_index_count('index_birthdate'), 0)
214 | 
215 |         u.birthdate = '1986-09-19'
216 |         u.save()
217 |         self.assert_equal(self.get_index_count('index_birthdate'), 1)
218 | 
219 | class ManyToOneORMTestCase(ORMTestCase):
220 | 
221 |     def setUp(self):
222 |         super(ManyToOneORMTestCase, self).setUp()
223 | 
224 |         class ToDo(self.base_class):
225 |             _columns = [orm.Column('user_id'),
226 |                         orm.Column('action'),
227 |                         orm.Column('completion_time', default=None, convert=schemaless.orm.converters.DateTimeConverter)]
228 |             _indexes = [orm.Index('index_todo_user_id', ['user_id'])]
229 |                         #orm.Index('index_todo_user_id_time', ['user_id', 'completion_time'])]
230 | 
231 |         self.ToDo = ToDo
232 | 
233 |     @property
234 |     def connection(self):
235 |         return self.session.datastore.connection
236 | 
237 |     def test_update_multiple(self):
238 |         user_id = schemaless.guid()
239 |         item1 = self.ToDo(user_id=user_id, action='buy groceries').save()
240 |         item2 = self.ToDo(user_id=user_id, action='buy groceries').save()
241 | 
242 |         self.assert_equal(self.get_index_count('index_todo_user_id'), 2)
243 | 
244 |         item1.completion_time = datetime.datetime.now()
245 |         item1.save()
246 | 
247 |         self.assert_equal(self.get_index_count('index_todo_user_id'), 2)
248 | 
249 | class AutomaticORMTestCase(ORMTestCase):
250 |     """Test ORM documents with automatic indexes."""
251 | 
252 |     def setUp(self):
253 |         super(AutomaticORMTestCase, self).setUp()
254 | 
255 |         class Business(self.base_class):
256 |             _columns = [orm.Char('business_id', 32),
257 |                         orm.String('city', 255),
258 |                         orm.Char('state', 2),
259 |                         orm.Bool('active', default=True)]
260 |             _indexes = [('business_id',),
261 |                         ('city', 'state')]
262 | 
263 |         self.Business = Business
264 | 
265 |     def add_biz(self, city='Oakland', state='CA'):
266 |         return self.Business(business_id=schemaless.guid(), city=city, state=state).save()
267 | 
268 |     def test_querying(self):
269 |         b = self.add_biz()
270 |         assert not b.is_dirty
271 | 
272 |         self.assert_equal(b, self.Business.get(c.business_id == b.business_id))
273 |         self.assert_used_index(self.Business, 'index_00003_850f22a7c399fd1483275d62703d49de')
274 | 
275 |         self.assert_equal(b, self.Business.get(c.city == b.city, c.state == b.state))
276 |         self.assert_used_index(self.Business, 'index_00003_8e5e9c3d848aa30749151092bbff622d')
277 | 
278 |     def test_all(self):
279 |         b = self.add_biz()
280 |         assert not b.is_dirty
281 | 
282 |         self.assert_len(1, self.Business.all())
283 |         self.assert_used_index(self.Business, 'entities')
284 | 
285 |     def test_bool(self):
286 |         b = self.add_biz()
287 |         self.assert_equal(b, self.Business.get(c.business_id == b.business_id, c.active == True))
288 |         self.assert_equal(b, self.Business.get(c.business_id == b.business_id, c.active != False))
289 | 
290 | if __name__ == '__main__':
291 |     unittest.main()
292 | 
--------------------------------------------------------------------------------