├── .dir-locals.el
├── .gitignore
├── LICENSE
├── README.md
├── examples
│   ├── blog
│   │   ├── main.py
│   │   ├── static
│   │   │   └── blog.css
│   │   ├── tags.yaml
│   │   └── templates
│   │       ├── base.html
│   │       ├── main.html
│   │       └── post.html
│   └── mysqlbench
│       ├── bench.yaml
│       ├── mysqlbench.py
│       └── plot.py
├── schemaless
│   ├── __init__.py
│   ├── batch.py
│   ├── column.py
│   ├── datastore.py
│   ├── guid.py
│   ├── index.py
│   ├── log.py
│   └── orm
│       ├── __init__.py
│       ├── column.py
│       ├── converters.py
│       ├── document.py
│       ├── index.py
│       ├── session.py
│       └── util.py
├── setup.py
└── tests
    ├── tables.sql
    └── tests.py

--------------------------------------------------------------------------------
/.dir-locals.el:
--------------------------------------------------------------------------------
1 | ((nil . ((indent-tabs-mode . nil)
2 |          (tab-width . 4)
3 |          (fill-column . 80)))
4 |  (sh-mode . ((tab-width . 2)
5 |              (indent-tabs-mode . nil)))
6 |  (python-mode . ((tab-width . 4)
7 |                  (indent-tabs-mode . nil)
8 |                  (python-indent . 4))))
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[co]
2 | *.sw[nop]
3 | *~
4 | build
5 | \#*#
6 | \.\#*
7 | *.csv
8 | *.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010, Evan Klitzke
2 |
3 | Permission to use, copy, modify, and/or distribute this software for any
4 | purpose with or without fee is hereby granted, provided that the above
5 | copyright notice and this permission notice appear in all copies.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **Disclaimer: this is alpha-quality code, and the API is not yet stable**
2 |
3 | Introduction
4 | ============
5 |
6 | Schemaless is a Python module that implements the pattern described by Bret
7 | Taylor in his post
8 | [How FriendFeed uses MySQL to store schema-less data](http://bret.appspot.com/entry/how-friendfeed-uses-mysql). There
9 | are a couple of other Python modules out there that do this already. Here's how
10 | schemaless is different:
11 |
12 | * Only MySQL is supported. That said, I'd love to add SQLite support in the
13 |   future.
14 | * Sharding isn't yet supported. Should be pretty straightforward to implement,
15 |   though.
16 | * There's an optional "ORM" (which isn't really relational) implemented as
17 |   `schemaless.orm`. The "ORM" really is optional, and the interface described
18 |   by FriendFeed is fully usable and decoupled from the session/object stuff.
19 | * The ORM is designed to be mostly declarative and easy to use. That means that
20 |   you can say, "I have a document type `User`, and please can I have an
21 |   index on `(user_id)`, and I'd also like an index on `(first_name, last_name)`
22 |   please."
The ORM will then create the necessary index tables and 23 | automatically update them when you add new users; it will also know how to 24 | pick the most specific index, given an arbitrary query. 25 | 26 | Basic Usage 27 | =========== 28 | 29 | The code exported under the `schemaless` module exactly mimics the behavior and 30 | interface described by FriendFeed. 31 | 32 | Example 33 | ------- 34 | 35 | Consider the following MySQL database schema: 36 | 37 | CREATE TABLE entities ( 38 | added_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, 39 | id BINARY(16) NOT NULL, 40 | updated TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, 41 | body MEDIUMBLOB, 42 | UNIQUE KEY (id), 43 | KEY (updated) 44 | ) ENGINE=InnoDB; 45 | 46 | CREATE TABLE index_user_id ( 47 | entity_id BINARY(16) NOT NULL UNIQUE, 48 | user_id CHAR(32) NOT NULL, 49 | PRIMARY KEY (user_id, entity_id) 50 | ) ENGINE=InnoDB; 51 | 52 | CREATE TABLE index_user_name ( 53 | entity_id BINARY(16) NOT NULL UNIQUE, 54 | first_name VARCHAR(255) NOT NULL, 55 | last_name VARCHAR(255) NOT NULL, 56 | PRIMARY KEY (first_name, last_name, entity_id) 57 | ) ENGINE=InnoDB; 58 | 59 | CREATE TABLE index_foo ( 60 | entity_id BINARY(16) NOT NULL UNIQUE, 61 | bar INTEGER NOT NULL, 62 | PRIMARY KEY (bar, entity_id) 63 | ) ENGINE=InnoDB; 64 | 65 | The meaning of all of these tables should be clear to you if you've read Bret's 66 | blog post. The following code is a simple example of the interface that 67 | Schemaless provides: 68 | 69 | import schemaless 70 | from schemaless import c 71 | 72 | ds = schemaless.DataStore(mysql_shards=['localhost:3306'], user='foo', password='foo', database='foo') 73 | 74 | # declare which indexes are available 75 | user = ds.define_index('index_user_id', ['user_id']) 76 | user_name = ds.define_index('index_user_name', ['first_name', 'last_name']) 77 | foo = ds.define_index('index_foo', ['bar']) 78 | 79 | # automatically knows that index entries should be created in index_user_id and 80 | # index_user_name, based on the keys in the row given 81 | row = ds.put({'first_name': 'evan', 'last_name': 'klitzke', 'user_id': schemaless.guid()}) 82 | 83 | # query based on user_id, using the index defined by 'index_user_id' 84 | print user.query(c.user_id == row.user_id) 85 | 86 | # query based on first/last name, using the index defined by 'index_user_name' 87 | print user_name.query(c.first_name == 'evan', c.last_name == 'klitzke') 88 | 89 | ORM Layer 90 | ========= 91 | 92 | There's an optional ORM layer, exported via the module `schemaless.orm`. When 93 | you use the ORM layer you can use indexes declaratively, and Schemaless can 94 | automatically pick the correct index to use based on your query. The ORM layer 95 | also knows how to do queries when a full index isn't available (e.g. if you add 96 | a query restriction that isn't fully covered by an index). 97 | 98 | Example 99 | ------- 100 | 101 | The best way to get a feel for the ORM is to look at the example in 102 | `examples/blog/main.py`. This is the implementation of a trivial "blog" 103 | application that uses Schemaless and Tornado. It's only about a hundred lines of 104 | code, and shows a few different working parts interacting together. 105 | 106 | Adding Indexes 107 | ============== 108 | 109 | There's a class called `IndexUpdater` exported by the `schemaless` module that 110 | provides a basic template for batches that add/update/prune indexes. 
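To give a concrete feel for the shape of such a batch, here is the example from
the module docstring of `schemaless/batch.py`, lightly condensed (the shard,
user, and password settings are placeholders):

    import schemaless

    class AddUserIdIndex(schemaless.IndexUpdater):

        def initialize(self):
            super(AddUserIdIndex, self).initialize()
            self.datastore = schemaless.DataStore(mysql_shards=['localhost:3306'],
                                                  user='test', password='test',
                                                  database='test')
            self.conn = self.datastore.connection

        def process_row(self, row, entity):
            if entity.get('user_id'):
                self.conn.execute('INSERT IGNORE INTO index_user_id (entity_id, user_id) VALUES (%s, %s)',
                                  schemaless.to_raw(entity.id), entity.user_id)

    if __name__ == '__main__':
        AddUserIdIndex().start()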
It's
111 | easiest to understand how `IndexUpdater` works by reading the source in
112 | `schemaless/batch.py`; the module docstring there walks through the full
113 | version of this example.
--------------------------------------------------------------------------------
/examples/blog/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import optparse
3 | import datetime
4 |
5 | import tornado.web
6 | import tornado.ioloop
7 | import tornado.httpserver
8 |
9 | import schemaless
10 | from schemaless import c
11 | from schemaless import orm
12 |
13 | dirname = os.path.dirname(__file__)
14 |
15 | ##############
16 | # ORM Things
17 | ##############
18 |
19 | datastore = schemaless.DataStore(mysql_shards=['localhost:3306'], user='test', password='test', database='test')
20 | session = orm.Session(datastore)
21 | Base = orm.make_base(session, tags_file=os.path.join(dirname, 'tags.yaml'))
22 |
23 | class Post(Base):
24 |     _columns = [
25 |         orm.String('title', 255, required=True),
26 |         orm.Text('content', required=True),
27 |         orm.DateTime('time_created', default=datetime.datetime.now)
28 |     ]
29 |
30 |     _indexes = [['time_created']]
31 |
32 |     @classmethod
33 |     def new_post(cls, title, content):
34 |         return cls(title=title, content=content).save()
35 |
36 |     @property
37 |     def comments(self):
38 |         """Get all the comments for this post, ordered by time created."""
39 |         if not hasattr(self, '_comments'):
40 |             comments = Comment.query(c.post_id == self.id)
41 |             self._comments = sorted(comments, key=lambda c: c.time_created)
42 |         return self._comments
43 |
44 | class Comment(Base):
45 |     _columns = [
46 |         orm.Guid('post_id', required=True),
47 |         orm.String('author', 255),
48 |         orm.Text('content', required=True),
49 |         orm.DateTime('time_created', default=datetime.datetime.now)
50 |     ]
51 |
52 |     _indexes = [['post_id']]
53 |
54 |     @classmethod
55 |     def reply(cls, post_id, author, content):
56 |         return cls(post_id=post_id, author=author, content=content).save()
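# With the declarations above, the ORM metaclass (schemaless/orm/document.py)
# creates any missing index tables at class-definition time, one per entry in
# _indexes, named index_<tag>_<md5 of the columns> (the tags 1 and 2 come from
# tags.yaml), and routes each query to the most specific usable index.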
57 |
58 | ##############
59 | # Tornado Things
60 | ##############
61 |
62 | class MainHandler(tornado.web.RequestHandler):
63 |
64 |     def get(self):
65 |         posts = sorted(Post.all(), key=lambda x: x.time_created, reverse=True)
66 |         self.render('main.html', title='Blog', posts=posts)
67 |
68 | class PostHandler(tornado.web.RequestHandler):
69 |
70 |     def get(self):
71 |         self.render('post.html', title='New Post')
72 |
73 |     def post(self):
74 |         title = self.get_argument('title')
75 |         content = self.get_argument('content')
76 |         Post.new_post(title, content)
77 |         self.redirect('/')
78 |
79 | class CommentHandler(tornado.web.RequestHandler):
80 |
81 |     def post(self):
82 |         post_id = self.get_argument('post_id')
83 |         author = self.get_argument('author')
84 |         content = self.get_argument('content')
85 |         Comment.reply(post_id, author, content)
86 |         self.redirect('/')
87 |
88 | settings = {
89 |     'static_path': os.path.join(dirname, 'static'),
90 |     'template_path': os.path.join(dirname, 'templates'),
91 |     'cookie_secret': '61oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=',
92 |     'xsrf_cookies': True,
93 | }
94 |
95 | application = tornado.web.Application([
96 |     ('/', MainHandler),
97 |     ('/post', PostHandler),
98 |     ('/comment', CommentHandler)], **settings)
99 |
100 | if __name__ == '__main__':
101 |     parser = optparse.OptionParser()
102 |     parser.add_option('-p', '--port', type='int', default=8888, help='which port to listen on')
103 |     parser.add_option('-c', '--clear', action='store_true', default=False, help='clear all tables when starting')
104 |     opts, args = parser.parse_args()
105 |
106 |     if opts.clear:
107 |         tables = set()
108 |         for d in datastore.connection.query('SHOW TABLES'):
109 |             tables |= set(d.values())
110 |         for t in tables:
111 |             datastore.connection.execute('DELETE FROM %s' % t)
112 |
113 |     http_server = tornado.httpserver.HTTPServer(application)
114 |     http_server.listen(opts.port)
115 |     print 'blog waiting at http://localhost:%d' % opts.port
116 |     tornado.ioloop.IOLoop.instance().start()
--------------------------------------------------------------------------------
/examples/blog/static/blog.css:
--------------------------------------------------------------------------------
1 | h1 { font-weight: bold; font-size: 150%; }
2 | h2 { font-weight: bold; font-size: 120%; }
3 | h3 { font-weight: bold; font-size: 110%; }
4 |
5 | #bd { margin: 1em; max-width: 600px; }
6 |
7 | fieldset { border: 1px solid black; padding: 0.5em; }
8 | legend { font-style: italic; }
9 | .datetime { font-style: italic; }
10 | .comment_box { padding: 0.5em; }
11 | .secret { color: #777; }
12 | a { color: inherit; }
--------------------------------------------------------------------------------
/examples/blog/tags.yaml:
--------------------------------------------------------------------------------
1 | Post: 1
2 | Comment: 2
--------------------------------------------------------------------------------
/examples/blog/templates/base.html:
--------------------------------------------------------------------------------
1 | <html>
2 | <head>
3 |   <title>{{escape(title)}}</title>
4 |   <link rel="stylesheet" type="text/css" href="/static/blog.css">
5 | </head>
6 | <body>
7 | <div id="bd">
8 |   {% block body %}{% end %}
9 | </div>
10 | </body>
11 | </html>
--------------------------------------------------------------------------------
/examples/blog/templates/main.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block body %}

4 | <h1>Schemaless Blog</h1>
5 | <a href="/post">click here to write a new post</a>
6 | {% for post in posts %}
7 | <h2>{{escape(post.title)}}</h2>
8 | <div class="datetime">{{post.time_created.strftime('%Y-%m-%d %H:%M')}}</div>
9 | <div>
10 |   {{post.content}} {% comment unescaped!!! %}
11 | </div>
12 | {% if post.comments %}
13 | <h3>Comments</h3>
14 | {% for comment in post.comments %}
15 | <div class="comment_box">
16 |   <span class="secret">{{escape(comment.author)}} ({{comment.time_created.strftime('%Y-%m-%d %H:%M')}}):</span>
17 |   <div>
18 |     {{escape(comment.content)}}
19 |   </div>
20 | </div>
21 | {% end %}
22 | {% end %}
23 | <form action="/comment" method="post">
24 | <fieldset>
25 |   <legend>Leave a Comment</legend>
26 |   {{xsrf_form_html()}}
27 |   <input type="hidden" name="post_id" value="{{post.id}}">
28 |   <div>name: <input type="text" name="author"></div>
29 |   <div><textarea name="content" rows="4" cols="40"></textarea></div>
30 |   <div><input type="submit" value="post comment"></div>
31 | </fieldset>
32 | </form>
33 |
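<!-- the comment form above posts to CommentHandler in main.py, which calls Comment.reply and redirects home -->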
34 | {% end %}
35 | {% end %}
--------------------------------------------------------------------------------
/examples/blog/templates/post.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 |
3 | {% block body %}
4 | <a href="/">return to blog</a>
5 | <form action="/post" method="post">
6 | <fieldset>
7 |   <legend>new post</legend>
8 |   {{xsrf_form_html()}}
9 |
10 |   <div>title: <input type="text" name="title"></div>
11 |   <div><textarea name="content" rows="10" cols="60"></textarea></div>
12 |
13 |   <div><input type="submit" value="new post"></div>
14 | </fieldset></form>
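<!-- posts to PostHandler in main.py, which calls Post.new_post and redirects to / -->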
15 | {% end %}
16 |
--------------------------------------------------------------------------------
/examples/mysqlbench/bench.yaml:
--------------------------------------------------------------------------------
1 | user: test
2 | passwd: test
3 | db: test
--------------------------------------------------------------------------------
/examples/mysqlbench/mysqlbench.py:
--------------------------------------------------------------------------------
1 | """MySQL benchmark tool, for comparing different table schemas.
2 |
3 | Some notes:
4 |  * run like: python mysqlbench.py my_config_file.yaml
5 |  * there's a few different options to change things up, invoke with -h to see
6 |    them
7 |  * you should run this with a total of at least a quarter million rows or
8 |    so (default is one million) to ensure that you see the slowdown from MySQL
9 |    checking uniqueness constraints; i.e., make sure you see a slowdown as
10 |    iterations increase for the schemas that create tables with a unique uuid
11 |    column
12 |  * there's some overhead from having to generate uuids (which is done by reading
13 |    16 bytes from /dev/urandom); IME the benchmark is still very much
14 |    MySQL-bound, but if you're concerned you can pre-allocate an array of uuids
15 |    in the bench() function, at the cost of using gobs of memory
16 |  * if you're on a machine without dedicated hardware (e.g. a VPS), you'll
17 |    probably see interesting things with transaction times fluctuating wildly as
18 |    your instance gets access to hardware
19 |
20 | An example yaml config file (ignore the lines starting with ---):
21 |
22 | --- start yaml file ---
23 | user: test
24 | passwd: test
25 | db: test
26 | --- end yaml file ---
27 |
28 | """
29 |
30 | import os
31 | import csv
32 | import math
33 | import time
34 | import yaml
35 | import optparse
36 | import MySQLdb
37 |
38 | OVERALL_TIMES = []
39 |
40 | def drop_test_entities(conn):
41 |     c = conn.cursor()
42 |     c.execute('SELECT table_name FROM information_schema.tables WHERE table_schema = DATABASE() AND table_name = %s', ('test_entities',))
43 |     if c.fetchone():
44 |         c.execute('DROP TABLE test_entities')
45 |
46 | def create_table(conn, lines, data):
47 |     q = []
48 |     q.append('CREATE TABLE test_entities (')
49 |     q.extend(['    ' + l for l in lines])
50 |     q.append(') ENGINE=InnoDB')
51 |
52 |     query = '\n'.join(q)
53 |     c = conn.cursor()
54 |     print query
55 |     c.execute(query)
56 |     if data:
57 |         c.execute('ALTER TABLE test_entities ADD COLUMN payload MEDIUMBLOB')
58 |         print 'ALTER TABLE test_entities ADD COLUMN payload MEDIUMBLOB'
59 |
60 | def increment_worker(c, data):
61 |     os.urandom(16) # ensure that this has the same overhead as uuid_worker;
62 |                    # comment out if you don't like this fairness
63 |     if data:
64 |         c.execute('INSERT INTO test_entities (added_id, payload) VALUES (NULL, %s)', (data,))
65 |     else:
66 |         c.execute('INSERT INTO test_entities (added_id) VALUES (NULL)')
67 |
68 | def uuid_worker(c, data):
69 |     if data:
70 |         c.execute('INSERT INTO test_entities (id, payload) VALUES (%s, %s)', (os.urandom(16), data))
71 |     else:
72 |         c.execute('INSERT INTO test_entities (id) VALUES (%s)', (os.urandom(16),))
73 |
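# bench() drops and recreates test_entities with the given schema lines, then
# times opts.num_iterations transactions of opts.batch_size inserts each,
# printing the per-transaction times plus average/median/stddev at the end.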
74 | def bench(name, opts, conn, data, schema, worker=uuid_worker):
75 |     drop_test_entities(conn)
76 |     print name
77 |     print '=' * len(name)
78 |     create_table(conn, schema, data=data)
79 |     if opts.sleep:
80 |         time.sleep(opts.sleep)
81 |     times = []
82 |     c = conn.cursor()
83 |     for x in xrange(opts.num_iterations):
84 |         if not opts.autocommit:
85 |             c.execute('SET TRANSACTION ISOLATION LEVEL REPEATABLE READ')
86 |         ts = time.time()
87 |         for y in xrange(opts.batch_size):
88 |             worker(c, data)
89 |         conn.commit()
90 |         elapsed = time.time() - ts
91 |         times.append(elapsed)
92 |         print '% 4d %f' % (x + 1, elapsed)
93 |     OVERALL_TIMES.append((name, times))
94 |     sorted_times = sorted(times)
95 |     total = sum(times)
96 |     avg = total / len(times)
97 |     if len(times) % 2 == 0:
98 |         idx = len(times) / 2
99 |         med = (sorted_times[idx - 1] + sorted_times[idx]) / 2
100 |     else:
101 |         med = sorted_times[len(sorted_times) / 2]
102 |     dev = math.sqrt(sum((x - avg)**2 for x in times) / len(times))
103 |     print
104 |     print 'average = %1.3f' % (avg,)
105 |     print 'median = %1.3f' % (med,)
106 |     print 'std dev = %1.3f' % (dev,)
107 |     print
108 |     return times
109 |
110 | def main(opts, args):
111 |     start = time.time()
112 |     cfg = yaml.load(open(args[0]).read())
113 |     conn = MySQLdb.connect(**cfg)
114 |
115 |     opsys, host, kernel, dt, arch = os.uname()
116 |     print '%s %s' % (opsys, kernel)
117 |     print 'MySQL ' + conn.get_server_info()
118 |     print
119 |     print 'running %d iterations of %d inserts per txn (%d rows total)' % (opts.num_iterations, opts.batch_size, opts.num_iterations * opts.batch_size)
120 |     if opts.autocommit:
121 |         conn.cursor().execute('SET autocommit = 1')
122 |         print 'autocommit is ON'
123 |     else:
124 |         conn.cursor().execute('SET autocommit = 0')
125 |         print 'autocommit is OFF'
126 |     print
127 |
128 |     data = os.urandom(opts.data) if opts.data else None
129 |     bench('just auto_increment', opts, conn, data,
130 |           ['added_id INTEGER NOT NULL AUTO_INCREMENT,',
131 |            'PRIMARY KEY (added_id)'], increment_worker)
132 |
133 |     bench('auto_increment, key', opts, conn, data,
134 |           ['added_id INTEGER NOT NULL AUTO_INCREMENT,',
135 |            'id BINARY(16) NOT NULL,',
136 |            'PRIMARY KEY (added_id),',
137 |            'KEY (id)'])
138 |
139 |     bench('auto_increment, unique key', opts, conn, data,
140 |           ['added_id INTEGER NOT NULL AUTO_INCREMENT,',
141 |            'id BINARY(16) NOT NULL,',
142 |            'PRIMARY KEY (added_id),',
143 |            'UNIQUE KEY (id)'])
144 |
145 |     bench('w/o auto-increment, key', opts, conn, data,
146 |           ['id BINARY(16) NOT NULL,',
147 |            'KEY (id)'])
148 |
149 |     bench('w/o auto-increment, unique key', opts, conn, data,
150 |           ['id BINARY(16) NOT NULL,',
151 |            'UNIQUE KEY (id)'])
152 |
153 |     bench('w/o auto-increment, primary key', opts, conn, data,
154 |           ['id BINARY(16) NOT NULL,',
155 |            'PRIMARY KEY (id)'])
156 |
157 |     drop_test_entities(conn)
158 |     if opts.csv:
159 |         writer = csv.writer(open(opts.csv, 'w'))
160 |         names = ['cumulative'] + [name for name, _ in OVERALL_TIMES]
161 |         writer.writerow(names)
162 |         writer.writerow([0 for x in xrange(len(OVERALL_TIMES) + 1)])
163 |         for x in xrange(opts.num_iterations):
164 |             tot = (x + 1) * opts.batch_size
165 |             writer.writerow([tot] + [t[x] for _, t in OVERALL_TIMES])
166 |         print 'csv output is in %r' % (opts.csv,)
167 |     print 'total time was %1.3f seconds' % (time.time() - start)
168 |
169 | if __name__ == '__main__':
170 |     parser = optparse.OptionParser()
171 |     parser.add_option('-a', '--autocommit', action='store_true', default=False, help='Enable auto-commit')
172 |     parser.add_option('-b', '--batch-size', type='int', default=10000, help='How many rows to insert per txn')
173 |     parser.add_option('-c', '--csv', default=None, help='Store benchmark output in the specified CSV file')
174 |     parser.add_option('-d', '--data', type='int', default=0, help='Add a data column, with this size')
175 |     parser.add_option('-n', '--num-iterations', type='int', default=100, help='How many
iterations to run') 176 | parser.add_option('-s', '--sleep', type='int', default=10, help='How long to sleep between tests') 177 | opts, args = parser.parse_args() 178 | if len(args) != 1: 179 | parser.error('must pass exactly one argument, the path to the mysql config file') 180 | main(opts, args) 181 | -------------------------------------------------------------------------------- /examples/mysqlbench/plot.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import optparse 4 | 5 | import matplotlib 6 | from matplotlib import pyplot 7 | 8 | pyplot.rcParams.update({ 9 | 'backend': 'cairo', 10 | 'axes.labelsize': 10, 11 | 'legend.fontsize': 10, 12 | 'xtick.labelsize': 8, 13 | 'ytick.labelsize': 8, 14 | 'font.sans-serif': ['Droid Sans']}) 15 | 16 | def main(csv_name, opts): 17 | reader = iter(csv.reader(open(csv_name))) 18 | names = reader.next() 19 | data = dict((n, []) for n in names) 20 | for row in reader: 21 | for name, val in zip(names, row): 22 | data[name].append(float(val)) 23 | 24 | for name in names[1:]: 25 | xs, ys = [], [] 26 | for x in xrange(len(data[name])): 27 | xs.append(data['cumulative'][x]) 28 | ys.append(data[name][x]) 29 | pyplot.plot(xs, ys, label=name) 30 | #pyplot.scatter(xs, ys, label=name) 31 | pyplot.xlabel('cumulative # of records inserted') 32 | pyplot.ylabel('seconds per 10k inserts') 33 | pyplot.legend(loc=2) 34 | if opts.title: 35 | pyplot.title(opts.title) 36 | 37 | pyplot.savefig(opts.output, format='png', dpi=120) 38 | 39 | if __name__ == '__main__': 40 | parser = optparse.OptionParser() 41 | parser.add_option('-t', '--title', default=None, help='the title to use') 42 | parser.add_option('-o', '--output', default='graph.png', help='what file to output to') 43 | opts, args = parser.parse_args() 44 | if len(args) != 1: 45 | parser.error('must specify an input file') 46 | main(args[0], opts) 47 | -------------------------------------------------------------------------------- /schemaless/__init__.py: -------------------------------------------------------------------------------- 1 | from schemaless.guid import * 2 | from schemaless.column import Entity, c 3 | from schemaless.index import Index 4 | from schemaless.datastore import DataStore 5 | from schemaless.batch import IndexUpdater, main 6 | -------------------------------------------------------------------------------- /schemaless/batch.py: -------------------------------------------------------------------------------- 1 | """Module to assist in writing index updating batches. 2 | 3 | Here's an example of a really simple batch that adds a user_id index. 
4 | 5 | ---------------------------------------------- 6 | 7 | import schemaless 8 | 9 | class AddUserIdIndex(schemaless.IndexUpdater): 10 | 11 | def initialize(self): 12 | super(AddUserIdIndex, self).initialize() 13 | self.datastore = schemaless.DataStore(mysql_shards=['localhost:3306'], 14 | user='test', password='test', database='test') 15 | self.conn = self.datastore.connection 16 | 17 | def process_row(self, row, entity): 18 | if entity.get('user_id'): 19 | self.conn.execute('INSERT IGNORE INTO index_user_id (entity_id, user_id) VALUES (%s, %s)', 20 | schemaless.to_raw(entity.id), entity.user_id) 21 | 22 | if __name__ == '__main__': 23 | AddUserIdIndex().start() 24 | """ 25 | import time 26 | import logging 27 | import optparse 28 | 29 | from schemaless.column import Entity 30 | from schemaless.log import ClassLogger 31 | 32 | class IndexUpdater(object): 33 | """Class that implements a simple batch for updating indexes. This is meant 34 | to be a base class which is subclassed by the user, with the subclass doing 35 | whatever work is actually required to add the new index. Note that this 36 | batch is also appropriate for deleting data, or doing any other operation 37 | which requires iterating over a database table. 38 | 39 | At the very minimum you must implement your own process_row method. 40 | """ 41 | 42 | log = ClassLogger() 43 | use_zlib = True 44 | 45 | def __init__(self): 46 | self.parser = optparse.OptionParser() 47 | self.parser.add_option('--start-added-id', dest='start_added_id', type='int', default=0, help='Which added_id to start at') 48 | self.parser.add_option('--batch-size', dest='batch_size', type='int', default=100, help='How many rows to process at a time') 49 | 50 | def initialize(self): 51 | self.rows_processed = 0 52 | self.start_run = time.time() 53 | self.last_id_processed = self.opts.start_added_id 54 | self.configure_logging() 55 | 56 | def configure_logging(self): 57 | logging.basicConfig(level=logging.DEBUG) 58 | 59 | def start(self): 60 | self.opts, self.args = self.parser.parse_args() 61 | self.initialize() 62 | self.run() 63 | 64 | def row_iterator(self): 65 | conn = self.datastore.connection 66 | conn.execute('SET AUTOCOMMIT=1') 67 | next_row = self.opts.start_added_id 68 | while True: 69 | rows = conn.query('SELECT * FROM entities WHERE added_id >= %s ORDER BY added_id ASC LIMIT %s', next_row, self.opts.batch_size) 70 | if rows: 71 | for row in rows: 72 | yield row 73 | next_row = row['added_id'] + 1 74 | else: 75 | break 76 | 77 | def process_row(self, row, entity): 78 | """Every subclass must implement this method at a minimum. The function 79 | takes two arguments, the raw row returned by MySQL, and an entity object 80 | representing the BLOB data stored in the row. 
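The entity argument behaves like a dict with attribute access (see
        schemaless.column.Entity), so entity.get('user_id') and entity.user_id
        refer to the same value.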
81 |         """
82 |         raise NotImplementedError
83 |
84 |     def run(self):
85 |         rows_processed = 0
86 |         self.log.info('starting run loop')
87 |         try:
88 |             for row in self.row_iterator():
89 |                 entity = Entity.from_row(row, use_zlib=self.use_zlib)
90 |                 self.process_row(row, entity)
91 |                 self.rows_processed += 1
92 |                 self.last_id_processed = row['added_id']
93 |         except:
94 |             self.log.exception('exception during run loop!')
95 |         finally:
96 |             elapsed_time = time.time() - self.start_run
97 |             self.log.info('finished run loop, elapsed time = %1.2f seconds, processed %d rows, last added_id was %d' % (elapsed_time, self.rows_processed, self.last_id_processed))
98 |
99 | def main(batch_cls):
100 |     batch_instance = batch_cls()
101 |     batch_instance.start()
--------------------------------------------------------------------------------
/schemaless/column.py:
--------------------------------------------------------------------------------
1 | import simplejson
2 | import zlib
3 | from schemaless.guid import guid
4 |
5 | class Entity(dict):
6 |
7 |     @classmethod
8 |     def new(cls):
9 |         return Entity(id=guid())
10 |
11 |     @classmethod
12 |     def from_row(cls, row, use_zlib=False):
13 |         body = row['body']
14 |         if use_zlib:
15 |             body = zlib.decompress(body)
16 |         d = simplejson.loads(body)
17 |         d['id'] = row['id'].encode('hex')
18 |         d['updated'] = row['updated']
19 |         return cls(d)
20 |
21 |     # hasattr() works via __getattr__ below, which raises AttributeError for
22 |     # missing keys; no special __hasattr__ protocol exists or is needed
23 |     def __getattr__(self, name):
24 |         try:
25 |             return self[name]
26 |         except KeyError:
27 |             raise AttributeError(name)
28 |
29 |     def __setattr__(self, name, val):
30 |         self[name] = val
31 |
32 |     def __str__(self):
33 |         return str(dict(self.items()))
34 |
35 | class Column(object):
36 |
37 |     def __init__(self, name):
38 |         self.name = name
39 |
40 |     def to_string(self):
41 |         return self.name
42 |
43 |     __str__ = to_string
44 |
45 |     def __lt__(self, val):
46 |         return ColumnExpression(self.name, ColumnExpression.OP_LT, val)
47 |
48 |     def __le__(self, val):
49 |         return ColumnExpression(self.name, ColumnExpression.OP_LE, val)
50 |
51 |     def __eq__(self, val):
52 |         return ColumnExpression(self.name, ColumnExpression.OP_EQ, val)
53 |
54 |     def __ne__(self, val):
55 |         return ColumnExpression(self.name, ColumnExpression.OP_NE, val)
56 |
57 |     def __gt__(self, val):
58 |         return ColumnExpression(self.name, ColumnExpression.OP_GT, val)
59 |
60 |     def __ge__(self, val):
61 |         return ColumnExpression(self.name, ColumnExpression.OP_GE, val)
62 |
63 |     def in_(self, vals):
64 |         return ColumnExpression(self.name, ColumnExpression.OP_IN, vals)
65 |
66 | class ColumnExpression(object):
67 |
68 |     OP_LT = 1
69 |     OP_LE = 2
70 |     OP_EQ = 3
71 |     OP_NE = 4
72 |     OP_GT = 5
73 |     OP_GE = 6
74 |     OP_IN = 7
75 |
76 |     def __init__(self, name, op, rhs):
77 |         self.name = name
78 |         self.op = op
79 |         self.rhs = rhs
80 |
81 |     def build(self):
82 |         if self.op == self.OP_LT:
83 |             return self.name + ' < %s', [self.rhs]
84 |         elif self.op == self.OP_LE:
85 |             return self.name + ' <= %s', [self.rhs]
86 |         elif self.op == self.OP_EQ:
87 |             if self.rhs is None:
88 |                 return '%s IS NULL' % self.name, []
89 |             else:
90 |                 return (self.name + ' = %s'), [self.rhs]
91 |         elif self.op == self.OP_NE:
92 |             if self.rhs is None:
93 |                 return '%s IS NOT NULL' % self.name, []
94 |             else:
95 |                 return (self.name + ' != %s'), [self.rhs]
96 |         elif self.op == self.OP_GT:
97 |             return (self.name + ' > %s'), [self.rhs]
98 |         elif self.op == self.OP_GE:
99 |             return (self.name + ' >= %s'), [self.rhs]
100 |         elif self.op == self.OP_IN:
101 |             sql = self.name + ' IN ('
+ ', '.join('%s' for x in self.rhs) + ')' 102 | return sql, self.rhs 103 | else: 104 | raise ValueError('Unknown operator') 105 | 106 | def check(self, val): 107 | val = val[self.name] 108 | if self.op == self.OP_LT: 109 | return val < self.rhs 110 | elif self.op == self.OP_LE: 111 | return val <= self.rhs 112 | elif self.op == self.OP_EQ: 113 | return val == self.rhs 114 | elif self.op == self.OP_NE: 115 | return val != self.rhs 116 | elif self.op == self.OP_GT: 117 | return val > self.rhs 118 | elif self.op == self.OP_GE: 119 | return val >= self.rhs 120 | elif self.op == self.OP_IN: 121 | return val in self.rhs 122 | else: 123 | raise ValueError('Unknown operator') 124 | 125 | def __str__(self): 126 | return '%s(name=%r, op=%d, rhs=%r)' % (self.__class__.__name__, self.name, self.op, self.rhs) 127 | __repr__ = __str__ 128 | 129 | class ColumnBuilder(object): 130 | 131 | def __init__(self): 132 | self._columns = {} 133 | 134 | def __getattr__(self, name): 135 | if name not in self._columns: 136 | self._columns[name] = Column(name) 137 | return self._columns[name] 138 | 139 | c = ColumnBuilder() 140 | -------------------------------------------------------------------------------- /schemaless/datastore.py: -------------------------------------------------------------------------------- 1 | import time 2 | import simplejson 3 | import zlib 4 | 5 | import tornado.database 6 | 7 | from schemaless.column import Entity 8 | from schemaless.index import Index 9 | from schemaless.guid import raw_guid 10 | from schemaless.log import ClassLogger 11 | 12 | class DataStore(object): 13 | 14 | log = ClassLogger() 15 | 16 | def __init__(self, mysql_shards=[], user=None, database=None, password=None, use_zlib=True, indexes=[], create_entities=True): 17 | if not mysql_shards: 18 | raise ValueError('Must specify at least one MySQL shard') 19 | if len(mysql_shards) > 1: 20 | raise NotImplementedError 21 | self.use_zlib = use_zlib 22 | self.indexes = [Index('entities', ['tag'])] 23 | self.connection = tornado.database.Connection(host=mysql_shards[0], user=user, password=password, database=database) 24 | if create_entities and not self.check_table_exists('entities'): 25 | self.create_entities_table() 26 | 27 | @property 28 | def tag_index(self): 29 | return self.indexes[0] 30 | 31 | def define_index(self, table, properties=[], match_on={}, shard_on=None): 32 | idx = Index(table=table, properties=properties, match_on=match_on, shard_on=shard_on, connection=self.connection, use_zlib=self.use_zlib) 33 | self.indexes.append(idx) 34 | return idx 35 | 36 | def _find_indexes(self, entity, include_entities=False): 37 | """Find all of the indexes that may index an entity, based on the keys 38 | in the entity. 
39 | """ 40 | keys = frozenset(entity.keys()) 41 | for idx in self.indexes: 42 | if idx.matches(entity, keys): 43 | if idx.table != 'entities': 44 | yield idx 45 | elif include_entities: 46 | yield idx 47 | 48 | def put(self, entity, tag=None): 49 | is_update = False 50 | entity['updated'] = time.time() 51 | entity_id = None 52 | 53 | entity_copy = entity.copy() 54 | 55 | # get the entity_id (or create a new one) 56 | entity_id = entity_copy.pop('id', None) 57 | if entity_id is None: 58 | entity_id = raw_guid() 59 | else: 60 | is_update = True 61 | if len(entity_id) != 16: 62 | entity_id = entity_id.decode('hex') 63 | body = simplejson.dumps(entity_copy) 64 | if self.use_zlib: 65 | body = zlib.compress(body, 1) 66 | 67 | if is_update: 68 | self._put_update(entity_id, entity_copy, body) 69 | return entity 70 | else: 71 | return self._put_new(entity_id, entity_copy, tag, body) 72 | 73 | def _insert_index(self, index, entity_id, entity): 74 | pnames = ['entity_id'] 75 | vals = [entity_id] 76 | for p in index.properties: 77 | pnames.append(p) 78 | vals.append(entity[p]) 79 | 80 | q = 'INSERT INTO %s (%s) VALUES (' % (index.table, ', '.join(pnames)) 81 | q += ', '.join('%s' for x in pnames) 82 | q += ')' 83 | try: 84 | self.connection.execute(q, *vals) 85 | except tornado.database.OperationalError: 86 | self.log.exception('query = %s, vals = %s' % (q, vals)) 87 | raise 88 | 89 | def _update_index(self, index, entity_id, entity): 90 | row = self.connection.get('SELECT * FROM %s WHERE entity_id = %%s' % (index.table,), entity_id) 91 | if row: 92 | vals = [] 93 | q = 'UPDATE %s SET ' % index.table 94 | qs = [] 95 | for p in index.properties: 96 | qs.append('%s = %%s' % p) 97 | vals.append(entity[p]) 98 | q += ', '.join(qs) 99 | q += ' WHERE entity_id = %s' 100 | vals.append(entity_id) 101 | self.connection.execute(q, *vals) 102 | else: 103 | self._insert_index(index, entity_id, entity) 104 | 105 | def _put_new(self, entity_id, entity, tag, body): 106 | pk = self.connection.execute('INSERT INTO entities (id, updated, tag, body) VALUES (%s, FROM_UNIXTIME(%s), %s, %s)', entity_id, int(entity['updated']), tag, body) 107 | for idx in self._find_indexes(entity): 108 | self._insert_index(idx, entity_id, entity) 109 | return self.by_id(entity_id) 110 | 111 | def _put_update(self, entity_id, entity, body): 112 | self.connection.execute('UPDATE entities SET updated = CURRENT_TIMESTAMP, body = %s WHERE id = %s', body, entity_id) 113 | for idx in self._find_indexes(entity): 114 | self._update_index(idx, entity_id, entity) 115 | 116 | def delete(self, entity=None, id=None): 117 | if entity is None and id is None: 118 | raise ValueError('Must provide delete with an entity and an id') 119 | if entity and 'id' not in entity: 120 | raise ValueError('Cannot provide an entity without an id') 121 | if not entity: 122 | entity = self.by_id(id) 123 | if not entity: 124 | return 0 125 | entity_id = entity['id'].decode('hex') 126 | 127 | def _delete(table_name): 128 | col = 'id' if table_name == 'entities' else 'entity_id' 129 | return int(bool(self.connection.execute('DELETE FROM %s WHERE %s = %%s' % (table_name, col), entity_id))) 130 | 131 | deleted = 0 132 | seen_entities = False 133 | for idx in self._find_indexes(entity): 134 | if idx.table == 'entities': 135 | seen_entities = True 136 | deleted += _delete(idx.table) 137 | if not seen_entities: 138 | deleted += _delete('entities') 139 | return deleted 140 | 141 | def by_id(self, id): 142 | if len(id) == 32: 143 | id = id.decode('hex') 144 | row = 
self.connection.get('SELECT * FROM entities WHERE id = %s', id)
145 |         return Entity.from_row(row, use_zlib=self.use_zlib) if row else None
146 |
147 |     def check_table_exists(self, table_name):
148 |         row = self.connection.get('SELECT COUNT(*) AS tbl_count FROM information_schema.tables WHERE table_schema = DATABASE() AND table_name = %s', table_name)
149 |         return bool(row['tbl_count'])
150 |
151 |     def create_entities_table(self):
152 |         self.connection.execute("""
153 | CREATE TABLE entities (
154 |     added_id INTEGER NOT NULL AUTO_INCREMENT,
155 |     id BINARY(16) NOT NULL,
156 |     updated TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
157 |     tag MEDIUMINT,
158 |     body MEDIUMBLOB NOT NULL,
159 |     PRIMARY KEY (added_id),
160 |     UNIQUE KEY (id),
161 |     KEY (updated)
162 | ) ENGINE=InnoDB""")
--------------------------------------------------------------------------------
/schemaless/guid.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | GUID_SIZE = 16
4 |
5 | def raw_guid(size=GUID_SIZE):
6 |     return os.urandom(size)
7 |
8 | def guid(size=GUID_SIZE):
9 |     return raw_guid(size=size).encode('hex')
10 |
11 | def to_raw(s):
12 |     return s.decode('hex')
13 |
14 | def to_str(r):
15 |     return r.encode('hex')
--------------------------------------------------------------------------------
/schemaless/index.py:
--------------------------------------------------------------------------------
1 | from schemaless.column import ColumnExpression, Entity
2 |
3 | class Order(object):
4 |
5 |     def __init__(self, name, asc=False, desc=False):
6 |         self.name = name
7 |         self.order = 'ASC' if asc else 'DESC'
8 |
9 | def reduce_args(*exprs, **kwargs):
10 |     limit = kwargs.pop('limit', None)
11 |     order_by = kwargs.pop('order_by', None)
12 |     asc = kwargs.pop('asc', False)
13 |     desc = kwargs.pop('desc', False)
14 |     if asc and desc:
15 |         raise ValueError('Cannot specify both asc=True and desc=True')
16 |     if order_by:
17 |         if not (asc or desc):
18 |             asc = True
19 |         order_by = Order(order_by, asc=asc, desc=desc)
20 |
21 |     exprs = list(exprs)
22 |     for k, v in kwargs.iteritems():
23 |         exprs.append(ColumnExpression(k, ColumnExpression.OP_EQ, v))
24 |
25 |     # if it's just an order_by, check for the order_by column not null
26 |     #if order_by and not exprs:
27 |     #    exprs.append(ColumnExpression(order_by.name, ColumnExpression.OP_NE, None))
28 |
29 |     if not (order_by or exprs):
30 |         raise ValueError('Must provide args/kwargs for a WHERE clause')
31 |     return exprs, order_by, limit
32 |
33 | class Index(object):
34 |
35 |     def __init__(self, table, properties=[], match_on={}, shard_on=None, connection=None, use_zlib=True):
36 |         if shard_on is not None:
37 |             raise NotImplementedError
38 |         bad_properties = [p for p in properties if ',' in p]
39 |         if bad_properties:
40 |             raise ValueError('Bad property name(s): %r' % (bad_properties,))
41 |
42 |         self.table = table
43 |         self.properties = frozenset(properties)
44 |         self.match_on = match_on
45 |         self.connection = connection
46 |         self.use_zlib = use_zlib
47 |
48 |     def __str__(self):
49 |         return '%s(table=%s, properties=%s, match_on=%s)' % (self.__class__.__name__, self.table, self.properties, self.match_on)
50 |     __repr__ = __str__
51 |
52 |     def __cmp__(self, other):
53 |         return cmp(self.table, other.table)
54 |
55 |     def matches(self, entity, keys):
56 |         if not (self.properties <= keys):
57 |             return False
58 |         for k, v in self.match_on.iteritems():
59 |             if entity.get(k) != v:
60 |                 return False
61 |         return True
62 |
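    # For ordinary index tables, _do_query runs the WHERE clause against the
    # index table to collect entity_ids, then fetches the matching rows from
    # entities and inflates them into Entity objects; for the special
    # 'entities' index the query runs against the entities table directly.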
63 |     def _query(self, *exprs, **kwargs):
64 |         exprs, order_by, limit = reduce_args(*exprs, **kwargs)
65 |         return self._do_query(exprs, order_by, limit)
66 |
67 |     def _do_query(self, exprs, order_by, limit):
68 |         values = []
69 |         where_clause = []
70 |         for e in exprs:
71 |             if e.name not in self.properties:
72 |                 raise ValueError('This index has no column named %r' % (e.name,))
73 |             expr_string, vals = e.build()
74 |             where_clause.append(expr_string)
75 |             values.extend(vals)
76 |
77 |         if self.table == 'entities':
78 |             # XXX: this is a bit hacky
79 |             q = 'SELECT * FROM entities WHERE ' + ' AND '.join(where_clause)
80 |             if order_by:
81 |                 q += ' ORDER BY %s %s' % (order_by.name, order_by.order)
82 |             if limit:
83 |                 q += ' LIMIT %d' % (limit,)
84 |             entity_rows = self.connection.query(q, *values)
85 |         else:
86 |             q = 'SELECT entity_id FROM %s' % self.table
87 |             if where_clause:
88 |                 q += ' WHERE ' + ' AND '.join(where_clause)
89 |             if order_by:
90 |                 q += ' ORDER BY %s %s' % (order_by.name, order_by.order)
91 |             if limit:
92 |                 q += ' LIMIT %d' % (limit,)
93 |
94 |             rows = self.connection.query(q, *values)
95 |             if rows:
96 |                 entity_ids = [r['entity_id'] for r in rows]
97 |                 q = 'SELECT * FROM entities WHERE id IN ('
98 |                 q += ', '.join('%s' for x in rows)
99 |                 q += ')'
100 |                 entity_rows = self.connection.query(q, *entity_ids)
101 |             else:
102 |                 return []
103 |
104 |         if not order_by:
105 |             #sorted_entities = sorted(entity_rows, key=lambda x: x['updated'], reverse=True)
106 |             sorted_entities = sorted(entity_rows, key=lambda x: x['updated'])
107 |         else:
108 |             # XXX: this is O(n^2), bad
109 |             sorted_entities = []
110 |             for row_id in (row['entity_id'] for row in rows):
111 |                 for e in entity_rows:
112 |                     if e['id'] == row_id:
113 |                         sorted_entities.append(e)
114 |                         break
115 |                 else:
116 |                     assert False
117 |
118 |         return [Entity.from_row(row, use_zlib=self.use_zlib) for row in sorted_entities]
119 |
120 |     def get(self, *exprs, **kwargs):
121 |         kwargs['limit'] = 1
122 |         rows = self._query(*exprs, **kwargs)
123 |         if len(rows) == 0:
124 |             return None
125 |         elif len(rows) == 1:
126 |             return rows[0]
127 |         else:
128 |             assert False
129 |
130 |     def query(self, *exprs, **kwargs):
131 |         return self._query(*exprs, **kwargs)
132 |
133 |     def all(self):
134 |         # bypass reduce_args/_query, which require a WHERE clause; an
135 |         # unrestricted SELECT over the index table is what we want here
136 |         return self._do_query([], None, None)
--------------------------------------------------------------------------------
/schemaless/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | class ClassLogger(object):
5 |
6 |     def __get__(self, obj, obj_type=None):
7 |         object_class = obj_type or obj.__class__
8 |         return logging.getLogger(object_class.__module__ + '.'
+ object_class.__name__) 9 | 10 | formatter = logging.Formatter('%(asctime)s :: %(name)s (%(levelname)s) :: %(message)s') 11 | logger = logging.getLogger('schemaless') 12 | -------------------------------------------------------------------------------- /schemaless/orm/__init__.py: -------------------------------------------------------------------------------- 1 | from session import Session 2 | from index import Index 3 | from column import * 4 | from document import make_base 5 | import converters 6 | -------------------------------------------------------------------------------- /schemaless/orm/column.py: -------------------------------------------------------------------------------- 1 | import schemaless.orm.converters 2 | 3 | DEFAULT_NONCE = Ellipsis 4 | 5 | class Column(object): 6 | 7 | def __init__(self, name, default=DEFAULT_NONCE, required=False, convert=None): 8 | self.name = name 9 | self.default = default 10 | self.required = required 11 | self.convert = convert 12 | 13 | def to_string(self): 14 | return 'COLUMN' 15 | 16 | def __str__(self): 17 | s = '`%s` %s' % (self.name, self.to_string()) 18 | if self.required: 19 | s += ' NOT NULL' 20 | return s 21 | 22 | class Char(Column): 23 | 24 | def __init__(self, name, length, **kwargs): 25 | super(Char, self).__init__(name, **kwargs) 26 | self.length = length 27 | 28 | def to_string(self): 29 | return 'CHAR(%d)' % (self.length,) 30 | 31 | class Binary(Column): 32 | 33 | def __init__(self, name, length, **kwargs): 34 | super(Binary, self).__init__(name, **kwargs) 35 | self.length = length 36 | 37 | def to_string(self): 38 | return 'BINARY(%d)' % (self.length,) 39 | 40 | class String(Column): 41 | def __init__(self, name, length, **kwargs): 42 | super(String, self).__init__(name, **kwargs) 43 | self.length = length 44 | 45 | def to_string(self): 46 | return 'VARCHAR(%d)' % (self.length,) 47 | 48 | class Text(Column): 49 | 50 | def to_string(self): 51 | return 'TEXT' 52 | 53 | class DateTime(Column): 54 | 55 | def __init__(self, name, **kwargs): 56 | if not kwargs.get('convert'): 57 | kwargs['convert'] = schemaless.orm.converters.DateTimeConverter 58 | super(DateTime, self).__init__(name, **kwargs) 59 | 60 | def to_string(self): 61 | return 'INTEGER UNSIGNED' 62 | 63 | class Guid(Char): 64 | 65 | def __init__(self, name, **kwargs): 66 | super(Guid, self).__init__(name, 32, **kwargs) 67 | UUID = GUID = Guid 68 | 69 | class Bool(Column): 70 | 71 | def __init__(self, name, **kwargs): 72 | if not kwargs.get('convert'): 73 | kwargs['convert'] = schemaless.orm.converters.BooleanConverter 74 | super(Bool, self).__init__(name, **kwargs) 75 | 76 | def to_string(self): 77 | return 'BOOL' 78 | 79 | Bit = Boolean = Bool 80 | -------------------------------------------------------------------------------- /schemaless/orm/converters.py: -------------------------------------------------------------------------------- 1 | import time 2 | import datetime 3 | 4 | class Converter(object): 5 | 6 | @classmethod 7 | def to_db(cls, obj): 8 | raise NotImplementedError 9 | 10 | @classmethod 11 | def from_db(cls, val): 12 | raise NotImplementedError 13 | 14 | class DateTimeConverter(Converter): 15 | 16 | @classmethod 17 | def to_db(cls, obj): 18 | return time.mktime(obj.timetuple()) if obj else None 19 | 20 | @classmethod 21 | def from_db(cls, val): 22 | return datetime.datetime.fromtimestamp(val) if val else None 23 | 24 | class BooleanConverter(Converter): 25 | 26 | @classmethod 27 | def to_db(cls, obj): 28 | return 1 if obj else 0 29 | 30 | @classmethod 
31 |     def from_db(cls, val):
32 |         return bool(val)
--------------------------------------------------------------------------------
/schemaless/orm/document.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | from collections import defaultdict
3 | from index import IndexCollection
4 | from schemaless.index import reduce_args
5 | from schemaless.log import ClassLogger
6 | from schemaless.orm.util import is_type_list
7 | from schemaless.orm.index import Index
8 | from schemaless.orm.column import Column, DEFAULT_NONCE
9 | from schemaless import c
10 |
11 | def _collect_fields(x):
12 |     return set((k, v) for k, v in x.__dict__.iteritems() if k != 'tag' and not k.startswith('_') and not callable(v))
13 |
14 | def make_base(session, meta_base=type, base_cls=object, tags_file=None, tags_db=None):
15 |     """Create a base class for ORM documents.
16 |
17 |     meta_base -- the base class for the metaclass
18 |     base_cls -- the base class for the document class
19 |     tags_file -- the path of a YAML file containing tag declarations
20 |     tags_db -- an explicit mapping (as a dict) of tag declarations
21 |     """
22 |
23 |     # tags that have been registered
24 |     tags = set()
25 |
26 |     tags_db = tags_db or {}
27 |     assert len(set(tags_db.values())) == len(tags_db), 'tag values must be unique'
28 |
29 |     if not tags_db and tags_file is not None:
30 |         yaml_cfg = yaml.load(open(tags_file, 'r').read())
31 |         tags_db.update(yaml_cfg)
32 |
33 |     class metacls(meta_base):
34 |
35 |         def __new__(mcs, name, bases, cls_dict):
36 |
37 |             if 'tag' not in cls_dict and name in tags_db:
38 |                 cls_dict['tag'] = tags_db[name]
39 |
40 |             if 'tag' in cls_dict:
41 |                 if cls_dict['tag'] in tags:
42 |                     raise TypeError('Tag %r has already been defined' % (cls_dict['tag'],))
43 |                 tags.add(cls_dict['tag'])
44 |
45 |             s = set()
46 |             for b in bases:
47 |                 s |= set(getattr(b, '_columns', set()))
48 |             s |= set(cls_dict.get('_columns', set()))
49 |             for x in s:
50 |                 if not isinstance(x, Column):
51 |                     raise TypeError('Got unexpected %r instead of Column' % (x,))
52 |
53 |             cls_dict['_columns'] = s
54 |             cls_dict['_column_map'] = dict((c.name, c) for c in s)
55 |             cls_dict['_column_names'] = frozenset(c.name for c in s)
56 |             cls_dict['_required_columns'] = frozenset(c.name for c in s if c.required)
57 |
58 |             if '_abstract' not in cls_dict:
59 |                 cls_dict.setdefault('_indexes', [])
60 |                 tag_index = Index('entities', ['tag'])
61 |                 tag_index.underlying = session.datastore.tag_index
62 |                 indexes = [tag_index]
63 |                 for idx in cls_dict.get('_indexes', []):
64 |                     if isinstance(idx, Index):
65 |                         indexes.append(idx)
66 |                     elif cls_dict.get('tag') and is_type_list(basestring, idx):
67 |                         cols = [cls_dict['_column_map'][name] for name in idx]
68 |                         indexes.append(Index.automatic(cls_dict['tag'], cols, session.datastore, declare=False))
69 |                     else:
70 |                         raise ValueError("Sorry, I don't know how to make an index for %s from %r" % (name, idx))
71 |                 cls_dict['_indexes'] = indexes
72 |                 cls_dict['_schemaless_index_collection'] = IndexCollection(indexes)
73 |                 for idx in indexes:
74 |                     idx.declare(session.datastore, tag=cls_dict['tag'])
75 |
76 |             cls_dict['_session'] = session
77 |             return meta_base.__new__(mcs, name, bases, cls_dict)
78 |
79 |     class Document(base_cls):
80 |
81 |         __metaclass__ = metacls
82 |
83 |         _abstract = True
84 |         _columns = [Column('tag')]
85 |         _indexes = []
86 |         _id_field = None
87 |
88 |         log = ClassLogger()
89 |
90 |         def __init__(self, from_dict=None, is_dirty=True, **kwargs):
91 |
92 |             if base_cls is not object:
93 |                 super(Document,
self).__init__() 94 | 95 | if from_dict is None: 96 | from_dict = kwargs 97 | 98 | if hasattr(self, 'tag') and 'tag' in from_dict: 99 | if getattr(self, 'tag') != from_dict['tag']: 100 | raise TypeError('Inconsistent tag') 101 | 102 | # FIXME: ought to grab other attributes off the class dict as well 103 | self.__dict__['_schemaless_collected_fields'] = set(['tag']) 104 | self.__dict__['_schemaless_id'] = from_dict.get('id', None) 105 | 106 | for k, v in from_dict.iteritems(): 107 | if k in self._column_names: 108 | self.__dict__[k] = v 109 | self._schemaless_collected_fields.add(k) 110 | 111 | # Add default values 112 | dict_keys = from_dict.keys() 113 | for c in self._columns: 114 | if c.default != DEFAULT_NONCE and c.name not in dict_keys: 115 | if callable(c.default): 116 | v = c.default() 117 | else: 118 | v = c.default 119 | self.__dict__[c.name] = v 120 | self._schemaless_collected_fields.add(c.name) 121 | 122 | self._schemaless_dirty = is_dirty 123 | if self._schemaless_dirty and self._saveable(): 124 | self._session.dirty_documents.add(self) 125 | 126 | def _saveable(self): 127 | return self._schemaless_collected_fields >= self._required_columns 128 | 129 | def __setattr__(self, k, v): 130 | if k in self._column_names: 131 | self._schemaless_collected_fields.add(k) 132 | self._schemaless_dirty = True 133 | if self not in self._session.dirty_documents and self._saveable(): 134 | self._session.dirty_documents.add(self) 135 | super(Document, self).__setattr__(k, v) 136 | 137 | def __delattr__(self, k): 138 | try: 139 | self._schemaless_collected_fields.remove(k) 140 | except KeyError: 141 | pass 142 | super(Document, self).__delattr__(k) 143 | 144 | @property 145 | def is_dirty(self): 146 | return self._schemaless_dirty 147 | 148 | @classmethod 149 | def from_datastore(cls, d): 150 | if d['tag'] != cls.tag: 151 | raise ValueError('Expected item with tag %d, instead got item with tag %d' % (cls.tag, d['tag'])) 152 | missing = cls._required_columns - set(d.keys()) 153 | if missing: 154 | raise ValueError('Missing from %s the following keys: %s' % (d, ', '.join(k for k in sorted(missing)))) 155 | for k, v in d.iteritems(): 156 | c = cls._column_map.get(k) 157 | if c and c.convert: 158 | d[k] = c.convert.from_db(v) 159 | 160 | obj = cls(d, is_dirty=False) 161 | obj.updated = d['updated'] 162 | return obj 163 | 164 | def to_dict(self): 165 | d = {'id': self.id} 166 | for f in self._column_names: 167 | if f in self._required_columns: 168 | val = getattr(self, f) 169 | elif hasattr(self, f): 170 | val = getattr(self, f) 171 | else: 172 | continue 173 | if self._column_map[f].convert: 174 | val = self._column_map[f].convert.to_db(val) 175 | d[f] = val 176 | return d 177 | 178 | @property 179 | def id(self): 180 | return getattr(self, '_schemaless_id', None) 181 | 182 | def save(self, clear_session=True): 183 | if not self._saveable(): 184 | missing = self._required_columns - self._schemaless_collected_fields 185 | raise ValueError('This object is not yet saveable, missing: %s' % (', '.join(str(k) for k in missing),)) 186 | if self._schemaless_dirty: 187 | obj = self._session.datastore.put(self.to_dict(), self.tag) 188 | self.updated = obj['updated'] 189 | self._schemaless_id = obj['id'] 190 | self._schemaless_dirty = False 191 | if clear_session and self in self._session.dirty_documents: 192 | self._session.dirty_documents.remove(self) 193 | return self 194 | 195 | def delete(self, clear_session=True): 196 | if not self._saveable(): 197 | raise ValueError('This object is not yet saveable') 
198 | if not hasattr(self, '_schemaless_id'): 199 | raise ValueError('This object has no entity id (or has not been persisted)') 200 | self._session.datastore.delete(id=self._schemaless_id) 201 | if clear_session and self in self._session.dirty_documents: 202 | self._session.dirty_documents.remove(self) 203 | 204 | @classmethod 205 | def _query(cls, *exprs, **kwargs): 206 | exprs, order_by, limit = reduce_args(*exprs, **kwargs) 207 | columns = set(e.name for e in exprs) 208 | if order_by: 209 | columns.add(order_by.name) 210 | idx = cls._schemaless_index_collection.best_index(columns) 211 | cls._last_index_used = idx 212 | using = idx.field_set & columns 213 | 214 | if not using: 215 | raise ValueError('cannot do this query, no indexes can be used') 216 | 217 | query_exprs = [e for e in exprs if e.name in using] 218 | result = idx.underlying._do_query(query_exprs, order_by, limit) 219 | retained_result = [] 220 | for x in result: 221 | if all(e.check(x) for e in exprs): 222 | retained_result.append(cls.from_datastore(x)) 223 | return retained_result 224 | 225 | @classmethod 226 | def get(cls, *exprs, **kwargs): 227 | kwargs['limit'] = 1 228 | result = cls._query(*exprs, **kwargs) 229 | if len(result) == 0: 230 | return None 231 | elif len(result) == 1: 232 | return result[0] 233 | else: 234 | raise ValueError('Got more than one result') 235 | 236 | @classmethod 237 | def query(cls, *exprs, **kwargs): 238 | return cls._query(*exprs, **kwargs) 239 | 240 | @classmethod 241 | def all(cls): 242 | return cls._query(c.tag == cls.tag) 243 | 244 | @classmethod 245 | def by_id(cls, id): 246 | entity = cls._session.datastore.by_id(id) 247 | if not entity: 248 | return entity 249 | if entity.tag != cls.tag: 250 | raise ValueError('Entity had tag %r, our class has tag %r' % (entity.tag, cls.tag)) 251 | return cls.from_datastore(entity) 252 | 253 | def __eq__(self, other): 254 | return self.__class__ is type(other) and _collect_fields(self) == _collect_fields(other) 255 | 256 | return Document 257 | -------------------------------------------------------------------------------- /schemaless/orm/index.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import schemaless.index 3 | from schemaless.log import ClassLogger 4 | 5 | class Index(object): 6 | 7 | log = ClassLogger() 8 | 9 | def __init__(self, table_name, fields): 10 | self.table_name = table_name 11 | self.fields = fields 12 | self.field_set = frozenset(fields) 13 | self.underlying = None 14 | 15 | @classmethod 16 | def automatic(cls, tag, fields, datastore, declare=True): 17 | """This is an "internal" method for declaratively creating 18 | indexes. Arguments are like this: 19 | 20 | tag -- the tag of the document that this is being created for 21 | fields -- a list of typed Column objects like [Binary('foo', 16), VarChar('email', 255)] 22 | datastore -- a handle to the datastore 23 | 24 | A unique table name will be created using the tag and an md5 of the 25 | field names. The table will be created, if necessary. 
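For example, tag 3 with a single business_id column yields a table
        named index_00003_850f22a7c399fd1483275d62703d49de, as in the sample
        CREATE TABLE shown in the comment below.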
26 |         """
27 |
28 |         field_string = ', '.join('`%s`' % (f.name,) for f in fields)
29 |         field_hash = hashlib.md5(field_string).hexdigest()
30 |         table_name = 'index_%05d_%s' % (tag, field_hash)
31 |
32 |         if not datastore.check_table_exists(table_name):
33 |             cls.log.info('Creating %s' % (table_name,))
34 |             sql = ['CREATE TABLE %s (' % (table_name,)]
35 |             for f in fields:
36 |                 sql.append('  %s,' % (f,))
37 |             sql.append('  `entity_id` BINARY(16) NOT NULL,')
38 |             sql.append('  UNIQUE KEY (`entity_id`),')
39 |             sql.append('  PRIMARY KEY (%s, `entity_id`)' % (field_string,))
40 |             sql.append(') ENGINE=InnoDB')
41 |             sql = '\n'.join(sql)
42 |
43 |             # by this point, sql will contain a query like:
44 |             #
45 |             # CREATE TABLE index_00003_850f22a7c399fd1483275d62703d49de (
46 |             #   `business_id` BINARY(16) NOT NULL,
47 |             #   `entity_id` BINARY(16) NOT NULL,
48 |             #   UNIQUE KEY (`entity_id`),
49 |             #   PRIMARY KEY (`business_id`, `entity_id`)
50 |             # ) ENGINE=InnoDB
51 |             #
52 |             # XXX: no support for unique columns yet
53 |
54 |             # create the table
55 |             datastore.connection.execute(sql)
56 |
57 |         obj = cls(table_name, [f.name for f in fields])
58 |         if declare:
59 |             obj.declare(datastore, tag=tag)
60 |         return obj
61 |
62 |     def declare(self, datastore, tag=None):
63 |         match_on = {}
64 |         if tag is not None:
65 |             match_on = {'tag': tag}
66 |         self.underlying = datastore.define_index(self.table_name, self.fields, match_on=match_on)
67 |         return self.underlying
68 |
69 |     def __str__(self):
70 |         if self.underlying is None:
71 |             return '%s(%r, %s)' % (self.__class__.__name__, self.table_name, self.fields)
72 |         else:
73 |             return '%s(%s)' % (self.__class__.__name__, self.underlying)
74 |     __repr__ = __str__
75 |
76 | class IndexCollection(object):
77 |
78 |     log = ClassLogger()
79 |
80 |     def __init__(self, indexes):
81 |         self.indexes = indexes
82 |         self.answer_cache = {}
83 |
84 |     def best_index(self, fields):
85 |         """Given some collection of fields (e.g. ['user_id', 'first_name',
86 |         'last_name']) try to determine which index in the collection will match
87 |         the most fields.
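For example, given indexes on ('user_id',) and ('first_name',
        'last_name'), a query on first_name, last_name, and city picks the
        two-column name index; restrictions not covered by the chosen index
        are re-checked in Python by the caller (see Document._query).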
88 |         """
89 |         fields = frozenset(fields)
90 |         if fields in self.answer_cache:
91 |             return self.answer_cache[fields]
92 | 
93 |         # find the index that covers as many of the queried columns as
94 |         # possible, breaking ties in favor of the index with the fewest fields
95 |         best = (-1, 0, None)
96 |         for idx in self.indexes:
97 |             common = len(fields & idx.field_set)
98 |             val = (common, -len(idx.field_set), idx)
99 |             if val > best:
100 |                 best = val
101 | 
102 |         best = best[-1]
103 |         self.log.debug('from %s chose %s as best index for %s' % (self.indexes, best, fields))
104 |         self.answer_cache[fields] = best
105 |         return best
106 | 
--------------------------------------------------------------------------------
/schemaless/orm/session.py:
--------------------------------------------------------------------------------
1 | class Session(object):
2 | 
3 |     def __init__(self, datastore):
4 |         self.datastore = datastore
5 |         self.dirty_documents = set()
6 | 
7 |     def save(self):
8 |         for d in self.dirty_documents:
9 |             d.save(clear_session=False)
10 |         self.dirty_documents.clear()
11 | 
--------------------------------------------------------------------------------
/schemaless/orm/util.py:
--------------------------------------------------------------------------------
1 | def is_type_list(t, xs):
2 |     try:
3 |         return all(isinstance(x, t) for x in xs)
4 |     except TypeError:
5 |         return False
6 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | 
3 | setup(name='Schemaless',
4 |       version='0.2.1',
5 |       description='Schema-less MySQL pattern',
6 |       author='Evan Klitzke',
7 |       author_email='evan@eklitzke.org',
8 |       packages=['schemaless', 'schemaless.orm']
9 | )
10 | 
--------------------------------------------------------------------------------
/tests/tables.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS `index_birthdate` (
2 |   `entity_id` binary(16) NOT NULL,
3 |   `birthdate` varchar(64) NOT NULL DEFAULT '',
4 |   PRIMARY KEY (`birthdate`,`entity_id`)
5 | ) ENGINE=InnoDB;
6 | 
7 | CREATE TABLE IF NOT EXISTS `index_foo` (
8 |   `entity_id` binary(16) NOT NULL,
9 |   `bar` int(11) NOT NULL,
10 |   PRIMARY KEY (`bar`,`entity_id`),
11 |   UNIQUE KEY `entity_id` (`entity_id`)
12 | ) ENGINE=InnoDB;
13 | 
14 | CREATE TABLE IF NOT EXISTS `index_user_id` (
15 |   `entity_id` binary(16) NOT NULL,
16 |   `user_id` char(32) NOT NULL,
17 |   PRIMARY KEY (`user_id`,`entity_id`),
18 |   UNIQUE KEY `entity_id` (`entity_id`)
19 | ) ENGINE=InnoDB;
20 | 
21 | CREATE TABLE IF NOT EXISTS `index_user_name` (
22 |   `entity_id` binary(16) NOT NULL,
23 |   `first_name` varchar(255) NOT NULL,
24 |   `last_name` varchar(255) NOT NULL,
25 |   PRIMARY KEY (`first_name`,`last_name`,`entity_id`),
26 |   UNIQUE KEY `entity_id` (`entity_id`)
27 | ) ENGINE=InnoDB;
28 | 
29 | CREATE TABLE IF NOT EXISTS `index_todo_user_id` (
30 |   `entity_id` binary(16) NOT NULL,
31 |   `user_id` char(32) NOT NULL,
32 |   PRIMARY KEY (`user_id`,`entity_id`),
33 |   UNIQUE KEY `entity_id` (`entity_id`)
34 | ) ENGINE=InnoDB;
35 | 
--------------------------------------------------------------------------------
/tests/tests.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | import unittest
4 | 
5 | import schemaless
6 | from schemaless import orm
7 | from schemaless import c
8 | 
9 | class TestBase(unittest.TestCase):
10 |
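    # Note: these tests assume a MySQL server on localhost:3306 with a
    # 'test' database reachable as user 'test' / password 'test', and the
    # index tables from tests/tables.sql already created.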
11 |     def clear_tables(self, datastore):
12 |         tables = set()
13 |         for d in datastore.connection.query('SHOW TABLES'):
14 |             for v in d.itervalues():
15 |                 tables.add(v)
16 |         for tbl in tables:
17 |             datastore.connection.execute('DELETE FROM %s' % (tbl,))
18 | 
19 |     def assert_equal(self, a, b):
20 |         return self.assertEqual(a, b)
21 | 
22 |     def assert_len(self, a, b):
23 |         return self.assertEqual(a, len(b))
24 | 
25 |     def assert_used_index(self, document_cls, name):
26 |         index = getattr(document_cls, '_last_index_used', None)
27 |         if index is None:
28 |             self.fail('Expected to use index %s but no index was used' % (name,))
29 |         if index.table_name != name:
30 |             self.fail('Expected to use index %s but actually used %s' % (name, index.table_name))
31 | 
32 | class SchemalessTestCase(TestBase):
33 | 
34 |     def setUp(self):
35 |         super(SchemalessTestCase, self).setUp()
36 |         self.ds = schemaless.DataStore(mysql_shards=['localhost:3306'], user='test', password='test', database='test')
37 |         self.user = self.ds.define_index('index_user_id', ['user_id'])
38 |         self.user_name = self.ds.define_index('index_user_name', ['first_name', 'last_name'])
39 |         self.foo = self.ds.define_index('index_foo', ['bar'], {'m': 'right'})
40 |         self.clear_tables(self.ds)
41 | 
42 |         self.entity = self.ds.put({'user_id': schemaless.guid(), 'first_name': 'evan', 'last_name': 'klitzke'})
43 | 
44 |     def test_query(self):
45 |         self.assert_len(1, self.user.query(c.user_id == self.entity.user_id))
46 |         self.assert_len(1, self.user_name.query(c.first_name == 'evan', c.last_name == 'klitzke'))
47 | 
48 |         new_entity = self.ds.put({'user_id': schemaless.guid(), 'first_name': 'george'})
49 |         self.assert_len(1, self.user.query(c.user_id == new_entity.user_id))
50 |         self.assert_len(0, self.user_name.query(c.first_name == 'george'))  # didn't have a full index
51 | 
52 |     def test_delete_by_entity(self):
53 |         self.ds.delete(self.entity)
54 |         self.assert_len(0, self.user.query(c.user_id == self.entity.user_id))
55 | 
56 |     def test_delete_by_entity_id(self):
57 |         self.ds.delete(id=self.entity.id)
58 |         self.assert_len(0, self.user.query(c.user_id == self.entity.user_id))
59 | 
60 |     def test_match_on(self):
61 |         entity_one = self.ds.put({'foo_id': schemaless.guid(), 'bar': 1, 'm': 'left'})
62 |         entity_two = self.ds.put({'foo_id': schemaless.guid(), 'bar': 1, 'm': 'right'})  # only this should match
63 | 
64 |         rows = self.foo.query(c.bar == 1)
65 |         self.assert_len(1, rows)
66 |         self.assert_equal(rows[0].foo_id, entity_two.foo_id)
67 | 
68 |     def test_in_queries(self):
69 |         user_ids = [self.entity.user_id]
70 |         user_ids.append(self.ds.put({'user_id': schemaless.guid()}).user_id)
71 | 
72 |         rows = self.user.query(c.user_id.in_(user_ids))
73 |         self.assert_len(2, rows)
74 |         self.assert_equal(set(user_ids), set(row['user_id'] for row in rows))
75 | 
76 | class ORMTestCase(TestBase):
77 |     def setUp(self):
78 |         datastore = schemaless.DataStore(mysql_shards=['localhost:3306'], user='test', password='test', database='test')
79 |         self.clear_tables(datastore)
80 | 
81 |         tags_db = {
82 |             'User': 1,
83 |             'ToDo': 2,
84 |             'Business': 3}
85 | 
86 |         self.session = orm.Session(datastore)
87 |         self.base_class = orm.make_base(self.session, tags_db=tags_db)
88 | 
89 |     @property
90 |     def connection(self):
91 |         return self.session.datastore.connection
92 | 
93 |     def get_index_count(self, index_name):
94 |         row = self.connection.get('SELECT COUNT(*) AS count FROM %s' % (index_name,))
95 |         return row['count']
96 | 
97 | class SchemalessORMTestCase(ORMTestCase):
98 | 
99 |     def setUp(self):
100 |         super(SchemalessORMTestCase, self).setUp()
101 | 
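        # Declarative document definition: _columns lists the document's
        # fields (with optional defaults and converters) and _indexes lists
        # the index tables the ORM can use to query it.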
102 |         class User(self.base_class):
103 |             _columns = [orm.Column('user_id', required=True),
104 |                         orm.Column('first_name', required=True),
105 |                         orm.Column('last_name', required=True),
106 |                         orm.Column('birthdate'),
107 |                         orm.Column('time_created', default=datetime.datetime.now, convert=schemaless.orm.converters.DateTimeConverter)]
108 |             _indexes = [orm.Index('index_user_id', ['user_id']),
109 |                         orm.Index('index_birthdate', ['birthdate']),
110 |                         orm.Index('index_user_name', ['first_name', 'last_name'])]
111 | 
112 |         self.User = User
113 | 
114 |     def test_create_object_save_delete(self):
115 |         # create a new, empty object
116 |         u = self.User()
117 |         assert not u._saveable()
118 |         assert u.is_dirty
119 | 
120 |         # populate some, but not all of the fields; the object should be dirty,
121 |         # but not saveable
122 |         u.user_id = schemaless.guid()
123 |         u.first_name = 'evan'
124 |         assert not u._saveable()
125 |         assert u.is_dirty
126 |         user = self.User.get(c.user_id == u.user_id)
127 |         assert not user
128 | 
129 |         # finish populating the fields, check that the object is saveable
130 |         u.last_name = 'klitzke'
131 |         assert u._saveable()
132 |         assert u.is_dirty
133 | 
134 |         # persist the object, check that it made it to the datastore
135 |         u.save()
136 |         assert u._saveable()
137 |         assert not u.is_dirty
138 |         user = self.User.get(c.user_id == u.user_id)
139 |         assert user
140 | 
141 |         # delete the object, check that it's deleted from the datastore
142 |         u.delete()
143 |         assert u._saveable()
144 |         assert not u.is_dirty
145 |         user = self.User.get(c.user_id == u.user_id)
146 |         assert not user
147 | 
148 |     def test_in_query(self):
149 |         user_ids = []
150 |         users = []
151 |         for x in range(5):
152 |             u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
153 |             user_ids.append(u.user_id)
154 |             users.append(u)
155 |         self.session.save()
156 | 
157 |         fetched_users = self.User.query(c.user_id.in_(user_ids[:3]))
158 |         self.assert_used_index(self.User, 'index_user_id')
159 |         self.assert_equal(set(user_ids[:3]), set(u.user_id for u in fetched_users))
160 | 
161 |     def test_name_query(self):
162 |         u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
163 |         u.save()
164 |         v = self.User.get(c.first_name == 'foo', c.last_name == 'bar')
165 |         self.assert_used_index(self.User, 'index_user_name')
166 |         self.assert_equal(u.user_id, v.user_id)
167 | 
168 |     def test_update(self):
169 |         u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
170 |         u.save()
171 |         v = self.User.get(c.first_name == 'foo', c.last_name == 'bar')
172 |         self.assert_used_index(self.User, 'index_user_name')
173 |         self.assert_equal(u.id, v.id)
174 |         self.assert_equal(u.user_id, v.user_id)
175 | 
176 |         u.first_name = 'baz'
177 |         u.save()
178 |         v = self.User.get(c.first_name == 'foo', c.last_name == 'bar')
179 |         self.assert_used_index(self.User, 'index_user_name')
180 |         self.assert_equal(None, v)
181 |         v = self.User.get(c.first_name == 'baz', c.last_name == 'bar')
182 |         self.assert_used_index(self.User, 'index_user_name')
183 |         self.assert_equal(u.id, v.id)
184 |         self.assert_equal(u.user_id, v.user_id)
185 | 
186 |     def test_double_delete(self):
187 |         u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
188 |         u.save()
189 |         u.delete()
190 |         u.delete()
191 | 
192 |     def test_update_preserves_id(self):
193 |         u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
194 |         u.save()
195 |         orig_id = u.id
196 | 
197 |         u.first_name = 'baz'
198 |         u.save()
199 |         self.assert_equal(orig_id, u.id)
200 | 
201 |     def test_converter(self):
202 |         u = self.User(user_id=schemaless.guid(), first_name='foo', last_name='bar')
203 |         u.save()
204 |         self.assert_(isinstance(u.time_created, datetime.datetime))
205 | 
206 |         v = self.User.get(c.user_id == u.user_id)
207 |         self.assert_(isinstance(v.time_created, datetime.datetime))
208 | 
209 |     def test_index_update(self):
210 |         u = self.User(user_id=schemaless.guid(), first_name='evan', last_name='klitzke')
211 |         u.save()
212 | 
213 |         self.assert_equal(self.get_index_count('index_birthdate'), 0)
214 | 
215 |         u.birthdate = '1986-09-19'
216 |         u.save()
217 |         self.assert_equal(self.get_index_count('index_birthdate'), 1)
218 | 
219 | class ManyToOneORMTestCase(ORMTestCase):
220 | 
221 |     def setUp(self):
222 |         super(ManyToOneORMTestCase, self).setUp()
223 | 
224 |         class ToDo(self.base_class):
225 |             _columns = [orm.Column('user_id'),
226 |                         orm.Column('action'),
227 |                         orm.Column('completion_time', default=None, convert=schemaless.orm.converters.DateTimeConverter)]
228 |             _indexes = [orm.Index('index_todo_user_id', ['user_id'])]
229 |                         #orm.Index('index_todo_user_id_time', ['user_id', 'completion_time'])]
230 | 
231 |         self.ToDo = ToDo
232 | 
233 |     @property
234 |     def connection(self):
235 |         return self.session.datastore.connection
236 | 
237 |     def test_update_multiple(self):
238 |         user_id = schemaless.guid()
239 |         item1 = self.ToDo(user_id=user_id, action='buy groceries').save()
240 |         item2 = self.ToDo(user_id=user_id, action='buy groceries').save()
241 | 
242 |         self.assert_equal(self.get_index_count('index_todo_user_id'), 2)
243 | 
244 |         item1.completion_time = datetime.datetime.now()
245 |         item1.save()
246 | 
247 |         self.assert_equal(self.get_index_count('index_todo_user_id'), 2)
248 | 
249 | class AutomaticORMTestCase(ORMTestCase):
250 |     """Test ORM documents with automatic indexes."""
251 | 
252 |     def setUp(self):
253 |         super(AutomaticORMTestCase, self).setUp()
254 | 
255 |         class Business(self.base_class):
256 |             _columns = [orm.Char('business_id', 32),
257 |                         orm.String('city', 255),
258 |                         orm.Char('state', 2),
259 |                         orm.Bool('active', default=True)]
260 |             _indexes = [('business_id',),
261 |                         ('city', 'state')]
262 | 
263 |         self.Business = Business
264 | 
265 |     def add_biz(self, city='Oakland', state='CA'):
266 |         return self.Business(business_id=schemaless.guid(), city=city, state=state).save()
267 | 
268 |     def test_querying(self):
269 |         b = self.add_biz()
270 |         assert not b.is_dirty
271 | 
272 |         self.assert_equal(b, self.Business.get(c.business_id == b.business_id))
273 |         self.assert_used_index(self.Business, 'index_00003_850f22a7c399fd1483275d62703d49de')
274 | 
275 |         self.assert_equal(b, self.Business.get(c.city == b.city, c.state == b.state))
276 |         self.assert_used_index(self.Business, 'index_00003_8e5e9c3d848aa30749151092bbff622d')
277 | 
278 |     def test_all(self):
279 |         b = self.add_biz()
280 |         assert not b.is_dirty
281 | 
282 |         self.assert_len(1, self.Business.all())
283 |         self.assert_used_index(self.Business, 'entities')
284 | 
285 |     def test_bool(self):
286 |         b = self.add_biz()
287 |         self.assert_equal(b, self.Business.get(c.business_id == b.business_id, c.active == True))
288 |         self.assert_equal(b, self.Business.get(c.business_id == b.business_id, c.active != False))
289 | 
290 | if __name__ == '__main__':
291 |     unittest.main()
292 | 
--------------------------------------------------------------------------------