├── .gitignore ├── .travis.yml ├── CHANGELOG ├── LICENSE ├── README.md ├── bin ├── __init__.py ├── hustle └── insert ├── bootstrap.sh ├── deps ├── .gitignore ├── AUTHORS.txt ├── INSTALL.txt ├── LICENSE.txt ├── Makefile ├── cardunion │ ├── cardunion.pyx │ └── test │ │ └── cardunion_test.py ├── libebset │ ├── LICENSE │ ├── boolarray.h │ ├── ewah.h │ ├── ewahutil.h │ ├── pyebset.pyx │ ├── runninglengthword.h │ └── test │ │ └── pyebset_test.py ├── liblmdb │ ├── COPYRIGHT │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── cmdb.pxd │ ├── db.pyx │ ├── lmdb.h │ ├── mdb.c │ ├── mdb_copy.1 │ ├── mdb_copy.c │ ├── mdb_stat.1 │ ├── mdb_stat.c │ ├── midl.c │ ├── midl.h │ ├── setup.py │ └── test │ │ ├── test_cursor.py │ │ ├── test_db.py │ │ ├── test_greater_failure.py │ │ ├── test_intdb.py │ │ ├── test_intintdb.py │ │ ├── test_intreader.py │ │ ├── test_reader.py │ │ └── test_strintdb.py ├── liblru │ ├── clru.h │ └── pylru.pyx ├── liblz4 │ ├── clz4.c │ ├── lz4.c │ ├── lz4.h │ ├── lz4hc.c │ └── lz4hc.h ├── liblzf │ ├── LICENSE │ ├── clzf.c │ ├── lzf.h │ ├── lzfP.h │ ├── lzf_c.c │ └── lzf_d.c ├── librtrie │ ├── main.c │ ├── pyrtrie.c │ ├── rtrie.c │ └── rtrie.h ├── libwtrie │ ├── test │ │ └── test_wtrie.py │ └── wtrie.pyx ├── maxhash │ ├── maxhash.pyx │ └── test │ │ └── maxhash_test.py ├── scamurmur3 │ ├── murmur3.c │ ├── murmur3.h │ └── scamurmur3.c └── setup.py ├── doc ├── .gitignore ├── Makefile ├── _static │ └── hustle.png ├── _templates │ └── layout.html ├── api │ ├── core.rst │ └── hustle.rst ├── conf.py ├── howto │ ├── cli.rst │ ├── configure.rst │ ├── delete.rst │ ├── insert.rst │ ├── integration_tests.rst │ ├── query.rst │ └── schema.rst ├── hustle.png ├── index.rst └── start │ ├── install.rst │ └── tutorial.rst ├── hustle ├── __init__.py ├── cardinality.py └── core │ ├── __init__.py │ ├── column_fn.py │ ├── marble.py │ ├── pipeline.py │ ├── pipeworker.py │ ├── settings.py │ ├── stat.py │ └── util.py ├── integration_test ├── README ├── fixtures │ ├── imps.json │ ├── ip.json │ └── pixel.json ├── setup.py ├── test_aggregation.py ├── test_bool.py ├── test_cardinality.py ├── test_column_fn.py ├── test_drop.py ├── test_join.py ├── test_project_order.py └── test_simple_query.py ├── requirements-dev.txt ├── requirements.txt ├── settings.yaml ├── setup.py └── test ├── fixtures └── keys ├── test_column.py ├── test_expression.py ├── test_lru_dict.py ├── test_marble.py ├── test_merge_wrapper.py ├── test_pipeline.py ├── test_pipeworker.py ├── test_query_checker.py ├── test_rtrie.py ├── test_stress_wtrie.py ├── test_table.py ├── test_util.py └── test_wtrie.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | *.a 8 | 9 | # Distribution / packaging 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | doc/_build 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | .idea 24 | .ropeproject 25 | 26 | # Installer logs 27 | pip-log.txt 28 | pip-delete-this-directory.txt 29 | 30 | # Vim 31 | *.swp 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | group: deprecated-2017Q4 4 | 5 | language: python 6 | notifications: 7 | email: false 8 | 9 | python: 10 | - "2.7" 11 | 12 | virtualenv: 13 | system_site_packages: true 14 | 15 | before_install: 16
| - pushd . 17 | - sudo apt-get update -qq 18 | - sudo apt-get install -y curl erlang python-dev 19 | - git clone https://github.com/discoproject/disco.git /tmp/disco 20 | - cd /tmp/disco && git checkout develop && sudo make install 21 | - cd lib && pip install . 22 | - popd 23 | 24 | install: 25 | - sudo bash ./bootstrap.sh 26 | - sudo pip install -r requirements-dev.txt 27 | 28 | script: nosetests test/ 29 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 0.2.7 (Nov 7 2014) 2 | Fixed issues in libebset for 32-bit architectures 3 | 4 | 0.2.6 (July 29 2014) 5 | Optimized #48. Making "limit" run faster 6 | Added column functions. Being able to do column transforms, e.g. ip_aton, 7 | ip_ntoa... 8 | Added aggregate function h_combine. Being able to combine multiple values 9 | with similar keys 10 | Added feature #66. Making query results purgeable 11 | Added "server" option for the Hustle CLI 12 | 13 | 0.2.5 (June 27 2014) 14 | Fixed issue #50. BooleanIx didn't handle missing values 15 | Added feature #11. Tag the query result in DDFS 16 | Added feature #49. Auto purge query related data if applicable 17 | Added feature to set the max cores that could be used by a query 18 | Added feature to enable profiling for a query 19 | Fixed the preprocess function of insert to support filtering 20 | Added option for the insert function to leave the marble for other uses 21 | 22 | 0.2.4 (May 29 2014) 23 | Fixed issue #44. BooleanIx kept an invalid txn, crashed the insert 24 | Upgraded the exception handling to let Disco retry the failed task 25 | 26 | 0.2.3 (May 16 2014) 27 | Fixed issue #39. Partition can't handle integer types 28 | Fixed issue in the Future class to be compatible with cat() 29 | Added feature #41. Support partition filters 30 | Optimized IO performance, sped up tight loops 31 | 32 | 0.2.2 (April 30 2014) 33 | Fixed issue #34. Wrong data types for aggregation columns 34 | Added decoder for CSV-like files 35 | Optimized performance for aggregation queries 36 | 37 | 0.2.1 (April 22 2014) 38 | Fixed wrong argument of mget in MarbleStream 39 | Fixed the client of EWAHBoolArray to check the index faster 40 | Fixed the name collision of nested tables 41 | Added PyYAML to the setup.py 42 | Unified 'dump' and 'edump' into the new function 'cat' 43 | 44 | 0.2.0 (April 21 2014) 45 | Added feature #11. Serialization of Hustle tables 46 | Added feature #15. Compression of partition data 47 | Added feature #21. Optimization for in-stage combine 48 | Added feature #22. Boolean data type 49 | Added feature #25. Support for csv_decoder 50 | Added feature #27. Support for HyperLogLog and MinHash aggregation 51 | Added feature #32. Optimization for duplicate adjacent values 52 | Added functionality to get statistical information of Hustle tables 53 | Fixed issue #26. Can't insert data with a huge number of partitions 54 | Known issues to fix: #34 55 | 56 | 0.1.3 (March 19 2014) 57 | Added mget() for the lmdb Python client 58 | Improved the hustle_input_stream by using mget() 59 | Upgraded EWAHBoolArray from upstream 60 | Fixed a bug in libebset. len() is inaccurate on inverted bitsets 61 | 62 | 0.1.2 (March 13 2014) 63 | Added feature #16. Alias column or aggregate functions 64 | Added feature #18. Non-blocking select() 65 | Fixed #17 66 | 67 | 0.1.1 (March 12 2014) 68 | Added feature #8. 'delete' and 'drop' table 69 | Added feature #9.
Query supports 'in' and 'not in' 70 | Simplified 'join' clause #10 71 | Fixed #19 72 | 73 | 0.1.0 (March 6 2014) 74 | Initial release 75 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is the MIT license: http://www.opensource.org/licenses/mit-license.php 2 | 3 | Copyright (c) 2014 Chango Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | software and associated documentation files (the "Software"), to deal in the Software 7 | without restriction, including without limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 9 | to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or 12 | substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | DEALINGS IN THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Hustle](doc/_static/hustle.png) 2 | 3 | A column-oriented, embarrassingly distributed, relational event database. 4 | 5 | Features 6 | -------- 7 | 8 | * column-oriented - super fast queries 9 | * events - write-only semantics 10 | * distributed insert - designed for petabyte-scale distributed datasets with massive write loads 11 | * compressed - bitmap indexes, lz4, and prefix trie compression 12 | * relational - join gigantic data sets 13 | * partitioned - smart shards 14 | * embarrassingly distributed ([based on Disco](http://discoproject.org/)) 15 | * embarrassingly fast ([uses LMDB](http://symas.com/mdb/)) 16 | * NoSQL - Python DSL 17 | * bulk append-only semantics 18 | * highly available, horizontally scalable 19 | * REPL/CLI query interface 20 | 21 | Example Query 22 | ------------- 23 | 24 | ``` 25 | select(impressions.ad_id, impressions.date, h_sum(pix.amount), h_count(), 26 | where=((impressions.date < '2014-01-13') & (impressions.ad_id == 30010), 27 | pix.date < '2014-01-13'), 28 | join=(impressions.site_id, pix.site_id), 29 | order_by=impressions.date) 30 | ``` 31 | 32 | 33 | Installation 34 | ------------ 35 | 36 | After cloning this repo, here are some considerations: 37 | 38 | * you will need Python 2.7 or higher - note that it *probably* won't work on 2.6 (has to do with pickling lambdas...)
39 | * you need to install Disco 0.5 and its dependencies - get that working first 40 | * you need to install Hustle and its 'deps' as follows: 41 | 42 | ``` 43 | cd hustle 44 | sudo ./bootstrap.sh 45 | ``` 46 | 47 | Please refer to the [Installation Guide](http://tspurway.github.io/hustle/start/install.html) for more details 48 | 49 | Documentation 50 | ------------- 51 | 52 | [Hustle User Guide](http://tspurway.github.io/hustle/) 53 | 54 | [Hustle Mailing List](http://groups.google.com/group/hustle-users) 55 | 56 | Credits 57 | ------- 58 | 59 | Special thanks to the following open-source projects: 60 | 61 | * [EWAHBoolArray](https://github.com/lemire/EWAHBoolArray) 62 | * [disco](http://discoproject.org/) 63 | * [liblmdb](http://symas.com/mdb/) 64 | * [lz4](https://code.google.com/p/lz4/) 65 | * [ultrajson](https://github.com/esnme/ultrajson) 66 | 67 | [![Build Status](https://travis-ci.org/tspurway/hustle.svg?branch=master)](https://travis-ci.org/tspurway/hustle) 68 | -------------------------------------------------------------------------------- /bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tspurway/hustle/e62bf1269b446ea6fae23bc5698f845a2f3247c7/bin/__init__.py -------------------------------------------------------------------------------- /bin/insert: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | if __name__ == '__main__': 4 | from argparse import ArgumentParser 5 | from disco.func import disco_input_stream 6 | from disco.comm import open_url 7 | from hustle import Table, insert 8 | from hustle.core.marble import csv_decoder 9 | from functools import partial 10 | 11 | parser = ArgumentParser(prog='insert', description="Hustle bulk load") 12 | 13 | parser.add_argument( 14 | "-s", 15 | "--server", 16 | dest="server", 17 | help="DDFS server destination", 18 | default='disco://localhost' 19 | ) 20 | 21 | parser.add_argument( 22 | "-f", 23 | dest="infile", 24 | help="A file containing a list of all files to be inserted", 25 | ) 26 | 27 | parser.add_argument( 28 | "-m", 29 | "--maxsize", 30 | dest="maxsize", 31 | help="Initial size of the Hustle marble", 32 | default=1024*1024*1024 33 | ) 34 | 35 | parser.add_argument( 36 | "-t", 37 | "--tmpdir", 38 | dest="tmpdir", 39 | help="Temporary directory for Hustle marble creation", 40 | default='/tmp' 41 | ) 42 | 43 | parser.add_argument( 44 | "-p", 45 | dest="processor", 46 | help="A module.function for the Hustle import preprocessor", 47 | ) 48 | 49 | parser.add_argument( 50 | "--disco-chunk", 51 | dest="disco_chunk", 52 | help="Indicates whether the input files are in Disco CHUNK format", 53 | default=False, 54 | action='store_true' 55 | ) 56 | 57 | parser.add_argument( 58 | "--csv-fields", 59 | dest="csv_fields", 60 | help="Assume input files are CSV with the comma-separated list of fields provided", 61 | ) 62 | 63 | parser.add_argument( 64 | "--delimiter", 65 | dest="delimiter", 66 | help="For CSV input, this is the delimiter", 67 | default=',' 68 | ) 69 | 70 | parser.add_argument( 71 | "table", 72 | metavar='TABLE', 73 | type=str, 74 | help="The Hustle table to insert to", 75 | ) 76 | 77 | parser.add_argument( 78 | "files", 79 | metavar='FILES', 80 | type=str, 81 | nargs="+", 82 | help="The files to insert", 83 | ) 84 | 85 | options = parser.parse_args() 86 | 87 | tab = Table.from_tag(options.table, server=options.server) 88 | 89 | if options.infile: 90 | fd = open(options.infile) 91 | files =
[line.strip() for line in fd]  # strip newlines so open_url gets clean paths 92 | else: 93 | files = options.files 94 | 95 | decoder = None 96 | if options.csv_fields: 97 | decoder = partial(csv_decoder, fieldnames=options.csv_fields.split(options.delimiter)) 98 | 99 | preproc = None 100 | if options.processor: 101 | spec = options.processor.split('.') 102 | modname = '.'.join(spec[:-1]) 103 | funcname = spec[-1] 104 | mod = __import__(modname, fromlist=[funcname]) 105 | preproc = getattr(mod, funcname) 106 | 107 | streams = [] 108 | for f in files: 109 | s = open_url(f) 110 | if options.disco_chunk: 111 | s = disco_input_stream(s, None, None) 112 | streams.append(s) 113 | 114 | insert(tab, streams=streams, preprocess=preproc, 115 | maxsize=int(options.maxsize), tmpdir=options.tmpdir, server=options.server, 116 | lru_size=25000) 117 | -------------------------------------------------------------------------------- /bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | PYTHON=python 6 | CONFIG_DIR=/etc/hustle 7 | BIN_DIR=/usr/local/bin 8 | WORK_DIR=${PWD} 9 | FROM=${WORK_DIR}/settings.yaml 10 | DEST=${CONFIG_DIR}/settings.yaml 11 | 12 | pip install -r requirements.txt 13 | cd deps && make install 14 | cd ${WORK_DIR} && pip install . 15 | cp bin/hustle ${BIN_DIR}/hustle 16 | 17 | if [[ ! -d ${CONFIG_DIR} ]]; then 18 | mkdir ${CONFIG_DIR} 19 | elif [[ -f ${DEST} ]]; then 20 | read -p "Settings file already exists, overwrite it? [Yes/No]: " rc; 21 | if [[ "$rc" =~ [Nn][Oo] ]]; then 22 | exit 0 23 | fi 24 | fi 25 | cp ${FROM} ${DEST} 26 | -------------------------------------------------------------------------------- /deps/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | cardunion/cardunion.c 3 | libwtrie/wtrie.c 4 | maxhash/maxhash.c 5 | liblru/pylru.cpp 6 | liblmdb/db.c 7 | liblmdb/mdb_copy 8 | liblmdb/mdb_stat 9 | libebset/pyebset.cpp 10 | *.pyc 11 | *.o 12 | *.a 13 | *.so 14 | *.swp 15 | -------------------------------------------------------------------------------- /deps/AUTHORS.txt: -------------------------------------------------------------------------------- 1 | Tim Spurway 2 | Nan Jiang 3 | Shayan Pooya 4 | -------------------------------------------------------------------------------- /deps/INSTALL.txt: -------------------------------------------------------------------------------- 1 | INSTALL 2 | ======= 3 | 4 | 1. Install C Library 5 | -------------------- 6 | make 7 | sudo make install 8 | 9 | 2. Install Python Library 10 | ------------------------- 11 | sudo python setup.py install 12 | -------------------------------------------------------------------------------- /deps/LICENSE.txt: -------------------------------------------------------------------------------- 1 | This is the MIT license: http://www.opensource.org/licenses/mit-license.php 2 | 3 | Copyright (c) 2013 Chango Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | software and associated documentation files (the "Software"), to deal in the Software 7 | without restriction, including without limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 9 | to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or 12 | substantial portions of the Software.
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | DEALINGS IN THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /deps/Makefile: -------------------------------------------------------------------------------- 1 | # Top-level Makefile; liblmdb is the only thing that needs to be compiled 2 | 3 | RM = rm -rf 4 | PYTHON = python 5 | LMDB = liblmdb 6 | 7 | default: all 8 | 9 | .DEFAULT: 10 | cd $(LMDB) && $(MAKE) $@ 11 | $(PYTHON) setup.py build 12 | 13 | install: 14 | cd $(LMDB) && $(MAKE) $@ 15 | $(PYTHON) setup.py install 16 | 17 | .PHONY: clean 18 | 19 | clean: 20 | cd $(LMDB) && $(MAKE) $@ 21 | $(RM) build 22 | -------------------------------------------------------------------------------- /deps/cardunion/test/cardunion_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from math import sqrt 3 | from cardunion import Cardunion 4 | 5 | 6 | class TestCardunion(unittest.TestCase): 7 | def setUp(self): 8 | self.log2m = 12 9 | self.error = 1.04 / sqrt(2 ** self.log2m) 10 | 11 | def test_mid_range_with_strings(self): 12 | self.execute(10000, self.log2m, self.error) 13 | 14 | def test_long_range_with_strings(self): 15 | self.execute(100000, self.log2m, self.error) 16 | 17 | def test_low_range_with_strings(self): 18 | self.execute(100, self.log2m, self.error) 19 | 20 | def execute(self, set_size, m, p): 21 | hll = Cardunion(m) 22 | for i in range(set_size): 23 | hll.add(str(i)) 24 | 25 | estimate = hll.count() 26 | error = abs(estimate / float(set_size) - 1) 27 | self.assertLess(error, p) 28 | 29 | def test_with_duplicates(self): 30 | hll = Cardunion(self.log2m) 31 | set_size = 100000 32 | for i in range(set_size): 33 | if i % 3: 34 | hll.add(str(i + 1)) 35 | else: 36 | hll.add(str(i)) 37 | 38 | estimate = hll.count() 39 | expected = set_size * 2.0 / 3.0 40 | error = abs(estimate / float(expected) - 1) 41 | self.assertLess(error, self.error) 42 | 43 | def test_with_heavy_duplicates(self): 44 | hll = Cardunion(self.log2m) 45 | set_size = 100000 46 | for i in range(set_size): 47 | if i % 2 or i < set_size / 2: 48 | hll.add(str(1)) 49 | else: 50 | hll.add(str(i)) 51 | 52 | estimate = hll.count() 53 | expected = set_size * 1.0 / 4.0 54 | error = abs(estimate / float(expected) - 1) 55 | self.assertLess(error, self.error) 56 | 57 | def test_dumps(self): 58 | hll = Cardunion(self.log2m) 59 | hll_copy = Cardunion(self.log2m) 60 | for i in range(10000): 61 | hll.add(str(i)) 62 | 63 | hll_copy.loads(hll.dumps()) 64 | self.assertEqual(hll.count(), hll_copy.count()) 65 | 66 | def test_sparse_dumps(self): 67 | hll = Cardunion(self.log2m) 68 | hll_copy = Cardunion(self.log2m) 69 | for i in range(500): 70 | hll.add(str(i)) 71 | 72 | hll_copy.loads(hll.dumps()) 73 | self.assertEqual(hll.count(), hll_copy.count()) 74 | 75 | def test_union(self): 76 | hll = Cardunion(self.log2m) 77 | hll_1 = Cardunion(self.log2m) 78 | for i in range(10000): 79 | hll.add(str(i)) 80 | for i in range(10000, 20000): 81 | hll_1.add(str(i)) 82 | 83 | hll.union([hll_1]) 84 | estimate = hll.count() 85 | error =
abs(estimate / float(20000) - 1) 86 | self.assertLess(error, self.error) 87 | 88 | def test_bunion(self): 89 | hll = Cardunion(self.log2m) 90 | hll_1 = Cardunion(self.log2m) 91 | hll_2 = Cardunion(self.log2m) 92 | for i in range(10000): 93 | hll.add(str(i)) 94 | for i in range(10000, 20000): 95 | hll_1.add(str(i)) 96 | for i in range(20000, 30000): 97 | hll_2.add(str(i)) 98 | 99 | hll.bunion([hll_1.dumps(), hll_2.dumps()]) 100 | estimate = hll.count() 101 | error = abs(estimate / float(30000) - 1) 102 | self.assertLess(error, self.error) 103 | 104 | def test_intersect(self): 105 | """Since there is no theoretical error bound for intersection, 106 | we use the 3-sigma rule instead. 107 | """ 108 | hll = Cardunion() 109 | hll_1 = Cardunion() 110 | for i in range(10000): 111 | hll.add(str(i)) 112 | for i in range(5000, 15000): 113 | hll_1.add(str(i)) 114 | 115 | estimate, error, _ = Cardunion.intersect([hll_1, hll]) 116 | print estimate, error 117 | self.assertTrue(5000 - 3 * error <= estimate <= 5000 + 3 * error) 118 | 119 | def test_intersect_big_small(self): 120 | hll = Cardunion() 121 | hll_1 = Cardunion() 122 | for i in range(50): 123 | hll.add(str(i)) 124 | for i in range(1, 100000): 125 | hll_1.add(str(i)) 126 | 127 | estimate, error, _ = Cardunion.intersect([hll_1, hll]) 128 | print estimate, error 129 | self.assertTrue(50 - 3 * error <= estimate <= 50 + 3 * error) 130 | 131 | def test_intersect_a_few(self): 132 | hll = Cardunion() 133 | hll_1 = Cardunion() 134 | hll_2 = Cardunion() 135 | for i in range(5000): 136 | hll.add(str(i)) 137 | for i in range(1, 100000): 138 | hll_1.add(str(i)) 139 | for i in range(25, 1000): 140 | hll_2.add(str(i)) 141 | 142 | estimate, error, _ = Cardunion.intersect([hll_2, hll_1, hll]) 143 | print estimate, error 144 | self.assertTrue(975 - 3 * error <= estimate <= 975 + 3 * error) 145 | 146 | def test_intersect_a_lot(self): 147 | hlls = [] 148 | actual = 100000 149 | nset = 10 150 | for i in range(nset): 151 | hll = Cardunion() 152 | for j in range(actual): 153 | hll.add(str(i * 5000 + j)) 154 | hlls.append(hll) 155 | 156 | estimate, error, _ = Cardunion.intersect(hlls) 157 | print estimate, error 158 | self.assertTrue(actual - (nset - 1) * 5000 - 3 * error 159 | <= estimate <= actual - (nset - 1) * 5000 + 3 * error) 160 | 161 | def test_nonzero_counters(self): 162 | h = Cardunion() 163 | h.update_counter(1, 2) 164 | h.update_counter(3, 4) 165 | h.update_counter(5, 8) 166 | self.assertEqual(list(h.nonzero_counters), [(1, 2), (3, 4), (5, 8)]) 167 | -------------------------------------------------------------------------------- /deps/libebset/pyebset.pyx: -------------------------------------------------------------------------------- 1 | from libc.stdint cimport uint64_t, uint32_t 2 | from cython.operator cimport dereference as deref, preincrement as inc 3 | from libcpp.vector cimport vector 4 | from libcpp.string cimport string 5 | 6 | IF UNAME_MACHINE != "x86_64": 7 | ctypedef uint32_t uint_t 8 | ELSE: 9 | ctypedef uint64_t uint_t 10 | 11 | cdef extern from "<ostream>" namespace "std": 12 | cdef cppclass ostream[T]: 13 | pass 14 | 15 | cdef extern from "<sstream>" namespace "std": 16 | cdef cppclass stringstream: 17 | stringstream() except + 18 | string str() 19 | ostream write(char *, size_t) 20 | 21 | cdef extern from "<algorithm>" namespace "std": 22 | cdef bint binary_search(vector[size_t].iterator, 23 | vector[size_t].iterator, 24 | uint_t&) 25 | 26 | cdef extern from "ewah.h": 27 | cdef cppclass EWAHBoolArray[T]: 28 | EWAHBoolArray() nogil except + 29 | bint set(size_t i)
nogil 30 | bint get(size_t i) nogil 31 | void logicaland(EWAHBoolArray&, EWAHBoolArray&) nogil 32 | void logicalor(EWAHBoolArray&, EWAHBoolArray&) nogil 33 | void logicalnot(EWAHBoolArray&) nogil 34 | size_t sizeInBytes() nogil 35 | void write(stringstream &, bint) nogil 36 | void read(stringstream &, bint) nogil 37 | vector[size_t] toArray() nogil 38 | size_t numberOfOnes() nogil 39 | bint operator==(EWAHBoolArray&) nogil 40 | bint operator!=(EWAHBoolArray&) nogil 41 | size_t sizeInBits() nogil 42 | void reset() nogil 43 | void inplace_logicalnot() nogil 44 | 45 | 46 | cdef class BitSet: 47 | cdef EWAHBoolArray[uint_t] *thisptr 48 | cdef vector[size_t] indexes 49 | cdef bint updated 50 | 51 | def __cinit__(self): 52 | self.thisptr = new EWAHBoolArray[uint_t]() 53 | self.updated = True 54 | 55 | def __dealloc__(self): 56 | del self.thisptr 57 | 58 | def __setitem__(self, key, value): 59 | if value: 60 | self.set(key) 61 | 62 | def __getitem__(self, key): 63 | return self.thisptr.get(key) 64 | 65 | def set(self, size_t i): 66 | if self.thisptr.set(i): 67 | self.updated = True 68 | return True 69 | else: 70 | return False 71 | 72 | def get(self, size_t i): 73 | return self.thisptr.get(i) 74 | 75 | def dumps(self): 76 | cdef stringstream s 77 | 78 | self.thisptr.write(s, True) 79 | return s.str() 80 | 81 | def loads(self, s): 82 | cdef stringstream ss 83 | 84 | ss.write(s, len(s)) 85 | self.thisptr.read(ss, True) 86 | self.updated = True 87 | return 88 | 89 | def size_in_bytes(self): 90 | return self.thisptr.sizeInBytes() 91 | 92 | def size_in_bits(self): 93 | return self.thisptr.sizeInBits() 94 | 95 | def reset(self): 96 | self.thisptr.reset() 97 | 98 | cpdef BitSet land(self, BitSet other): 99 | cdef BitSet s = BitSet() 100 | 101 | self.thisptr.logicaland(deref(other.thisptr), deref(s.thisptr)) 102 | return s 103 | 104 | def __and__(self, other): 105 | return self.land(other) 106 | 107 | cpdef BitSet lor(self, BitSet other): 108 | cdef BitSet s = BitSet() 109 | 110 | self.thisptr.logicalor(deref(other.thisptr), deref(s.thisptr)) 111 | return s 112 | 113 | def __or__(self, other): 114 | return self.lor(other) 115 | 116 | cpdef BitSet lnot(self): 117 | cdef BitSet s = BitSet() 118 | 119 | self.thisptr.logicalnot(deref(s.thisptr)) 120 | return s 121 | 122 | def lnot_inplace(self): 123 | self.thisptr.inplace_logicalnot() 124 | self.updated = True 125 | return self 126 | 127 | def __richcmp__(BitSet l, BitSet r, int op): 128 | cdef bint e 129 | 130 | if op == 2: 131 | e = (deref(l.thisptr) == deref(r.thisptr)) 132 | elif op == 3: 133 | e = (deref(l.thisptr) != deref(r.thisptr)) 134 | else: 135 | raise AttributeError("Unsupported operators.") 136 | return e 137 | 138 | def __invert__(self): 139 | return self.lnot() 140 | 141 | def __iter__(self): 142 | cdef vector[size_t] v = self.thisptr.toArray() 143 | cdef size_t i 144 | 145 | IF UNAME_SYSNAME == "Linux": 146 | cdef vector[uint_t].iterator it = v.begin() 147 | while it != v.end(): 148 | i = deref(it) 149 | yield i 150 | inc(it) 151 | ELSE: 152 | # the clang compiler on Mac OS X reports an error for the code above 153 | return (i for i in v) 154 | 155 | def __len__(self): 156 | return self.thisptr.numberOfOnes() 157 | 158 | def __str__(self): 159 | return self.dumps() 160 | 161 | def __contains__(self, size_t v): 162 | if self.updated or (self.indexes.size() == 0): 163 | self.indexes = self.thisptr.toArray() 164 | self.updated = False 165 | 166 | return binary_search(self.indexes.begin(), self.indexes.end(), v) 167 |
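# ---------------------------------------------------------------------------
# Hedged usage sketch (an illustrative addition, not part of the original
# module): exercising the BitSet API defined above, mirroring the semantics
# asserted in pyebset_test.py. The __main__ guard never fires on import.
if __name__ == "__main__":
    a = BitSet()
    a.set(0); a.set(4); a.set(8)     # set() returns False for an already-set bit
    b = BitSet()
    b.set(4); b.set(9)

    print list(a & b)                # logical AND -> [4]
    print list(a | b)                # logical OR  -> [0, 4, 8, 9]
    print list(~a)                   # logical NOT, bounded by the highest set bit
    print len(a), 4 in a, 5 in a     # numberOfOnes() and binary-search membership

    s = a.dumps()                    # serialize to a byte string ...
    c = BitSet()
    c.loads(s)                       # ... and restore an equal bitset
    assert c == a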
-------------------------------------------------------------------------------- /deps/libebset/test/pyebset_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pyebset import BitSet 3 | 4 | 5 | class BitSetTest(unittest.TestCase): 6 | """Tests for the BitSet wrapper.""" 7 | 8 | def test_set(self): 9 | b = BitSet() 10 | self.assertTrue(b.set(0)) 11 | self.assertTrue(b.set(1)) 12 | self.assertTrue(b.set(2)) 13 | self.assertTrue(b.set(3)) 14 | self.assertFalse(b.set(1)) 15 | 16 | def test_dumps_loads(self): 17 | b = BitSet() 18 | self.assertTrue(b.set(0)) 19 | self.assertTrue(b.set(1)) 20 | self.assertTrue(b.set(4)) 21 | self.assertTrue(b.set(8)) 22 | self.assertTrue(b.set(16)) 23 | s = BitSet() 24 | s.loads(b.dumps()) 25 | self.assertEqual(b, s) 26 | 27 | def test_logical_ops(self): 28 | b = BitSet() 29 | b.set(0) 30 | b.set(1) 31 | b.set(4) 32 | b.set(8) 33 | b.set(16) 34 | bb = BitSet() 35 | bb.set(0) 36 | bb.set(1) 37 | bb.set(4) 38 | bb.set(9) 39 | cc = BitSet() 40 | cc.set(0) 41 | cc.set(1) 42 | cc.set(4) 43 | cc.set(8) 44 | cc.set(9) 45 | cc.set(16) 46 | dd = BitSet() 47 | dd.set(0) 48 | dd.set(1) 49 | dd.set(4) 50 | ee = BitSet() 51 | ee.set(2) 52 | ee.set(3) 53 | 54 | la = b & bb 55 | lo = b | bb 56 | ln = ~ dd 57 | ll = ~ ln 58 | self.assertEqual(lo, cc) 59 | self.assertNotEqual(la, dd) 60 | self.assertEqual(list(ln), list(ee)) 61 | self.assertEqual(len(b), 5) 62 | self.assertEqual(len(bb), 4) 63 | self.assertEqual(len(cc), 6) 64 | self.assertEqual(len(dd), 3) 65 | self.assertEqual(len(ee), 2) 66 | self.assertEqual(len(la), 3) 67 | self.assertEqual(len(lo), 6) 68 | self.assertEqual(len(ln), 2) 69 | self.assertEqual(len(ll), 3) 70 | 71 | def test_logical_not(self): 72 | b = BitSet() 73 | b.set(0) 74 | b.set(1) 75 | b.set(8) 76 | b.set(9) 77 | c = ~b 78 | # test that logical not doesn't generate any numbers greater 79 | # than 9 in this case 80 | self.assertEqual(list(c), [2, 3, 4, 5, 6, 7]) 81 | d = ~c 82 | self.assertListEqual(list(d), [0, 1, 8, 9]) 83 | 84 | def test_logical_not_1(self): 85 | b = BitSet() 86 | b.set(0) 87 | b.set(1) 88 | b.set(7) 89 | b.set(8) 90 | c = ~b 91 | # test that logical not doesn't generate any numbers greater 92 | # than 8 in this case 93 | self.assertEqual(list(c), [2, 3, 4, 5, 6]) 94 | d = ~c 95 | self.assertListEqual(list(d), [0, 1, 7, 8]) 96 | 97 | def test_generator(self): 98 | b = BitSet() 99 | b.set(1) 100 | b.set(4) 101 | b.set(10) 102 | b.set(100000) 103 | b.set(12323131) 104 | self.assertEqual(list(b), [1, 4, 10, 100000, 12323131]) 105 | 106 | def test_contains(self): 107 | b = BitSet() 108 | b.set(1) 109 | b.set(4) 110 | b.set(10) 111 | b.set(100000) 112 | b.set(12323131) 113 | for i in [1, 4, 10, 100000, 12323131]: 114 | self.assertTrue(i in b) 115 | for i in [2, 3, 5, 6, 1232312]: 116 | self.assertTrue(i not in b) 117 | 118 | def test_eq_ne(self): 119 | b = BitSet() 120 | b.set(1) 121 | b.set(2) 122 | bb = BitSet() 123 | bb.set(1) 124 | bb.set(2) 125 | cc = BitSet() 126 | cc.set(2) 127 | cc.set(3) 128 | self.assertTrue(b == bb) 129 | self.assertTrue(bb != cc) 130 | -------------------------------------------------------------------------------- /deps/liblmdb/COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyright 2011-2013 Howard Chu, Symas Corp. 2 | All rights reserved.
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted only as authorized by the OpenLDAP 6 | Public License. 7 | 8 | A copy of this license is available in the file LICENSE in the 9 | top-level directory of the distribution or, alternatively, at 10 | <http://www.OpenLDAP.org/license.html>. 11 | 12 | OpenLDAP is a registered trademark of the OpenLDAP Foundation. 13 | 14 | Individual files and/or contributed packages may be copyright by 15 | other parties and/or subject to additional restrictions. 16 | 17 | This work also contains materials derived from public sources. 18 | 19 | Additional information about OpenLDAP can be obtained at 20 | <http://www.OpenLDAP.org/>. 21 | -------------------------------------------------------------------------------- /deps/liblmdb/LICENSE: -------------------------------------------------------------------------------- 1 | The OpenLDAP Public License 2 | Version 2.8, 17 August 2003 3 | 4 | Redistribution and use of this software and associated documentation 5 | ("Software"), with or without modification, are permitted provided 6 | that the following conditions are met: 7 | 8 | 1. Redistributions in source form must retain copyright statements 9 | and notices, 10 | 11 | 2. Redistributions in binary form must reproduce applicable copyright 12 | statements and notices, this list of conditions, and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution, and 15 | 16 | 3. Redistributions must contain a verbatim copy of this document. 17 | 18 | The OpenLDAP Foundation may revise this license from time to time. 19 | Each revision is distinguished by a version number. You may use 20 | this Software under terms of this license revision or under the 21 | terms of any subsequent revision of the license. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE OPENLDAP FOUNDATION AND ITS 24 | CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, 25 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 26 | AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 27 | SHALL THE OPENLDAP FOUNDATION, ITS CONTRIBUTORS, OR THE AUTHOR(S) 28 | OR OWNER(S) OF THE SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT, 29 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 30 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 31 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 32 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 34 | ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 | POSSIBILITY OF SUCH DAMAGE. 36 | 37 | The names of the authors and copyright holders must not be used in 38 | advertising or otherwise to promote the sale, use or other dealing 39 | in this Software without specific, written prior permission. Title 40 | to copyright in this Software shall at all times remain with copyright 41 | holders. 42 | 43 | OpenLDAP is a registered trademark of the OpenLDAP Foundation. 44 | 45 | Copyright 1999-2003 The OpenLDAP Foundation, Redwood City, 46 | California, USA. All Rights Reserved. Permission to copy and 47 | distribute verbatim copies of this document is granted. 48 | -------------------------------------------------------------------------------- /deps/liblmdb/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for liblmdb (Lightning memory-mapped database library).
2 | 3 | ######################################################################## 4 | # Configuration. The compiler options must enable threaded compilation. 5 | # 6 | # Preprocessor macros (for CPPFLAGS) of interest... 7 | # Note that the defaults should already be correct for most 8 | # platforms; you should not need to change any of these. 9 | # Read their descriptions in mdb.c if you do: 10 | # 11 | # - MDB_USE_POSIX_SEM 12 | # - MDB_DSYNC 13 | # - MDB_FDATASYNC 14 | # - MDB_USE_PWRITEV 15 | # 16 | # There may be other macros in mdb.c of interest. You should 17 | # read mdb.c before changing any of them. 18 | # 19 | CC = gcc 20 | W = -W -Wall -Wno-unused-parameter -Wbad-function-cast 21 | THREADS = -pthread 22 | OPT = -O2 -g 23 | CFLAGS = $(THREADS) $(OPT) $(W) $(XCFLAGS) 24 | LDLIBS = 25 | SOLIBS = 26 | prefix = /usr/local 27 | 28 | ######################################################################## 29 | 30 | IHDRS = lmdb.h 31 | ILIBS = liblmdb.a liblmdb.so 32 | IPROGS = mdb_stat mdb_copy 33 | IDOCS = mdb_stat.1 mdb_copy.1 34 | PROGS = $(IPROGS) 35 | all: $(ILIBS) $(PROGS) 36 | 37 | install: $(ILIBS) $(IPROGS) $(IHDRS) 38 | for f in $(IPROGS); do cp $$f $(DESTDIR)$(prefix)/bin; done 39 | for f in $(ILIBS); do cp $$f $(DESTDIR)$(prefix)/lib; done 40 | for f in $(IHDRS); do cp $$f $(DESTDIR)$(prefix)/include; done 41 | 42 | clean: 43 | rm -rf $(PROGS) *.[ao] *.so *~ testdb 44 | 45 | test: all 46 | mkdir testdb 47 | ./mtest && ./mdb_stat testdb 48 | 49 | liblmdb.a: mdb.o midl.o 50 | ar rs $@ mdb.o midl.o 51 | 52 | liblmdb.so: mdb.o midl.o 53 | # $(CC) $(LDFLAGS) -pthread -shared -Wl,-Bsymbolic -o $@ mdb.o midl.o $(SOLIBS) 54 | $(CC) $(LDFLAGS) -pthread -shared -o $@ mdb.o midl.o $(SOLIBS) 55 | 56 | mdb_stat: mdb_stat.o liblmdb.a 57 | mdb_copy: mdb_copy.o liblmdb.a 58 | 59 | mdb.o: mdb.c lmdb.h midl.h 60 | $(CC) $(CFLAGS) -fPIC $(CPPFLAGS) -c mdb.c 61 | 62 | midl.o: midl.c midl.h 63 | $(CC) $(CFLAGS) -fPIC $(CPPFLAGS) -c midl.c 64 | 65 | %: %.o 66 | $(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ 67 | 68 | %.o: %.c lmdb.h 69 | $(CC) $(CFLAGS) $(CPPFLAGS) -c $< 70 | -------------------------------------------------------------------------------- /deps/liblmdb/README.md: -------------------------------------------------------------------------------- 1 | Requires: 2 | ======= 3 | - liblmdb 4 | - Python 2.7 (that is all I have tested with) 5 | - Cython 6 | 7 | Install: 8 | ======= 9 | ``` 10 | $ sudo python setup.py install 11 | ``` 12 | 13 | Usage 14 | ===== 15 | 16 | Using Writer and Reader 17 | ----------------------- 18 | 19 | >>> import mdb 20 | >>> writer = mdb.Writer('/tmp/mdbtest') 21 | >>> writer.put('foo', 'bar') 22 | >>> writer.mput({"key": "value", "egg": "spam"}) 23 | >>> writer.close() 24 | >>> reader = mdb.Reader('/tmp/mdbtest') 25 | >>> reader.get('foo') 26 | >>> for key, value in reader.iteritems(): 27 | ... print key, value 28 | >>> reader.close() 29 | 30 | Using Integer Key 31 | ----------------- 32 | >>> writer = mdb.Writer('/tmp/mdbtest', dup=True, int_key=True) 33 | >>> writer.put(1, 'foo') 34 | >>> writer.put(1, 'bar') # append a duplicate key 35 | >>> writer.close() 36 | >>> reader = mdb.DupReader('/tmp/mdbtest', int_key=True) 37 | >>> for v in reader.get(1): 38 | ...
print v 39 | >>> reader.close() 40 | 41 | Using Low-level Stuff 42 | --------------------- 43 | >>> env = mdb.Env('/tmp/mdbtest') 44 | >>> txn = env.begin_txn() 45 | >>> db = env.open_db(txn) 46 | >>> db.put(txn, 'hi', 'assinine') 47 | >>> txn.commit() 48 | >>> txn = env.begin_txn() 49 | >>> print '"%s"' % db.get(txn, 'hi') # --> assinine 50 | >>> txn.close() 51 | >>> db.close() 52 | >>> env.close() 53 | -------------------------------------------------------------------------------- /deps/liblmdb/cmdb.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from 'lmdb.h': 2 | cdef enum: 3 | # env creation flags 4 | MDB_FIXEDMAP = 0x01 5 | MDB_NOSUBDIR = 0x4000 6 | MDB_NOSYNC = 0x10000 7 | MDB_RDONLY = 0x20000 8 | MDB_NOMETASYNC = 0x40000 9 | MDB_WRITEMAP = 0x80000 10 | MDB_MAPASYNC = 0x100000 11 | MDB_NOTLS = 0x200000 12 | MDB_NOLOCK = 0x400000 13 | MDB_NORDAHEAD = 0x800000 14 | 15 | # db open flags 16 | MDB_REVERSEKEY = 0x02 17 | MDB_DUPSORT = 0x04 18 | MDB_INTEGERKEY = 0x08 19 | MDB_DUPFIXED = 0x10 20 | MDB_INTEGERDUP = 0x20 21 | MDB_REVERSEDUP = 0x40 22 | MDB_CREATE = 0x40000 23 | 24 | # write flags 25 | MDB_NOOVERWRITE = 0x10 26 | MDB_NODUPDATA = 0x20 27 | MDB_CURRENT = 0x40 28 | MDB_RESERVE = 0x10000 29 | MDB_APPEND = 0x20000 30 | MDB_APPENDDUP = 0x40000 31 | MDB_MULTIPLE = 0x80000 32 | 33 | # MDB Return Values 34 | MDB_SUCCESS = 0 35 | MDB_KEYEXIST = -30799 36 | MDB_NOTFOUND = -30798 37 | MDB_PAGE_NOTFOUND = -30797 38 | MDB_CORRUPTED = -30796 39 | MDB_PANIC = -30795 40 | MDB_VERSION_MISMATCH = -30794 41 | MDB_INVALID = -30793 42 | MDB_MAP_FULL = -30792 43 | MDB_DBS_FULL = -30791 44 | MDB_READERS_FULL = -30790 45 | MDB_TLS_FULL = -30789 46 | MDB_TXN_FULL = -30788 47 | MDB_CURSOR_FULL = -30787 48 | MDB_PAGE_FULL = -30786 49 | MDB_MAP_RESIZED = -30785 50 | MDB_INCOMPATIBLE = -30784 51 | MDB_BAD_RSLOT = -30783 52 | MDB_BAD_TXN = -30782 53 | MDB_BAD_VALSIZE = -30781 54 | MDB_LAST_ERRCODE = MDB_BAD_VALSIZE 55 | 56 | cdef enum Cursor_Op: 57 | # cursor operations 58 | MDB_FIRST = 0 59 | MDB_FIRST_DUP = 1 60 | MDB_GET_BOTH = 2 61 | MDB_GET_BOTH_RANGE = 3 62 | MDB_GET_CURRENT = 4 63 | MDB_GET_MULTIPLE = 5 64 | MDB_LAST = 6 65 | MDB_LAST_DUP = 7 66 | MDB_NEXT = 8 67 | MDB_NEXT_DUP = 9 68 | MDB_NEXT_MULTIPLE = 10 69 | MDB_NEXT_NODUP = 11 70 | MDB_PREV = 12 71 | MDB_PREV_DUP = 13 72 | MDB_PREV_NODUP = 14 73 | MDB_SET = 15 74 | MDB_SET_KEY = 16 75 | MDB_SET_RANGE = 17 76 | 77 | ctypedef struct MDB_txn: 78 | pass 79 | 80 | ctypedef struct MDB_env: 81 | pass 82 | 83 | ctypedef struct MDB_cursor: 84 | pass 85 | 86 | ctypedef unsigned int MDB_dbi 87 | 88 | ctypedef struct MDB_val: 89 | size_t mv_size 90 | void *mv_data 91 | 92 | ctypedef struct MDB_stat: 93 | unsigned int ms_psize 94 | unsigned int ms_depth 95 | size_t ms_branch_pages 96 | size_t ms_leaf_pages 97 | size_t ms_overflow_pages 98 | size_t ms_entries 99 | 100 | ctypedef struct MDB_envinfo: 101 | void *me_mapaddr 102 | size_t me_mapsize 103 | size_t me_last_pgno 104 | size_t me_last_txnid 105 | unsigned int me_maxreaders 106 | unsigned int me_numreaders 107 | 108 | ctypedef int (*MDB_cmp_func)(const MDB_val *a, const MDB_val *b) 109 | 110 | char *mdb_strerror(int err) 111 | int mdb_env_create(MDB_env **env) 112 | int mdb_env_open(MDB_env *env, char *path, unsigned int flags, unsigned int mode) 113 | int mdb_env_copy(MDB_env *env, char *path) 114 | int mdb_env_stat(MDB_env *env, MDB_stat *stat) 115 | int mdb_env_info(MDB_env *env, MDB_envinfo *stat) 116 | int mdb_env_sync(MDB_env *env, int force) 
117 | int mdb_env_set_flags(MDB_env *env, unsigned int flags, int onoff) 118 | int mdb_env_get_flags(MDB_env *env, unsigned int *flags) 119 | int mdb_env_get_path(MDB_env *env, char **path) 120 | int mdb_env_set_mapsize(MDB_env *env, size_t size) 121 | int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) 122 | int mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) 123 | void mdb_env_close(MDB_env *env) 124 | 125 | int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **txn) 126 | int mdb_txn_commit(MDB_txn *txn) 127 | void mdb_txn_abort(MDB_txn *txn) 128 | void mdb_txn_reset(MDB_txn *txn) 129 | int mdb_txn_renew(MDB_txn *txn) 130 | 131 | int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func cmp) 132 | int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func cmp) 133 | int mdb_dbi_open(MDB_txn *txn, char *name, unsigned int flags, MDB_dbi *dbi) 134 | void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) 135 | int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat) 136 | int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int delete) 137 | int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) 138 | int mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned int flags) 139 | int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) 140 | 141 | int mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor) 142 | void mdb_cursor_close(MDB_cursor *cursor) 143 | int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *cursor) 144 | int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, unsigned int op) 145 | int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, unsigned int flags) 146 | int mdb_cursor_del(MDB_cursor *cursor, unsigned int flags) 147 | int mdb_cursor_count(MDB_cursor *cursor, size_t *countp) 148 | -------------------------------------------------------------------------------- /deps/liblmdb/mdb_copy.1: -------------------------------------------------------------------------------- 1 | .TH MDB_COPY 1 "2012/12/12" "LMDB 0.9.5" 2 | .\" Copyright 2012 Howard Chu, Symas Corp. All Rights Reserved. 3 | .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 4 | .SH NAME 5 | mdb_copy \- LMDB environment copy tool 6 | .SH SYNOPSIS 7 | .B mdb_copy 8 | .I srcpath\ [dstpath] 9 | .SH DESCRIPTION 10 | The 11 | .B mdb_copy 12 | utility copies an LMDB environment. The environment can 13 | be copied regardless of whether it is currently in use. 14 | No lockfile is created, since it gets recreated at need. 15 | 16 | If 17 | .I dstpath 18 | is specified it must be the path of an empty directory 19 | for storing the backup. Otherwise, the backup will be 20 | written to stdout. 21 | 22 | .SH DIAGNOSTICS 23 | Exit status is zero if no errors occur. 24 | Errors result in a non-zero exit status and 25 | a diagnostic message being written to standard error. 26 | .SH CAVEATS 27 | This utility can trigger significant file size growth if run 28 | in parallel with write transactions, because pages which they 29 | free during copying cannot be reused until the copy is done. 30 | .SH "SEE ALSO" 31 | .BR mdb_stat (1) 32 | .SH AUTHOR 33 | Howard Chu of Symas Corporation 34 | -------------------------------------------------------------------------------- /deps/liblmdb/mdb_copy.c: -------------------------------------------------------------------------------- 1 | /* mdb_copy.c - memory-mapped database backup tool */ 2 | /* 3 | * Copyright 2012 Howard Chu, Symas Corp. 4 | * All rights reserved. 
5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted only as authorized by the OpenLDAP 8 | * Public License. 9 | * 10 | * A copy of this license is available in the file LICENSE in the 11 | * top-level directory of the distribution or, alternatively, at 12 | * <http://www.OpenLDAP.org/license.html>. 13 | */ 14 | #ifdef _WIN32 15 | #include <windows.h> 16 | #define MDB_STDOUT GetStdHandle(STD_OUTPUT_HANDLE) 17 | #else 18 | #define MDB_STDOUT 1 19 | #endif 20 | #include <stdio.h> 21 | #include <stdlib.h> 22 | #include <signal.h> 23 | #include "lmdb.h" 24 | 25 | static void 26 | sighandle(int sig) 27 | { 28 | } 29 | 30 | int main(int argc,char * argv[]) 31 | { 32 | int rc; 33 | MDB_env *env; 34 | char *envname = argv[1]; 35 | 36 | if (argc<2 || argc>3) { 37 | fprintf(stderr, "usage: %s srcpath [dstpath]\n", argv[0]); 38 | exit(EXIT_FAILURE); 39 | } 40 | 41 | #ifdef SIGPIPE 42 | signal(SIGPIPE, sighandle); 43 | #endif 44 | #ifdef SIGHUP 45 | signal(SIGHUP, sighandle); 46 | #endif 47 | signal(SIGINT, sighandle); 48 | signal(SIGTERM, sighandle); 49 | 50 | rc = mdb_env_create(&env); 51 | 52 | rc = mdb_env_open(env, envname, MDB_RDONLY, 0); 53 | if (rc) { 54 | printf("mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); 55 | } else { 56 | if (argc == 2) 57 | rc = mdb_env_copyfd(env, MDB_STDOUT); 58 | else 59 | rc = mdb_env_copy(env, argv[2]); 60 | if (rc) 61 | printf("mdb_env_copy failed, error %d %s\n", rc, mdb_strerror(rc)); 62 | } 63 | mdb_env_close(env); 64 | 65 | return rc ? EXIT_FAILURE : EXIT_SUCCESS; 66 | } 67 | -------------------------------------------------------------------------------- /deps/liblmdb/mdb_stat.1: -------------------------------------------------------------------------------- 1 | .TH MDB_STAT 1 "2012/12/12" "LMDB 0.9.5" 2 | .\" Copyright 2012 Howard Chu, Symas Corp. All Rights Reserved. 3 | .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 4 | .SH NAME 5 | mdb_stat \- LMDB environment status tool 6 | .SH SYNOPSIS 7 | .B mdb_stat 8 | .BR \ envpath 9 | [\c 10 | .BR \-e ] 11 | [\c 12 | .BR \-f [ f [ f ]]] 13 | [\c 14 | .BR \-n ] 15 | [\c 16 | .BR \-r [ r ]] 17 | [\c 18 | .BR \-a \ | 19 | .BI \-s \ subdb\fR] 20 | .SH DESCRIPTION 21 | The 22 | .B mdb_stat 23 | utility displays the status of an LMDB environment. 24 | .SH OPTIONS 25 | .TP 26 | .BR \-e 27 | Display information about the database environment. 28 | .TP 29 | .BR \-f 30 | Display information about the environment freelist. 31 | If \fB\-ff\fP is given, summarize each freelist entry. 32 | If \fB\-fff\fP is given, display the full list of page IDs in the freelist. 33 | .TP 34 | .BR \-n 35 | Display the status of an LMDB database which does not use subdirectories. 36 | .TP 37 | .BR \-r 38 | Display information about the environment reader table. 39 | Shows the process ID, thread ID, and transaction ID for each active 40 | reader slot. The process ID and transaction ID are in decimal, the 41 | thread ID is in hexadecimal. The transaction ID is displayed as "-" 42 | if the reader does not currently have a read transaction open. 43 | If \fB\-rr\fP is given, check for stale entries in the reader 44 | table and clear them. The reader table will be printed again 45 | after the check is performed. 46 | .TP 47 | .BR \-a 48 | Display the status of all of the subdatabases in the environment. 49 | .TP 50 | .BR \-s \ subdb 51 | Display the status of a specific subdatabase. 52 | .SH DIAGNOSTICS 53 | Exit status is zero if no errors occur.
54 | Errors result in a non-zero exit status and 55 | a diagnostic message being written to standard error. 56 | .SH "SEE ALSO" 57 | .BR mdb_copy (1) 58 | .SH AUTHOR 59 | Howard Chu of Symas Corporation 60 | -------------------------------------------------------------------------------- /deps/liblmdb/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | from Cython.Distutils import build_ext 4 | 5 | __version__ = '0.4.0' 6 | 7 | setup( 8 | name = "mdb", 9 | version = __version__, 10 | description = 'Python client for MDB-Lightning', 11 | cmdclass = {'build_ext': build_ext}, 12 | author = 'Chango Inc.', 13 | keywords=['mdb-lightning', 'mdb', 'lmdb', 'key-value store'], 14 | license='MIT', 15 | ext_modules = [Extension("mdb", ["db.pyx", ], 16 | libraries=["lmdb"], 17 | library_dirs=["/usr/local/lib"], 18 | include_dirs=["/usr/local/include"], 19 | runtime_library_dirs=["/usr/local/lib"])] 20 | ) 21 | -------------------------------------------------------------------------------- /deps/liblmdb/test/test_cursor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import mdb 3 | from unittest import TestCase 4 | 5 | 6 | class TestCursor(TestCase): 7 | 8 | def setUp(self): 9 | import os 10 | import errno 11 | self.path = './testdbm' 12 | try: 13 | os.makedirs(self.path) 14 | except OSError as e: 15 | if e.errno == errno.EEXIST and os.path.isdir(self.path): 16 | pass 17 | else: 18 | raise 19 | self.env = mdb.Env(self.path, max_dbs=8) 20 | self.txn = self.env.begin_txn() 21 | self.db = self.env.open_db(self.txn, 'test_cursor') 22 | self.db.drop(self.txn, 0) 23 | self.txn.commit() 24 | self.txn = self.env.begin_txn() 25 | 26 | def tearDown(self): 27 | import shutil 28 | self.txn.commit() 29 | self.db.close() 30 | self.env.close() 31 | shutil.rmtree(self.path) 32 | 33 | def test_put(self): 34 | # all keys must be sorted 35 | cursor = mdb.Cursor(self.txn, self.db) 36 | cursor.put('foo', 'bar', mdb.MDB_APPENDDUP) 37 | self.assertEqual(cursor.get('foo'), ('foo', 'bar')) 38 | 39 | def test_put_unicode(self): 40 | # all keys must be sorted 41 | cursor = mdb.Cursor(self.txn, self.db) 42 | cursor.put('fΩo', 'b∑r', mdb.MDB_APPENDDUP) 43 | self.assertEqual(cursor.get('fΩo'), ('fΩo', 'b∑r')) 44 | 45 | def test_put_duplicate(self): 46 | # all values must be sorted as well 47 | cursor = mdb.Cursor(self.txn, self.db) 48 | cursor.put('foo', 'bar', mdb.MDB_APPENDDUP) 49 | cursor.put('foo', 'bar1', mdb.MDB_APPENDDUP) 50 | self.assertEqual(cursor.count_dups(), 2) 51 | self.assertEqual(cursor.get('foo'), ('foo', 'bar')) 52 | while 1: 53 | key, value = cursor.get(op=mdb.MDB_NEXT_DUP) 54 | if not key: 55 | break 56 | self.assertEqual((key, value), ('foo', 'bar1')) 57 | 58 | def test_delete_by_key(self): 59 | cursor = mdb.Cursor(self.txn, self.db) 60 | cursor.put('delete', 'done', mdb.MDB_APPENDDUP) 61 | cursor.put('delete', 'done1', mdb.MDB_APPENDDUP) 62 | key, value = cursor.get('delete') 63 | cursor.delete(mdb.MDB_NODUPDATA) 64 | self.assertEqual(cursor.get('delete'), (None, None)) 65 | 66 | def test_delete_by_key_value(self): 67 | cursor = mdb.Cursor(self.txn, self.db) 68 | cursor.put('delete', 'done', mdb.MDB_APPENDDUP) 69 | cursor.put('delete', 'done1', mdb.MDB_APPENDDUP) 70 | key, value = cursor.get('delete') 71 | cursor.delete() 72 | self.assertEqual(cursor.get('delete'), ('delete', 'done1')) 73 | 74 | def
test_delete_by_key_value_1(self): 75 | cursor = mdb.Cursor(self.txn, self.db) 76 | cursor.put('delete', 'done', mdb.MDB_APPENDDUP) 77 | cursor.put('delete', 'done1', mdb.MDB_APPENDDUP) 78 | cursor.put('delete', 'done2', mdb.MDB_APPENDDUP) 79 | key, value = cursor.get('delete', 'done2', op=mdb.MDB_NEXT_DUP) 80 | cursor.delete() 81 | self.assertEqual(cursor.get('delete'), ('delete', 'done')) 82 | -------------------------------------------------------------------------------- /deps/liblmdb/test/test_greater_failure.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import mdb 3 | from unittest import TestCase 4 | 5 | 6 | class TestGreaterFailure(TestCase): 7 | 8 | def setUp(self): 9 | import os 10 | import errno 11 | self.path = './testdbmi' 12 | try: 13 | os.makedirs(self.path) 14 | except OSError as e: 15 | if e.errno == errno.EEXIST and os.path.isdir(self.path): 16 | pass 17 | else: 18 | raise 19 | self.env = mdb.Env(self.path, mapsize=1 * mdb.MB, max_dbs=8) 20 | 21 | def tearDown(self): 22 | import shutil 23 | self.env.close() 24 | shutil.rmtree(self.path) 25 | 26 | def drop_mdb(self): 27 | txn = self.env.begin_txn() 28 | db = self.env.open_db(txn, 'test_db', 29 | flags=mdb.MDB_CREATE|mdb.MDB_DUPSORT|mdb.MDB_INTEGERKEY, 30 | key_inttype=mdb.MDB_INT_32) 31 | db.drop(txn, 0) 32 | txn.commit() 33 | db.close() 34 | 35 | def test_intstr_greater_failure(self): 36 | # all keys must be sorted 37 | txn = self.env.begin_txn() 38 | db = self.env.open_db(txn, 'test_db', 39 | flags=mdb.MDB_CREATE|mdb.MDB_INTEGERKEY|mdb.MDB_DUPSORT, 40 | key_inttype=mdb.MDB_INT_32) 41 | db.put(txn, 184504, 'bar1') 42 | db.put(txn, 184031, 'bar2') 43 | db.put(txn, 145248, 'bar3') 44 | db.put(txn, 84131, 'bar4') 45 | db.put(txn, 3869, 'bar5') 46 | db.put(txn, 124034, 'bar6') 47 | db.put(txn, 90752, 'bar7') 48 | db.put(txn, 48288, 'bar8') 49 | db.put(txn, 97573, 'bar9') 50 | db.put(txn, 18455, 'bar0') 51 | 52 | txn.commit() 53 | txn = self.env.begin_txn() 54 | res = list(db.get_gt(txn, 50000)) 55 | self.assertEqual(len(res), 7) 56 | res = list(db.get_gt(txn, 84131)) 57 | self.assertEqual(len(res), 6) 58 | res = list(db.get_ge(txn, 84131)) 59 | self.assertEqual(len(res), 7) 60 | txn.commit() 61 | db.close() 62 | 63 | def test_intint_greater_failure(self): 64 | # all keys must be sorted 65 | txn = self.env.begin_txn() 66 | db = self.env.open_db(txn, 'test_db', 67 | flags=mdb.MDB_CREATE|mdb.MDB_INTEGERKEY|mdb.MDB_DUPSORT|mdb.MDB_INTEGERDUP, 68 | key_inttype=mdb.MDB_INT_32) 69 | db.put(txn, 184504, 1) 70 | db.put(txn, 184031, 2) 71 | db.put(txn, 145248, 3) 72 | db.put(txn, 84131, 4) 73 | db.put(txn, 3869, 5) 74 | db.put(txn, 124034, 6) 75 | db.put(txn, 90752, 7) 76 | db.put(txn, 48288, 8) 77 | db.put(txn, 97573, 9) 78 | db.put(txn, 18455, 0) 79 | 80 | txn.commit() 81 | txn = self.env.begin_txn() 82 | res = list(db.get_gt(txn, 50000)) 83 | self.assertEqual(len(res), 7) 84 | res = list(db.get_gt(txn, 84131)) 85 | self.assertEqual(len(res), 6) 86 | res = list(db.get_ge(txn, 84131)) 87 | self.assertEqual(len(res), 7) 88 | txn.commit() 89 | db.close() 90 | 91 | def test_strstr_greater_failure(self): 92 | # all keys must be sorted 93 | txn = self.env.begin_txn() 94 | db = self.env.open_db(txn, 'test_db', 95 | flags=mdb.MDB_CREATE) 96 | db.put(txn, 'holy', 'bar1') 97 | db.put(txn, 'smolly', 'bar2') 98 | db.put(txn, 'abacus', 'bar3') 99 | db.put(txn, 'dreadlock', 'bar4') 100 | db.put(txn, 'inno', 'bar5') 101 | db.put(txn, 'db', 'bar6') 102 | db.put(txn, 'idiotic', 'bar7') 103 |
db.put(txn, 'idioms', 'bar8') 104 | 105 | txn.commit() 106 | txn = self.env.begin_txn() 107 | res = list(db.get_gt(txn, 'grover')) 108 | self.assertEqual(len(res), 5) 109 | res = list(db.get_gt(txn, 'db')) 110 | self.assertEqual(len(res), 6) 111 | res = list(db.get_ge(txn, 'db')) 112 | self.assertEqual(len(res), 7) 113 | txn.commit() 114 | db.close() 115 | 116 | def test_strint_greater_failure(self): 117 | # all keys must be sorted 118 | txn = self.env.begin_txn() 119 | db = self.env.open_db(txn, 'test_db', 120 | flags=mdb.MDB_CREATE|mdb.MDB_INTEGERDUP) 121 | db.put(txn, 'holy', 1) 122 | db.put(txn, 'smolly', 2) 123 | db.put(txn, 'abacus', 3) 124 | db.put(txn, 'dreadlock', 4) 125 | db.put(txn, 'inno', 5) 126 | db.put(txn, 'db', 6) 127 | db.put(txn, 'idiotic', 7) 128 | db.put(txn, 'idioms', 8) 129 | 130 | txn.commit() 131 | txn = self.env.begin_txn() 132 | res = list(db.get_gt(txn, 'grover')) 133 | self.assertEqual(len(res), 5) 134 | res = list(db.get_gt(txn, 'db')) 135 | self.assertEqual(len(res), 6) 136 | res = list(db.get_ge(txn, 'db')) 137 | self.assertEqual(len(res), 7) 138 | txn.commit() 139 | db.close() 140 | 141 | -------------------------------------------------------------------------------- /deps/liblmdb/test/test_intreader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | from mdb import Writer, Reader, DupReader 4 | 5 | 6 | class TestReaderWriter(TestCase): 7 | def setUp(self): 8 | pass 9 | 10 | def tearDown(self): 11 | import shutil 12 | try: 13 | shutil.rmtree('./test_rw') 14 | except OSError: 15 | pass 16 | try: 17 | shutil.rmtree('./test_rw_dup') 18 | except OSError: 19 | pass 20 | 21 | def test_reader_and_writer(self): 22 | writer = Writer('./test_rw', int_key=True) 23 | writer.drop() 24 | writer.put(1234, 'bar') 25 | writer.put(5678, 'spam') 26 | reader = Reader('./test_rw', int_key=True) 27 | value = reader.get(1234) 28 | self.assertEqual(value, 'bar') 29 | value = reader.get(5678) 30 | self.assertEqual(value, 'spam') 31 | 32 | def test_dup_reader_and_writer(self): 33 | def key_value_gen(): 34 | for i in range(3): 35 | yield 789, "value%d" % (i * i) 36 | writer = Writer('./test_rw_dup', int_key=True, dup=True) 37 | writer.drop() 38 | writer.put(123, 'bar') 39 | writer.put(456, 'spam') 40 | writer.mput({123: "bar1", 456: "spam1"}) 41 | writer.mput(key_value_gen()) 42 | reader = DupReader('./test_rw_dup', int_key=True) 43 | values = reader.get(123) 44 | self.assertEqual(list(values), ['bar', 'bar1']) 45 | values = reader.get(456) 46 | self.assertEqual(list(values), ['spam', 'spam1']) 47 | values = reader.get(789) 48 | self.assertEqual(list(values), ['value0', 'value1', 'value4']) 49 | -------------------------------------------------------------------------------- /deps/liblmdb/test/test_reader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | from mdb import Writer, Reader, DupReader 4 | from ujson import dumps, loads 5 | 6 | 7 | class TestReaderWriter(TestCase): 8 | def setUp(self): 9 | pass 10 | 11 | def tearDown(self): 12 | import shutil 13 | try: 14 | shutil.rmtree('./test_rw') 15 | except OSError: 16 | pass 17 | try: 18 | shutil.rmtree('./test_rw_dup') 19 | except OSError: 20 | pass 21 | 22 | def test_reader_and_writer(self): 23 | writer = Writer('./test_rw', encode_fn=dumps) 24 | writer.drop() 25 | writer.put('foo', 'bar') 26 | writer.put('egg', 'spam') 27 | reader =
Reader('./test_rw', decode_fn=loads) 28 | value = reader.get('foo') 29 | self.assertEqual(value, 'bar') 30 | value = reader.get('egg') 31 | self.assertEqual(value, 'spam') 32 | 33 | def test_dup_reader_and_writer(self): 34 | def key_value_gen(): 35 | for i in range(3): 36 | yield 'fixed', "value%d" % (i * i) 37 | writer = Writer('./test_rw_dup', dup=True, 38 | encode_fn=dumps) 39 | writer.drop() 40 | writer.put('foo', 'bar') 41 | writer.put('egg', 'spam') 42 | writer.mput({"foo": "bar1", "egg": "spam1"}) 43 | writer.mput(key_value_gen()) 44 | reader = DupReader('./test_rw_dup', 45 | decode_fn=loads) 46 | values = reader.get('foo') 47 | self.assertEqual(list(values), ['bar', 'bar1']) 48 | values = reader.get('egg') 49 | self.assertEqual(list(values), ['spam', 'spam1']) 50 | values = reader.get('fixed') 51 | self.assertEqual(list(values), ['value0', 'value1', 'value4']) 52 | -------------------------------------------------------------------------------- /deps/liblru/clru.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013, Chango Inc. 3 | * 4 | * Based on the 'LRU cache implementation in C++' article by Tim Day. 5 | * Copyright (c) 2010-2011, Tim Day 6 | * 7 | * Permission to use, copy, modify, and/or distribute this software for any 8 | * purpose with or without fee is hereby granted, provided that the above 9 | * copyright notice and this permission notice appear in all copies. 10 | * 11 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 12 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 14 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
18 | */ 19 | 20 | #ifndef _CLRU_ 21 | #define _CLRU_ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | struct cmp_str { 30 | bool operator()(const char *a, const char *b) const { 31 | return strcmp(a, b) < 0; 32 | } 33 | }; 34 | 35 | typedef void (*CharFunc)(char *, void *); 36 | 37 | class CharLRU 38 | { 39 | public: 40 | typedef std::list CharList; 41 | typedef std::map Map; 42 | CharLRU(CharFunc fetch, CharFunc evict, size_t c, void *cookie) 43 | :_fetch(fetch), _evict(evict), _capacity(c), _cookie(cookie) 44 | { 45 | assert(_capacity != 0); 46 | } 47 | 48 | void evictall() { 49 | unsigned long count = charlist.size(); 50 | for (unsigned long i = 0; i < count; i++) { 51 | evict(); 52 | } 53 | } 54 | 55 | void set(char *k) { 56 | touch(k, false); 57 | } 58 | 59 | void get(char *k) { 60 | touch(k, true); 61 | } 62 | private: 63 | void touch(char *_k, bool shouldFetch) { 64 | const Map::iterator it =locationMap.find(_k); 65 | 66 | if (it == locationMap.end()) { 67 | char *k = strdup(_k); 68 | assert(k != NULL); 69 | assert(strlen(_k) == strlen(k)); 70 | if (shouldFetch) { 71 | _fetch(k, _cookie); 72 | } 73 | insert(k); 74 | } else { 75 | charlist.splice(charlist.end(), charlist, it->second); 76 | } 77 | } 78 | 79 | void insert(char *k) { 80 | if (locationMap.size()==_capacity) 81 | evict(); 82 | CharList::iterator it =charlist.insert(charlist.end(), k); 83 | locationMap.insert(std::make_pair(k, it)); 84 | } 85 | 86 | void evict() { 87 | assert(!charlist.empty()); 88 | 89 | const Map::iterator it =locationMap.find(charlist.front()); 90 | assert(it != locationMap.end()); 91 | 92 | char *k = it->first; 93 | assert(k != NULL); 94 | _evict(k, _cookie); 95 | locationMap.erase(it); 96 | charlist.pop_front(); 97 | free(k); 98 | } 99 | 100 | CharFunc _fetch, _evict; 101 | const size_t _capacity; 102 | void *_cookie; 103 | 104 | CharList charlist; 105 | Map locationMap; 106 | }; 107 | 108 | typedef void (*IntFunc)(long, void *); 109 | class IntLRU 110 | { 111 | public: 112 | typedef std::list IntList; 113 | typedef std::map Map; 114 | IntLRU(IntFunc fetch, IntFunc evict, size_t c, void *cookie) 115 | :_fetch(fetch), _evict(evict), _capacity(c), _cookie(cookie) 116 | { 117 | assert(_capacity != 0); 118 | } 119 | 120 | void evictall() { 121 | unsigned long count = longlist.size(); 122 | for (unsigned long i = 0; i < count; i++) { 123 | evict(); 124 | } 125 | } 126 | 127 | void set(long k) { 128 | touch(k, false); 129 | } 130 | 131 | void get(long k) { 132 | touch(k, true); 133 | } 134 | private: 135 | void touch(long k, bool shouldFetch) { 136 | const Map::iterator it = locationMap.find(k); 137 | 138 | if (it == locationMap.end()) { 139 | if (shouldFetch) { 140 | _fetch(k, _cookie); 141 | } 142 | insert(k); 143 | } else { 144 | longlist.splice(longlist.end(), longlist, it->second); 145 | } 146 | } 147 | 148 | void insert(long k) { 149 | if (locationMap.size()==_capacity) 150 | evict(); 151 | IntList::iterator it =longlist.insert(longlist.end(), k); 152 | locationMap.insert(std::make_pair(k, it)); 153 | } 154 | 155 | void evict() { 156 | assert(!longlist.empty()); 157 | 158 | const Map::iterator it =locationMap.find(longlist.front()); 159 | assert(it != locationMap.end()); 160 | 161 | long k = it->first; 162 | _evict(k, _cookie); 163 | locationMap.erase(it); 164 | longlist.pop_front(); 165 | } 166 | 167 | IntFunc _fetch, _evict; 168 | const size_t _capacity; 169 | void *_cookie; 170 | 171 | IntList longlist; 172 | Map locationMap; 173 | }; 174 | #endif 175 | 
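The Cython wrapper that follows (pylru.pyx) exposes this LRU to Python as CharLRUDict/IntLRUDict, usually obtained via the LRUDict.getDict() factory. A minimal usage sketch, assuming the extension is built as `pylru` per deps/setup.py; the backing store and its loader/writer callbacks here are hypothetical stand-ins:

    from pylru import LRUDict

    backing = {}                  # hypothetical slow store the cache fronts

    def fetch(key):               # called by get() on a cache miss
        return backing.get(key)

    def evict(key, value):        # called with (key, value) when an entry is pushed out
        backing[key] = value

    cache = LRUDict.getDict(max_size=2, fetch=fetch, evict=evict)
    cache['a'] = 1
    cache['b'] = 2
    cache['c'] = 3                # capacity reached: evict('a', 1) fires before 'c' is inserted
    cache.evictAll()              # flush the remaining entries back to `backing`
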
-------------------------------------------------------------------------------- /deps/liblru/pylru.pyx: -------------------------------------------------------------------------------- 1 | import sys 2 | import traceback 3 | from collections import defaultdict 4 | 5 | cdef extern from "clru.h": 6 | ctypedef void (*CharFetch)(char *, void *) 7 | ctypedef void (*CharEvict)(char *, void *) 8 | 9 | cdef cppclass CharLRU: 10 | CharLRU(CharFetch f, CharEvict e, size_t size, void *self) 11 | void get(char *k) 12 | void set(char *k) 13 | void evictall() 14 | 15 | cdef void _charfetch(char *key, void *_self): 16 | self = _self 17 | try: 18 | res = self._Fetch(key) 19 | if res is not None: 20 | self.kv[key] = res 21 | except Exception, e: 22 | print >> sys.stderr, traceback.format_exc() 23 | print >> sys.stderr, "Exception: %s" % str(e) 24 | 25 | cdef void _charevict(char *key, void *_self): 26 | self = _self 27 | try: 28 | try: 29 | val = self.kv[key] 30 | except KeyError: 31 | # key has no value 32 | return 33 | if val is None: 34 | print key, self.kv 35 | 36 | self._Evict(key, val) 37 | except Exception, e: 38 | print >> sys.stderr, traceback.format_exc() 39 | print >> sys.stderr, "Exception: %s" % str(e) 40 | del self.kv[key] 41 | 42 | cdef class CharLRUDict(object): 43 | cdef CharLRU *store 44 | cdef dict kv 45 | cdef object _Fetch, _Evict 46 | 47 | property kv: 48 | def __get__(self): 49 | return self.kv 50 | 51 | property _Fetch: 52 | def __get__(self): 53 | return self._Fetch 54 | 55 | property _Evict: 56 | def __get__(self): 57 | return self._Evict 58 | 59 | def __cinit__(self, max_size=None, fetch=None, evict=None, factory=object): 60 | self.kv = defaultdict(factory) 61 | self._Fetch = fetch 62 | self._Evict = evict 63 | self.store = new CharLRU(_charfetch, _charevict, max_size, self) 64 | if self.store is NULL: 65 | raise MemoryError() 66 | 67 | def __dealloc__(self): 68 | if self.store is not NULL: 69 | del self.store 70 | 71 | def set(self, char *key, value): 72 | self.kv[key] = value 73 | self.store.set(key) 74 | 75 | def __setitem__(self, char *key, value): 76 | self.set(key, value) 77 | 78 | def get(self, char *key): 79 | self.store.get(key) 80 | return self.kv.get(key, None) 81 | 82 | def __getitem__(self, char *key): 83 | self.store.get(key) 84 | return self.kv[key] 85 | 86 | def evictAll(self): 87 | self.store.evictall() 88 | 89 | def _getContents(self): 90 | from copy import copy 91 | return copy(self.kv) 92 | 93 | 94 | cdef extern from "clru.h": 95 | ctypedef void (*IntFetch)(long, void *) 96 | ctypedef void (*IntEvict)(long, void *) 97 | 98 | cdef cppclass IntLRU: 99 | IntLRU(IntFetch f, IntEvict e, size_t size, void *self) 100 | void get(long k) 101 | void set(long k) 102 | void evictall() 103 | 104 | cdef void _intfetch(long key, void *_self): 105 | self = _self 106 | try: 107 | res = self._Fetch(key) 108 | if res is not None: 109 | self.kv[key] = res 110 | except Exception, e: 111 | print >> sys.stderr, traceback.format_exc() 112 | print >> sys.stderr, "Exception: %s" % str(e) 113 | 114 | cdef void _intevict(long key, void *_self): 115 | self = _self 116 | try: 117 | try: 118 | val = self.kv[key] 119 | except KeyError: 120 | # key has no value 121 | return 122 | if val is None: 123 | print key, self.kv 124 | print >> sys.stderr, key, self.kv 125 | self._Evict(key, val) 126 | except Exception, e: 127 | print >> sys.stderr, traceback.format_exc() 128 | print >> sys.stderr, "Exception: %s" % str(e) 129 | del self.kv[key] 130 | 131 | cdef class IntLRUDict(object): 132 | cdef 
IntLRU *store 133 | cdef dict kv 134 | cdef object _Fetch, _Evict 135 | 136 | property kv: 137 | def __get__(self): 138 | return self.kv 139 | 140 | property _Fetch: 141 | def __get__(self): 142 | return self._Fetch 143 | 144 | property _Evict: 145 | def __get__(self): 146 | return self._Evict 147 | 148 | 149 | def __cinit__(self, size_t max_size, object fetch=None, object evict=None, factory=object): 150 | self.kv = defaultdict(factory) 151 | self._Fetch = fetch 152 | self._Evict = evict 153 | self.store = new IntLRU(_intfetch, _intevict, max_size, self) 154 | if self.store is NULL: 155 | raise MemoryError() 156 | 157 | def __dealloc__(self): 158 | if self.store is not NULL: 159 | del self.store 160 | 161 | def set(self, long key, value): 162 | self.kv[key] = value 163 | self.store.set(key) 164 | 165 | def __setitem__(self, key, value): 166 | self.set(key, value) 167 | 168 | def get(self, long key): 169 | self.store.get(key) 170 | return self.kv.get(key, None) 171 | 172 | def __getitem__(self, long key): 173 | self.store.get(key) 174 | return self.kv[key] 175 | 176 | def evictAll(self): 177 | self.store.evictall() 178 | 179 | def _getContents(self): 180 | from copy import copy 181 | return copy(self.kv) 182 | 183 | cdef class LRUDict(object): 184 | @classmethod 185 | def getDict(cls, max_size=None, fetch=None, evict=None, isInt=False, factory=object): 186 | if isInt: 187 | return IntLRUDict(max_size, fetch, evict, factory) 188 | else: 189 | return CharLRUDict(max_size, fetch, evict, factory) 190 | -------------------------------------------------------------------------------- /deps/liblz4/lz4hc.h: -------------------------------------------------------------------------------- 1 | /* 2 | LZ4 HC - High Compression Mode of LZ4 3 | Header File 4 | Copyright (C) 2011-2013, Yann Collet. 5 | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are 9 | met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of conditions and the following disclaimer 15 | in the documentation and/or other materials provided with the 16 | distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | You can contact the author at : 31 | - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html 32 | - LZ4 source repository : http://code.google.com/p/lz4/ 33 | */ 34 | #pragma once 35 | 36 | 37 | #if defined (__cplusplus) 38 | extern "C" { 39 | #endif 40 | 41 | 42 | int LZ4_compressHC (const char* source, char* dest, int inputSize); 43 | /* 44 | LZ4_compressHC : 45 | return : the number of bytes in compressed buffer dest 46 | or 0 if compression fails. 47 | note : destination buffer must be already allocated. 48 | To avoid any problem, size it to handle worst cases situations (input data not compressible) 49 | Worst case size evaluation is provided by function LZ4_compressBound() (see "lz4.h") 50 | */ 51 | 52 | int LZ4_compressHC_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); 53 | /* 54 | LZ4_compress_limitedOutput() : 55 | Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. 56 | If it cannot achieve it, compression will stop, and result of the function will be zero. 57 | This function never writes outside of provided output buffer. 58 | 59 | inputSize : Max supported value is 1 GB 60 | maxOutputSize : is maximum allowed size into the destination buffer (which must be already allocated) 61 | return : the number of output bytes written in buffer 'dest' 62 | or 0 if compression fails. 63 | */ 64 | 65 | 66 | /* Note : 67 | Decompression functions are provided within LZ4 source code (see "lz4.h") (BSD license) 68 | */ 69 | 70 | 71 | /* Advanced Functions */ 72 | 73 | void* LZ4_createHC (const char* inputBuffer); 74 | int LZ4_compressHC_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize); 75 | int LZ4_compressHC_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize); 76 | char* LZ4_slideInputBufferHC (void* LZ4HC_Data); 77 | int LZ4_freeHC (void* LZ4HC_Data); 78 | 79 | /* 80 | These functions allow the compression of dependent blocks, where each block benefits from prior 64 KB within preceding blocks. 81 | In order to achieve this, it is necessary to start creating the LZ4HC Data Structure, thanks to the function : 82 | 83 | void* LZ4_createHC (const char* inputBuffer); 84 | The result of the function is the (void*) pointer on the LZ4HC Data Structure. 85 | This pointer will be needed in all other functions. 86 | If the pointer returned is NULL, then the allocation has failed, and compression must be aborted. 87 | The only parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer. 88 | The input buffer must be already allocated, and size at least 192KB. 89 | 'inputBuffer' will also be the 'const char* source' of the first block. 90 | 91 | All blocks are expected to lay next to each other within the input buffer, starting from 'inputBuffer'. 92 | To compress each block, use either LZ4_compressHC_continue() or LZ4_compressHC_limitedOutput_continue(). 93 | Their behavior are identical to LZ4_compressHC() or LZ4_compressHC_limitedOutput(), 94 | but require the LZ4HC Data Structure as their first argument, and check that each block starts right after the previous one. 95 | If next block does not begin immediately after the previous one, the compression will fail (return 0). 96 | 97 | When it's no longer possible to lay the next block after the previous one (not enough space left into input buffer), a call to : 98 | char* LZ4_slideInputBufferHC(void* LZ4HC_Data); 99 | must be performed. 
It will typically copy the latest 64KB of input at the beginning of input buffer. 100 | Note that, for this function to work properly, minimum size of an input buffer must be 192KB. 101 | ==> The memory position where the next input data block must start is provided as the result of the function. 102 | 103 | Compression can then resume, using LZ4_compressHC_continue() or LZ4_compressHC_limitedOutput_continue(), as usual. 104 | 105 | When compression is completed, a call to LZ4_freeHC() will release the memory used by the LZ4HC Data Structure. 106 | */ 107 | 108 | 109 | #if defined (__cplusplus) 110 | } 111 | #endif 112 | -------------------------------------------------------------------------------- /deps/liblzf/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2000-2008 Marc Alexander Lehmann 2 | 3 | Redistribution and use in source and binary forms, with or without modifica- 4 | tion, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 14 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- 15 | CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- 17 | CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 18 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 19 | OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 20 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- 21 | ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 22 | OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | Alternatively, the contents of this file may be used under the terms of 25 | the GNU General Public License ("GPL") version 2 or any later version, 26 | in which case the provisions of the GPL are applicable instead of 27 | the above. If you wish to allow the use of your version of this file 28 | only under the terms of the GPL and not to allow others to use your 29 | version of this file under the BSD license, indicate your decision 30 | by deleting the provisions above and replace them with the notice 31 | and other provisions required by the GPL. If you do not delete the 32 | provisions above, a recipient may use your version of this file under 33 | either the BSD or the GPL. 34 | -------------------------------------------------------------------------------- /deps/liblzf/lzf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2000-2008 Marc Alexander Lehmann 3 | * 4 | * Redistribution and use in source and binary forms, with or without modifica- 5 | * tion, are permitted provided that the following conditions are met: 6 | * 7 | * 1. Redistributions of source code must retain the above copyright notice, 8 | * this list of conditions and the following disclaimer. 9 | * 10 | * 2. 
Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 15 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- 16 | * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 17 | * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- 18 | * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 20 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 21 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- 22 | * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 23 | * OF THE POSSIBILITY OF SUCH DAMAGE. 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * the GNU General Public License ("GPL") version 2 or any later version, 27 | * in which case the provisions of the GPL are applicable instead of 28 | * the above. If you wish to allow the use of your version of this file 29 | * only under the terms of the GPL and not to allow others to use your 30 | * version of this file under the BSD license, indicate your decision 31 | * by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL. If you do not delete the 33 | * provisions above, a recipient may use your version of this file under 34 | * either the BSD or the GPL. 35 | */ 36 | 37 | #ifndef LZF_H 38 | #define LZF_H 39 | 40 | /*********************************************************************** 41 | ** 42 | ** lzf -- an extremely fast/free compression/decompression-method 43 | ** http://liblzf.plan9.de/ 44 | ** 45 | ** This algorithm is believed to be patent-free. 46 | ** 47 | ***********************************************************************/ 48 | 49 | #define LZF_VERSION 0x0105 /* 1.5, API version */ 50 | 51 | /* 52 | * Compress in_len bytes stored at the memory block starting at 53 | * in_data and write the result to out_data, up to a maximum length 54 | * of out_len bytes. 55 | * 56 | * If the output buffer is not large enough or any error occurs return 0, 57 | * otherwise return the number of bytes used, which might be considerably 58 | * more than in_len (but less than 104% of the original size), so it 59 | * makes sense to always use out_len == in_len - 1), to ensure _some_ 60 | * compression, and store the data uncompressed otherwise (with a flag, of 61 | * course. 62 | * 63 | * lzf_compress might use different algorithms on different systems and 64 | * even different runs, thus might result in different compressed strings 65 | * depending on the phase of the moon or similar factors. However, all 66 | * these strings are architecture-independent and will result in the 67 | * original data when decompressed using lzf_decompress. 68 | * 69 | * The buffers must not be overlapping. 70 | * 71 | * If the option LZF_STATE_ARG is enabled, an extra argument must be 72 | * supplied which is not reflected in this header file. Refer to lzfP.h 73 | * and lzf_c.c. 
74 | * 75 | */ 76 | unsigned int 77 | lzf_compress (const void *const in_data, unsigned int in_len, 78 | void *out_data, unsigned int out_len); 79 | 80 | /* 81 | * Decompress data compressed with some version of the lzf_compress 82 | * function and stored at location in_data and length in_len. The result 83 | * will be stored at out_data up to a maximum of out_len characters. 84 | * 85 | * If the output buffer is not large enough to hold the decompressed 86 | * data, a 0 is returned and errno is set to E2BIG. Otherwise the number 87 | * of decompressed bytes (i.e. the original length of the data) is 88 | * returned. 89 | * 90 | * If an error in the compressed data is detected, a zero is returned and 91 | * errno is set to EINVAL. 92 | * 93 | * This function is very fast, about as fast as a copying loop. 94 | */ 95 | unsigned int 96 | lzf_decompress (const void *const in_data, unsigned int in_len, 97 | void *out_data, unsigned int out_len); 98 | 99 | #endif 100 | 101 | -------------------------------------------------------------------------------- /deps/liblzf/lzfP.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2000-2007 Marc Alexander Lehmann 3 | * 4 | * Redistribution and use in source and binary forms, with or without modifica- 5 | * tion, are permitted provided that the following conditions are met: 6 | * 7 | * 1. Redistributions of source code must retain the above copyright notice, 8 | * this list of conditions and the following disclaimer. 9 | * 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 15 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- 16 | * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 17 | * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- 18 | * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 20 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 21 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- 22 | * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 23 | * OF THE POSSIBILITY OF SUCH DAMAGE. 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * the GNU General Public License ("GPL") version 2 or any later version, 27 | * in which case the provisions of the GPL are applicable instead of 28 | * the above. If you wish to allow the use of your version of this file 29 | * only under the terms of the GPL and not to allow others to use your 30 | * version of this file under the BSD license, indicate your decision 31 | * by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL. If you do not delete the 33 | * provisions above, a recipient may use your version of this file under 34 | * either the BSD or the GPL. 35 | */ 36 | 37 | #ifndef LZFP_h 38 | #define LZFP_h 39 | 40 | #define STANDALONE 1 /* at the moment, this is ok. 
*/ 41 | 42 | #ifndef STANDALONE 43 | # include "lzf.h" 44 | #endif 45 | 46 | /* 47 | * Size of hashtable is (1 << HLOG) * sizeof (char *) 48 | * decompression is independent of the hash table size 49 | * the difference between 15 and 14 is very small 50 | * for small blocks (and 14 is usually a bit faster). 51 | * For a low-memory/faster configuration, use HLOG == 13; 52 | * For best compression, use 15 or 16 (or more, up to 23). 53 | */ 54 | #ifndef HLOG 55 | # define HLOG 16 56 | #endif 57 | 58 | /* 59 | * Sacrifice very little compression quality in favour of compression speed. 60 | * This gives almost the same compression as the default code, and is 61 | * (very roughly) 15% faster. This is the preferred mode of operation. 62 | */ 63 | #ifndef VERY_FAST 64 | # define VERY_FAST 1 65 | #endif 66 | 67 | /* 68 | * Sacrifice some more compression quality in favour of compression speed. 69 | * (roughly 1-2% worse compression for large blocks and 70 | * 9-10% for small, redundant, blocks and >>20% better speed in both cases) 71 | * In short: when in need for speed, enable this for binary data, 72 | * possibly disable this for text data. 73 | */ 74 | #ifndef ULTRA_FAST 75 | # define ULTRA_FAST 0 76 | #endif 77 | 78 | /* 79 | * Unconditionally aligning does not cost very much, so do it if unsure 80 | */ 81 | #ifndef STRICT_ALIGN 82 | # define STRICT_ALIGN !(defined(__i386) || defined (__amd64)) 83 | #endif 84 | 85 | /* 86 | * You may choose to pre-set the hash table (might be faster on some 87 | * modern cpus and large (>>64k) blocks, and also makes compression 88 | * deterministic/repeatable when the configuration otherwise is the same). 89 | */ 90 | #ifndef INIT_HTAB 91 | # define INIT_HTAB 0 92 | #endif 93 | 94 | /* 95 | * Avoid assigning values to errno variable? for some embedding purposes 96 | * (linux kernel for example), this is necessary. NOTE: this breaks 97 | * the documentation in lzf.h. 98 | */ 99 | #ifndef AVOID_ERRNO 100 | # define AVOID_ERRNO 0 101 | #endif 102 | 103 | /* 104 | * Whether to pass the LZF_STATE variable as argument, or allocate it 105 | * on the stack. For small-stack environments, define this to 1. 106 | * NOTE: this breaks the prototype in lzf.h. 107 | */ 108 | #ifndef LZF_STATE_ARG 109 | # define LZF_STATE_ARG 0 110 | #endif 111 | 112 | /* 113 | * Whether to add extra checks for input validity in lzf_decompress 114 | * and return EINVAL if the input stream has been corrupted. This 115 | * only shields against overflowing the input buffer and will not 116 | * detect most corrupted streams. 117 | * This check is not normally noticeable on modern hardware 118 | * (<1% slowdown), but might slow down older cpus considerably. 119 | */ 120 | #ifndef CHECK_INPUT 121 | # define CHECK_INPUT 1 122 | #endif 123 | 124 | /*****************************************************************************/ 125 | /* nothing should be changed below */ 126 | 127 | typedef unsigned char u8; 128 | 129 | typedef const u8 *LZF_STATE[1 << (HLOG)]; 130 | 131 | #if !STRICT_ALIGN 132 | /* for unaligned accesses we need a 16 bit datatype. 
*/ 133 | # include 134 | # if USHRT_MAX == 65535 135 | typedef unsigned short u16; 136 | # elif UINT_MAX == 65535 137 | typedef unsigned int u16; 138 | # else 139 | # undef STRICT_ALIGN 140 | # define STRICT_ALIGN 1 141 | # endif 142 | #endif 143 | 144 | #if ULTRA_FAST 145 | # if defined(VERY_FAST) 146 | # undef VERY_FAST 147 | # endif 148 | #endif 149 | 150 | #if INIT_HTAB 151 | # ifdef __cplusplus 152 | # include 153 | # else 154 | # include 155 | # endif 156 | #endif 157 | 158 | #endif 159 | 160 | -------------------------------------------------------------------------------- /deps/liblzf/lzf_d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2000-2007 Marc Alexander Lehmann 3 | * 4 | * Redistribution and use in source and binary forms, with or without modifica- 5 | * tion, are permitted provided that the following conditions are met: 6 | * 7 | * 1. Redistributions of source code must retain the above copyright notice, 8 | * this list of conditions and the following disclaimer. 9 | * 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 15 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- 16 | * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 17 | * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- 18 | * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 20 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 21 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- 22 | * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 23 | * OF THE POSSIBILITY OF SUCH DAMAGE. 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * the GNU General Public License ("GPL") version 2 or any later version, 27 | * in which case the provisions of the GPL are applicable instead of 28 | * the above. If you wish to allow the use of your version of this file 29 | * only under the terms of the GPL and not to allow others to use your 30 | * version of this file under the BSD license, indicate your decision 31 | * by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL. If you do not delete the 33 | * provisions above, a recipient may use your version of this file under 34 | * either the BSD or the GPL. 
35 | */ 36 | 37 | #include "lzfP.h" 38 | 39 | #if AVOID_ERRNO 40 | # define SET_ERRNO(n) 41 | #else 42 | # include 43 | # define SET_ERRNO(n) errno = (n) 44 | #endif 45 | 46 | /* 47 | #if (__i386 || __amd64) && __GNUC__ >= 3 48 | # define lzf_movsb(dst, src, len) \ 49 | asm ("rep movsb" \ 50 | : "=D" (dst), "=S" (src), "=c" (len) \ 51 | : "0" (dst), "1" (src), "2" (len)); 52 | #endif 53 | */ 54 | 55 | unsigned int 56 | lzf_decompress (const void *const in_data, unsigned int in_len, 57 | void *out_data, unsigned int out_len) 58 | { 59 | u8 const *ip = (const u8 *)in_data; 60 | u8 *op = (u8 *)out_data; 61 | u8 const *const in_end = ip + in_len; 62 | u8 *const out_end = op + out_len; 63 | 64 | do 65 | { 66 | unsigned int ctrl = *ip++; 67 | 68 | if (ctrl < (1 << 5)) /* literal run */ 69 | { 70 | ctrl++; 71 | 72 | if (op + ctrl > out_end) 73 | { 74 | SET_ERRNO (E2BIG); 75 | return 0; 76 | } 77 | 78 | #if CHECK_INPUT 79 | if (ip + ctrl > in_end) 80 | { 81 | SET_ERRNO (EINVAL); 82 | return 0; 83 | } 84 | #endif 85 | 86 | #ifdef lzf_movsb 87 | lzf_movsb (op, ip, ctrl); 88 | #else 89 | do 90 | *op++ = *ip++; 91 | while (--ctrl); 92 | #endif 93 | } 94 | else /* back reference */ 95 | { 96 | unsigned int len = ctrl >> 5; 97 | 98 | u8 *ref = op - ((ctrl & 0x1f) << 8) - 1; 99 | 100 | #if CHECK_INPUT 101 | if (ip >= in_end) 102 | { 103 | SET_ERRNO (EINVAL); 104 | return 0; 105 | } 106 | #endif 107 | if (len == 7) 108 | { 109 | len += *ip++; 110 | #if CHECK_INPUT 111 | if (ip >= in_end) 112 | { 113 | SET_ERRNO (EINVAL); 114 | return 0; 115 | } 116 | #endif 117 | } 118 | 119 | ref -= *ip++; 120 | 121 | if (op + len + 2 > out_end) 122 | { 123 | SET_ERRNO (E2BIG); 124 | return 0; 125 | } 126 | 127 | if (ref < (u8 *)out_data) 128 | { 129 | SET_ERRNO (EINVAL); 130 | return 0; 131 | } 132 | 133 | #ifdef lzf_movsb 134 | len += 2; 135 | lzf_movsb (op, ref, len); 136 | #else 137 | *op++ = *ref++; 138 | *op++ = *ref++; 139 | 140 | do 141 | *op++ = *ref++; 142 | while (--len); 143 | #endif 144 | } 145 | } 146 | while (ip < in_end); 147 | 148 | return op - (u8 *)out_data; 149 | } 150 | 151 | -------------------------------------------------------------------------------- /deps/librtrie/main.c: -------------------------------------------------------------------------------- 1 | d#include 2 | #include "rtrie.h" 3 | 4 | uint8_t nodes[] = {0, 0, 0, 2, 16, 0, 0, 1, 11, 0, 0, 2, 19, 0, 0, 0, 4, 0, 0, 1, 8, 0, 0, 0, 22, 0, 0, 0}; 5 | uint8_t kids[] = {0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 103, 2, 0, 0, 104, 0, 0, 0, 0, 4, 0, 103, 111, 111, 100, 6 | 0, 0, 5, 0, 0, 98, 4, 0, 0, 0, 3, 0, 98, 121, 101, 0, 0, 0, 0, 0, 0, 0, 4, 0, 104, 101, 108, 108, 0, 7 | 0, 1, 0, 0, 111, 6, 0, 0, 115, 2, 0, 0, 0, 1, 0, 111, 0, 3, 0, 0, 116, 1, 0, 0, 0, 5, 0, 116, 104, 101, 8 | 114, 101, 0, 2, 0, 0, 0, 4, 0, 115, 105, 110, 107, 0, 0}; 9 | 10 | int main(int argc, const char * argv[]) 11 | { 12 | char result[256]; 13 | size_t rlen; 14 | uint32_t *node32s = (uint32_t*)nodes; 15 | uint32_t *kid32s = (uint32_t*)kids; 16 | 17 | printf("testing value_for_vid()...\n"); 18 | int x = value_for_vid(node32s, kid32s, 3, result, &rlen); 19 | result[rlen] = '\0'; 20 | printf("Done rval=%d %s\n", x, result); 21 | x = value_for_vid(node32s, kid32s, 1, result, &rlen); 22 | result[rlen] = '\0'; 23 | printf("Done rval=%d %s\n", x, result); 24 | x = value_for_vid(node32s, kid32s, 4, result, &rlen); 25 | result[rlen] = '\0'; 26 | printf("Done rval=%d %s\n", x, result); 27 | 28 | printf("testing vid_for_value()...\n"); 29 | uint32_t vid=0; 30 | x = vid_for_value(node32s, 
kid32s, "hello", 5, &vid); 31 | printf("Done vid=%d %d\n", vid, x); 32 | 33 | x = vid_for_value(node32s, kid32s, "hellothere", 10, &vid); 34 | printf("Done vid=%d %d\n", vid, x); 35 | 36 | x = vid_for_value(node32s, kid32s, "good", 4, &vid); 37 | printf("Done vid=%d %d\n", vid, x); 38 | 39 | x = vid_for_value(node32s, kid32s, "dung", 4, &vid); 40 | printf("Done vid=%d %d\n", vid, x); 41 | 42 | return 0; 43 | } 44 | 45 | -------------------------------------------------------------------------------- /deps/librtrie/pyrtrie.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Python.h" 6 | #include "rtrie.h" 7 | 8 | 9 | #if PY_MAJOR_VERSION >= 3 10 | #define PYSTR_CREATE PyBytes_FromStringAndSize 11 | #else 12 | #define PYSTR_CREATE PyString_FromStringAndSize 13 | #endif 14 | 15 | #if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN) 16 | typedef int Py_ssize_t; 17 | #define PY_SSIZE_T_MAX INT_MAX 18 | #define PY_SSIZE_T_MIN INT_MIN 19 | #endif 20 | 21 | 22 | static PyObject * 23 | py_print_it(PyObject *self, PyObject *args) 24 | { 25 | uint64_t node_ptr, kid_ptr; 26 | uint32_t *nodes; 27 | uint32_t *kids; 28 | 29 | if (!PyArg_ParseTuple(args, "K|K", &node_ptr, &kid_ptr)) 30 | return NULL; 31 | 32 | nodes = (uint32_t *)node_ptr; 33 | kids = (uint32_t *)kid_ptr; 34 | 35 | print_it(nodes, kids); 36 | Py_XINCREF(Py_None); 37 | return Py_None; 38 | } 39 | 40 | static PyObject * 41 | py_summarize(PyObject *self, PyObject *args) 42 | { 43 | uint64_t node_ptr, kid_ptr; 44 | uint32_t size; 45 | uint32_t *nodes; 46 | uint32_t *kids; 47 | 48 | if (!PyArg_ParseTuple(args, "K|K|I", &node_ptr, &kid_ptr, &size)) 49 | return NULL; 50 | 51 | nodes = (uint32_t *)node_ptr; 52 | kids = (uint32_t *)kid_ptr; 53 | 54 | summarize(nodes, kids, size); 55 | Py_XINCREF(Py_None); 56 | return Py_None; 57 | } 58 | 59 | static PyObject * 60 | py_value_for_vid(PyObject *self, PyObject *args) 61 | { 62 | uint64_t node_ptr, kid_ptr; 63 | uint32_t vid; 64 | uint32_t *nodes; 65 | uint32_t *kids; 66 | char res[8092]; 67 | size_t rlen; 68 | 69 | if (!PyArg_ParseTuple(args, "K|K|I", &node_ptr, &kid_ptr, &vid)) 70 | return NULL; 71 | 72 | nodes = (uint32_t *)node_ptr; 73 | kids = (uint32_t *)kid_ptr; 74 | 75 | if (!value_for_vid(nodes, kids, vid, res, &rlen)) { 76 | return PYSTR_CREATE(res, rlen); 77 | } 78 | Py_XINCREF(Py_None); 79 | return Py_None; 80 | } 81 | 82 | //TODO: these routines should return 0 on not found 83 | // int vid_for_value(uint32_t *nodes, uint32_t *kids, char *key, uint16_t key_len, uint32_t *vid); 84 | static PyObject * 85 | py_vid_for_value(PyObject *self, PyObject *args) 86 | { 87 | uint64_t node_ptr, kid_ptr; 88 | uint32_t vid; 89 | uint32_t *nodes; 90 | uint32_t *kids; 91 | char *key; 92 | Py_ssize_t key_len; 93 | 94 | if (!PyArg_ParseTuple(args, "K|K|s#", &node_ptr, &kid_ptr, &key, &key_len)) 95 | return NULL; 96 | 97 | nodes = (uint32_t *)node_ptr; 98 | kids = (uint32_t *)kid_ptr; 99 | 100 | if (!vid_for_value(nodes, kids, key, key_len, &vid)) { 101 | return PyInt_FromLong((long)vid); 102 | } 103 | Py_XINCREF(Py_None); 104 | return Py_None; 105 | } 106 | 107 | 108 | PyDoc_STRVAR(module_doc, "Python wrapper for the rtrie."); 109 | 110 | static PyMethodDef rtrie_methods[] = { 111 | 112 | {"value_for_vid", py_value_for_vid, METH_VARARGS, 113 | "Get Value based on VID"}, 114 | {"vid_for_value", py_vid_for_value, METH_VARARGS, 115 | "Get VID based on Value"}, 116 | {"print_it", py_print_it, METH_VARARGS, 117 | 
"Print rtrie"}, 118 | {"summarize", py_summarize, METH_VARARGS, 119 | "Summarize rtrie"}, 120 | 121 | {NULL, NULL, 0, NULL} 122 | }; 123 | 124 | 125 | #if PY_MAJOR_VERSION <= 2 126 | 127 | extern PyMODINIT_FUNC 128 | initrtrie(void) 129 | { 130 | PyObject *m; 131 | 132 | m = Py_InitModule3("rtrie", rtrie_methods, module_doc); 133 | 134 | if (m == NULL) 135 | return; 136 | PyModule_AddStringConstant(m, "__version__", MODULE_VERSION); 137 | } 138 | 139 | #else 140 | 141 | /* Python 3.x */ 142 | 143 | static PyModuleDef rtrie_module = { 144 | PyModuleDef_HEAD_INIT, 145 | "rtrie", 146 | module_doc, 147 | -1, 148 | rtrie_methods, 149 | NULL, 150 | NULL, 151 | NULL, 152 | NULL 153 | }; 154 | 155 | extern PyMODINIT_FUNC 156 | PyInit_rtrie(void) 157 | { 158 | PyObject *m; 159 | 160 | m = PyModule_Create(&rtrie_module); 161 | if (m == NULL) 162 | goto finally; 163 | PyModule_AddStringConstant(m, "__version__", MODULE_VERSION); 164 | 165 | finally: 166 | return m; 167 | } 168 | 169 | #endif 170 | -------------------------------------------------------------------------------- /deps/librtrie/rtrie.c: -------------------------------------------------------------------------------- 1 | #include "stdint.h" 2 | #include "string.h" 3 | #include "stdio.h" 4 | 5 | static char *_val_for_vid(uint32_t *nodes, uint32_t *kids, uint32_t vid, char *rval) { 6 | char *curr = rval; 7 | uint32_t node = nodes[vid]; 8 | uint32_t kid_offset = (uint32_t )0x00ffffff & node; 9 | uint32_t parent = kids[kid_offset++]; 10 | 11 | if (parent) { 12 | curr = _val_for_vid(nodes, kids, parent, rval); 13 | } 14 | 15 | uint16_t *radix = (uint16_t *)&kids[kid_offset]; 16 | char *radix_chars = (char *)(radix + 1); 17 | memcpy(curr, radix_chars, *radix); 18 | curr += *radix; 19 | return curr; 20 | } 21 | 22 | 23 | int value_for_vid(uint32_t *nodes, uint32_t *kids, uint32_t vid, char *result, size_t *rlen) { 24 | char *end = _val_for_vid(nodes, kids, vid, result); 25 | *rlen = end - result; 26 | return 0; 27 | } 28 | 29 | 30 | static uint32_t _find_binary(uint32_t *knodes, uint8_t kid_len, unsigned char selector) { 31 | int lower = 0; 32 | int upper = kid_len - 1; 33 | 34 | while (lower <= upper) { 35 | int mid = lower + ((upper - lower) / 2); 36 | 37 | // unpack the node - the high order byte is the selector for those children 38 | uint32_t knode = knodes[mid]; 39 | unsigned char rselect = (unsigned char) (knode >> 24); 40 | uint32_t node = ((uint32_t)0x00ffffff) & knode; 41 | 42 | if (rselect == selector) { 43 | return node; 44 | } 45 | else if (rselect < selector) { 46 | lower = mid + 1; 47 | } 48 | else{ 49 | upper = mid - 1; 50 | } 51 | } 52 | return 0; 53 | } 54 | 55 | 56 | static uint32_t _vid_for_value(uint32_t *nodes, uint32_t *kids, uint32_t vid, char *key, uint16_t key_len) { 57 | if (!key_len) { 58 | return vid; 59 | } 60 | 61 | uint32_t node = nodes[vid]; 62 | uint8_t kid_len = (uint8_t)(node >> 24); 63 | uint32_t kid_offset = ((uint32_t )0x00ffffff & node) + 1; 64 | uint16_t *radix = (uint16_t *)&kids[kid_offset]; 65 | uint16_t radix_len = *radix; 66 | uint16_t i; 67 | 68 | // we need to compare the radix to the key 69 | if (radix_len <= key_len) { 70 | char *radix_chars = (char *)(radix + 1); 71 | for (i = 0; i < radix_len; i++) { 72 | if (radix_chars[i] != key[i]) { 73 | return 0; 74 | } 75 | } 76 | 77 | // did we find the VID? 
78 | if (radix_len == key_len) { 79 | return vid; 80 | } 81 | 82 | // we have a matching radix, take the 'rest' of the key and match with it's children 83 | char *selector = key + radix_len; 84 | uint16_t selector_len = key_len - radix_len; 85 | uint16_t width = 2 + radix_len; 86 | kid_offset += width / 4; 87 | if (width % 4) { 88 | kid_offset++; 89 | } 90 | uint32_t *knodes = kids + kid_offset; 91 | uint32_t knode = _find_binary(knodes, kid_len, (unsigned char)(*selector)); 92 | if (knode) { 93 | return _vid_for_value(nodes, kids, knode, selector, selector_len); 94 | } 95 | } 96 | return 0; 97 | 98 | } 99 | 100 | int vid_for_value(uint32_t *nodes, uint32_t *kids, char *key, uint16_t key_len, uint32_t *vid) { 101 | uint32_t node = _vid_for_value(nodes, kids, 0, key, key_len); 102 | if (node) { 103 | *vid = node; 104 | return 0; 105 | } 106 | return -1; 107 | } 108 | 109 | 110 | static void _print_it(uint32_t *nodes, uint32_t *kids, uint32_t curr_node, unsigned char selector, int depth) { 111 | int i; 112 | uint32_t node = nodes[curr_node]; 113 | uint8_t kid_len = (uint8_t)(node >> 24); 114 | uint32_t kid_offset = (uint32_t )0x00ffffff & node; 115 | uint32_t *kid = kids + kid_offset; 116 | uint32_t parent = kid[0]; 117 | uint16_t radix_len = ((uint16_t *)kid)[2]; 118 | for(i = 0; i < depth; ++i) { 119 | printf(" "); 120 | } 121 | if(radix_len > 0) { 122 | char *radix = ((char *)kid) + 6; 123 | printf("%d '%.*s' ", radix_len, radix_len, radix); 124 | } 125 | else { 126 | printf(" "); 127 | } 128 | printf("%d(%d) '%c'(0x%x) - %d\n", curr_node, parent, selector, selector, kid_len); 129 | 130 | // pad 131 | uint32_t child_offset = 6 + radix_len; 132 | child_offset += (4 - (child_offset % 4)) % 4; 133 | child_offset /= 4; 134 | 135 | // process kids 136 | uint32_t *children = kid + child_offset; 137 | for(i = 0; i < kid_len; ++i) { 138 | uint32_t child = children[i]; 139 | unsigned char sel = (unsigned char)(child >> 24); 140 | uint32_t new_node = child & 0x00ffffff; 141 | _print_it(nodes, kids, new_node, sel, depth + 1); 142 | } 143 | } 144 | 145 | void print_it(uint32_t *nodes, uint32_t *kids) { 146 | _print_it(nodes, kids, 0, '\0', 0); 147 | } 148 | 149 | 150 | void summarize(uint32_t *nodes, uint32_t *kids, int num_nodes) { 151 | int i; 152 | printf("Summarize nodes=%p kids=%p num_nodes=%d\n", nodes, kids, num_nodes); 153 | for(i = 0; i < num_nodes; ++i) { 154 | uint32_t node = nodes[i]; 155 | uint8_t kid_len = (uint8_t)(node >> 24); 156 | uint32_t kid_offset = (uint32_t )0x00ffffff & node; 157 | uint32_t *kid = kids + kid_offset; 158 | uint32_t parent = kid[0]; 159 | uint16_t radix_len = ((uint16_t *)kid)[2]; 160 | 161 | printf("%d %d | %d ", kid_len, kid_offset, parent); 162 | if(radix_len > 0) { 163 | char *radix = ((char *)kid) + 6; 164 | printf("%d '%.*s'\n", radix_len, radix_len, radix); 165 | } 166 | else { 167 | printf("0 ''\n"); 168 | } 169 | //printf("%d %d(%d) - %d, %d\n", i, node, parent, kid_len, kid_offset); 170 | } 171 | } -------------------------------------------------------------------------------- /deps/librtrie/rtrie.h: -------------------------------------------------------------------------------- 1 | #ifndef _RTRIE_H_ 2 | #define _RTRIE_H_ 3 | 4 | #include "stdint.h" 5 | #include "stddef.h" 6 | 7 | int value_for_vid(uint32_t *nodes, uint32_t *kids, uint32_t vid, char *result, size_t *rlen); 8 | int vid_for_value(uint32_t *nodes, uint32_t *kids, char *key, uint16_t key_len, uint32_t *vid); 9 | void print_it(uint32_t *nodes, uint32_t *kids); 10 | void summarize(uint32_t 
*nodes, uint32_t *kids, int num_nodes); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /deps/libwtrie/test/test_wtrie.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import unittest 3 | from wtrie import Trie 4 | 5 | 6 | class TestWTrie(unittest.TestCase): 7 | def test_wtrie(self): 8 | t = Trie() 9 | self.assertEqual(t.add('hello'), 1) 10 | self.assertEqual(t.add('hell'), 2) 11 | self.assertEqual(t.add('hello'), 1) 12 | self.assertEqual(t.add('hellothere'), 3) 13 | self.assertEqual(t.add('good'), 4) 14 | self.assertEqual(t.add('goodbye'), 5) 15 | self.assertEqual(t.add('hello'), 1) 16 | self.assertEqual(t.add('hellsink'), 6) 17 | self.assertEqual(t.add(''), 0) 18 | 19 | # nodes = t.nodes 20 | # t.print_it() 21 | 22 | key, sz, pt = t.node_at_path() 23 | self.assertEqual(sz, 2) 24 | 25 | key, sz, pt = t.node_at_path(104) 26 | self.assertEqual(key, 'hell') 27 | self.assertEqual(pt, 0) 28 | self.assertEqual(sz, 2, 'actual %s' % sz) 29 | 30 | key2, sz, pt = t.node_at_path(104, 111) 31 | self.assertEqual(key2, 'o', 'actual %s' % key) 32 | self.assertEqual(pt, 2) 33 | self.assertEqual(sz, 1) 34 | 35 | key, sz, pt = t.node_at_path(104, 111, 116) 36 | self.assertEqual(key, 'there') 37 | self.assertEqual(pt, 1) 38 | self.assertEqual(sz, 0) 39 | 40 | n, k, _ = t.serialize() 41 | self.assertEqual(len(n), 7 * 4, "actual %d" % len(n)) 42 | self.assertEqual(len(k), 100, "actual %d" % len(k)) 43 | # print "sqork: %s" % t.kid_space 44 | 45 | print 'nodes', n 46 | print 'kids', k 47 | 48 | unpacked = struct.unpack_from("7I", n, 0) 49 | expected = (0x02000000, 0x01000010, 0x0200000b, 0x00000013, 0x01000004, 0x00000008, 0x00000016) 50 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 51 | 52 | unpacked = struct.unpack_from("IH2I", k, 0) 53 | expected = (0, 0, 0x67000004, 0x68000002) 54 | self.assertEqual(unpacked, expected, unpacked) 55 | 56 | unpacked = struct.unpack_from("IH4cI", k, 16) 57 | expected = (0x0000, 0x0004, 'g', 'o', 'o', 'd', 0x62000005) 58 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 59 | 60 | unpacked = struct.unpack_from("IH3c", k, 32) 61 | expected = (0x0004, 0x0003, 'b', 'y', 'e') 62 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 63 | 64 | unpacked = struct.unpack_from("IH4c2I", k, 44) 65 | expected = (0x0000, 0x0004, 'h', 'e', 'l', 'l', 0x6f000001, 0x73000006) 66 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 67 | 68 | unpacked = struct.unpack_from("IHcI", k, 64) 69 | expected = (0x0002, 1, 'o', 0x74000003) 70 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 71 | 72 | unpacked = struct.unpack_from("IH5c", k, 76) 73 | expected = (0x0001, 0x0005, 't', 'h', 'e', 'r', 'e') 74 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 75 | 76 | unpacked = struct.unpack_from("IH4c", k, 88) 77 | expected = (0x0002, 0x0004, 's', 'i', 'n', 'k') 78 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 79 | -------------------------------------------------------------------------------- /deps/maxhash/test/maxhash_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from maxhash import MinHeap, MaxHash 3 | 4 | 5 | class TestMinHeap(unittest.TestCase): 6 | def test_pop(self): 7 | m = MinHeap(1024, (3, 5, 2, 1)) 8 | self.assertEqual(m.pop(), 1) 9 | self.assertEqual(m.pop(), 2) 10 | self.assertEqual(m.pop(), 
3) 11 | self.assertEqual(m.pop(), 5) 12 | 13 | def test_push(self): 14 | m = MinHeap(1024, ()) 15 | m.push(1) 16 | m.push(3) 17 | m.push(2) 18 | self.assertEqual(m.pop(), 1) 19 | self.assertEqual(m.pop(), 2) 20 | self.assertEqual(m.pop(), 3) 21 | m.push(1) 22 | m.push(3) 23 | m.push(2) 24 | m.push(4) 25 | m.push(6) 26 | m.push(5) 27 | self.assertEqual(m.pop(), 1) 28 | self.assertEqual(m.pop(), 2) 29 | self.assertEqual(m.pop(), 3) 30 | self.assertEqual(m.pop(), 4) 31 | self.assertEqual(m.pop(), 5) 32 | self.assertEqual(m.pop(), 6) 33 | 34 | def test_nlargest(self): 35 | m = MinHeap(1024, [1, 2, 3, 4, 2, 1, 5, 6]) 36 | l = list(m.nlargest(3)) 37 | l.sort() 38 | self.assertEqual(l, [4, 5, 6]) 39 | 40 | 41 | class TestMaxHash(unittest.TestCase): 42 | def test_add(self): 43 | m = MaxHash(8192) 44 | m.add(str(1)) 45 | m.add(str(2)) 46 | m.add(str(3)) 47 | m.add(str(4)) 48 | self.assertEqual(len(m.uniq()), 4) 49 | 50 | def test_merge(self): 51 | r1 = range(10000) 52 | m1 = MaxHash(8192) 53 | r2 = range(2000, 12000) 54 | m2 = MaxHash(8192) 55 | r3 = range(15000) 56 | m3 = MaxHash(8192) 57 | for i in r1: 58 | m1.add(str(i)) 59 | for i in r2: 60 | m2.add(str(i)) 61 | for i in r3: 62 | m3.add(str(i)) 63 | m2.merge(m1) 64 | ix = MaxHash.get_jaccard_index([m2, m3]) 65 | self.assertAlmostEqual(ix, 0.80, 2) 66 | 67 | def test_union(self): 68 | r1 = range(10000) 69 | m1 = MaxHash(8192) 70 | r2 = range(2000, 12000) 71 | m2 = MaxHash(8192) 72 | r3 = range(15000) 73 | m3 = MaxHash(8192) 74 | for i in r1: 75 | m1.add(str(i)) 76 | for i in r2: 77 | m2.add(str(i)) 78 | for i in r3: 79 | m3.add(str(i)) 80 | m4 = m1.union(m2) 81 | ix = MaxHash.get_jaccard_index([m3, m4]) 82 | self.assertAlmostEqual(ix, 0.80, 2) 83 | 84 | def test_jarcard_index(self): 85 | r1 = range(10000) 86 | m1 = MaxHash(8192) 87 | r2 = range(2000, 10000) 88 | m2 = MaxHash(8192) 89 | for i in r1: 90 | m1.add(str(i)) 91 | for i in r2: 92 | m2.add(str(i)) 93 | ix = MaxHash.get_jaccard_index([m1, m2]) 94 | self.assertAlmostEqual(ix, 0.80, 2) 95 | -------------------------------------------------------------------------------- /deps/scamurmur3/murmur3.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the 3 | // public domain. The author hereby disclaims copyright to this source 4 | // code. 5 | 6 | #ifndef _MURMURHASH3_H_ 7 | #define _MURMURHASH3_H_ 8 | 9 | #include 10 | 11 | //----------------------------------------------------------------------------- 12 | 13 | void MurmurHash3_x86_32 (const void *key, int len, uint32_t seed, void *out); 14 | 15 | void MurmurHash3_x86_128(const void *key, int len, uint32_t seed, void *out); 16 | 17 | void MurmurHash3_x64_128(const void *key, int len, uint32_t seed, void *out); 18 | 19 | //----------------------------------------------------------------------------- 20 | 21 | #endif // _MURMURHASH3_H_ 22 | -------------------------------------------------------------------------------- /deps/scamurmur3/scamurmur3.c: -------------------------------------------------------------------------------- 1 | 2 | // License: MIT License 3 | // http://www.opensource.org/licenses/mit-license.php 4 | 5 | // SMHasher code is from SMHasher project, authored by Austin Appleby, et al. 6 | // http://code.google.com/p/smhasher/ 7 | 8 | // Python extension code by Patrick Hensley 9 | // Ported from C++ to C by Chango Corp. 
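// Usage sketch from Python (hedged, based on the method table registered
// below): once built via deps/setup.py, the module exposes
// murmur3_x86_32(key, seed=0) plus the 64- and 128-bit variants, e.g.:
//
//   >>> from scamurmur3 import murmur3_x86_32
//   >>> murmur3_x86_32("hustle")      # optional second argument is the seed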
10 | 11 | 12 | #include 13 | #include "murmur3.h" 14 | 15 | 16 | #if PY_VERSION_HEX < 0x02050000 17 | typedef int Py_ssize_t; 18 | #define PY_SSIZE_T_MAX INT_MAX 19 | #define PY_SSIZE_T_MIN INT_MIN 20 | #endif 21 | 22 | 23 | static PyObject * 24 | _py_murmur3_128(PyObject *self, PyObject *args, int x86, int size) 25 | { 26 | const char *key; 27 | Py_ssize_t len; 28 | uint32_t seed = 0; 29 | unsigned char out[16]; 30 | 31 | if (!PyArg_ParseTuple(args, "s#|I", &key, &len, &seed)) { 32 | return NULL; 33 | } 34 | 35 | if (x86) { 36 | MurmurHash3_x86_128((void *)key, len, seed, &out); 37 | } else { 38 | MurmurHash3_x64_128((void *)key, len, seed, &out); 39 | } 40 | 41 | return _PyLong_FromByteArray((const unsigned char *)&out, size, 0, 0); 42 | } 43 | 44 | static PyObject * 45 | py_murmur3_x86_32(PyObject *self, PyObject *args) 46 | { 47 | const char *key; 48 | Py_ssize_t len; 49 | uint32_t seed = 0; 50 | unsigned char out[4]; 51 | 52 | if (!PyArg_ParseTuple(args, "s#|I", &key, &len, &seed)) { 53 | return NULL; 54 | } 55 | 56 | MurmurHash3_x86_32((void *)key, len, seed, &out); 57 | 58 | return _PyLong_FromByteArray((const unsigned char *)&out, 4, 0, 0); 59 | } 60 | 61 | static PyObject * 62 | py_murmur3_x86_64(PyObject *self, PyObject *args) 63 | { 64 | return _py_murmur3_128(self, args, 1, 8); 65 | } 66 | 67 | 68 | static PyObject * 69 | py_murmur3_x64_64(PyObject *self, PyObject *args) 70 | { 71 | return _py_murmur3_128(self, args, 0, 8); 72 | } 73 | 74 | 75 | static PyObject * 76 | py_murmur3_x86_128(PyObject *self, PyObject *args) 77 | { 78 | return _py_murmur3_128(self, args, 1, 16); 79 | } 80 | 81 | 82 | static PyObject * 83 | py_murmur3_x64_128(PyObject *self, PyObject *args) 84 | { 85 | return _py_murmur3_128(self, args, 0, 16); 86 | } 87 | 88 | 89 | PyDoc_STRVAR(module_doc, "Python wrapper for the SMHasher routines."); 90 | 91 | static PyMethodDef scamurmur3_methods[] = { 92 | {"murmur3_x86_32", py_murmur3_x86_32, METH_VARARGS, 93 | "Make an x86 murmur3 32-bit hash value"}, 94 | 95 | {"murmur3_x86_64", py_murmur3_x86_64, METH_VARARGS, 96 | "Make an x86 murmur3 64-bit hash value"}, 97 | {"murmur3_x64_64", py_murmur3_x64_64, METH_VARARGS, 98 | "Make an x64 murmur3 64-bit hash value"}, 99 | 100 | {"murmur3_x86_128", py_murmur3_x86_128, METH_VARARGS, 101 | "Make an x86 murmur3 128-bit hash value"}, 102 | {"murmur3_x64_128", py_murmur3_x64_128, METH_VARARGS, 103 | "Make an x64 murmur3 128-bit hash value"}, 104 | 105 | {NULL, NULL, 0, NULL} 106 | }; 107 | 108 | 109 | #if PY_MAJOR_VERSION <= 2 110 | 111 | extern PyMODINIT_FUNC 112 | initscamurmur3(void) 113 | { 114 | PyObject *m; 115 | 116 | m = Py_InitModule3("scamurmur3", scamurmur3_methods, module_doc); 117 | 118 | if (m == NULL) 119 | return; 120 | PyModule_AddStringConstant(m, "__version__", MODULE_VERSION); 121 | } 122 | 123 | #else 124 | 125 | /* Python 3.x */ 126 | 127 | static PyModuleDef scamurmur3_module = { 128 | PyModuleDef_HEAD_INIT, 129 | "scamurmur3", 130 | module_doc, 131 | -1, 132 | scamurmur3_methods, 133 | NULL, 134 | NULL, 135 | NULL, 136 | NULL 137 | }; 138 | 139 | extern PyMODINIT_FUNC 140 | PyInit_smhasher(void) 141 | { 142 | PyObject *m; 143 | 144 | m = PyModule_Create(&smhasher_module); 145 | if (m == NULL) 146 | goto finally; 147 | PyModule_AddStringConstant(m, "__version__", MODULE_VERSION); 148 | 149 | finally: 150 | return m; 151 | } 152 | 153 | #endif 154 | 155 | -------------------------------------------------------------------------------- /deps/setup.py: 
-------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | from Cython.Distutils import build_ext 4 | 5 | 6 | __version__ = '"0.3.0"' 7 | 8 | 9 | scamurmur3 = Extension( 10 | 'scamurmur3', 11 | sources=[ 12 | 'scamurmur3/scamurmur3.c', 13 | 'scamurmur3/murmur3.c', 14 | ], 15 | include_dirs=['./scamurmur3'], 16 | define_macros=[('MODULE_VERSION', __version__)] 17 | ) 18 | 19 | clzf = Extension( 20 | 'clzf', 21 | sources=[ 22 | 'liblzf/clzf.c', 23 | 'liblzf/lzf_c.c', 24 | 'liblzf/lzf_d.c', 25 | ], 26 | include_dirs=['./liblzf'], 27 | define_macros=[('MODULE_VERSION', __version__)] 28 | ) 29 | 30 | rtrie = Extension( 31 | 'rtrie', 32 | sources=[ 33 | 'librtrie/pyrtrie.c', 34 | 'librtrie/rtrie.c', 35 | ], 36 | include_dirs=['./librtrie'], 37 | define_macros=[('MODULE_VERSION', __version__)] 38 | ) 39 | 40 | wtrie = Extension( 41 | "wtrie", 42 | sources=["libwtrie/wtrie.pyx"], 43 | define_macros=[('MODULE_VERSION', __version__)] 44 | ) 45 | 46 | clz4 = Extension( 47 | 'clz4', 48 | sources=[ 49 | 'liblz4/clz4.c', 50 | 'liblz4/lz4.c', 51 | 'liblz4/lz4hc.c', 52 | ], 53 | include_dirs=['./liblz4'], 54 | define_macros=[('MODULE_VERSION', __version__)] 55 | ) 56 | 57 | cardunion = Extension( 58 | "cardunion", 59 | ["cardunion/cardunion.pyx"], 60 | libraries=["m"], 61 | define_macros=[('MODULE_VERSION', __version__)] 62 | ) 63 | 64 | ebitset = Extension( 65 | "pyebset", 66 | sources=["libebset/pyebset.pyx"], 67 | include_dirs=['./libebset'], 68 | language="c++", 69 | define_macros=[('MODULE_VERSION', __version__)] 70 | ) 71 | 72 | maxhash = Extension( 73 | "maxhash", 74 | sources=["maxhash/maxhash.pyx"], 75 | define_macros=[('MODULE_VERSION', __version__)] 76 | ) 77 | 78 | lru = Extension( 79 | "pylru", 80 | sources=["liblru/pylru.pyx"], 81 | include_dirs=['./liblru'], 82 | language="c++", 83 | define_macros=[('MODULE_VERSION', __version__)] 84 | ) 85 | 86 | 87 | lmdb = Extension( 88 | "mdb", 89 | sources=["liblmdb/db.pyx", ], 90 | libraries=["lmdb"], 91 | library_dirs=["/usr/local/lib"], 92 | include_dirs=["/usr/local/include"], 93 | runtime_library_dirs=["/usr/local/lib"]) 94 | 95 | 96 | setup( 97 | name = "hustle-deps", 98 | version = __version__, 99 | cmdclass = {'build_ext': build_ext}, 100 | description=('Hustle-deps: a collection of dependent libraries.'), 101 | author = 'Chango Inc.', 102 | license = 'MIT', 103 | ext_modules = [ 104 | scamurmur3, 105 | cardunion, 106 | ebitset, 107 | maxhash, 108 | clzf, 109 | clz4, 110 | rtrie, 111 | wtrie, 112 | lru, 113 | lmdb, 114 | ] 115 | ) 116 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | -------------------------------------------------------------------------------- /doc/_static/hustle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tspurway/hustle/e62bf1269b446ea6fae23bc5698f845a2f3247c7/doc/_static/hustle.png -------------------------------------------------------------------------------- /doc/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {%- block extrahead %} 4 | {{ super() }} 5 | 10 | {% endblock %} 11 | 12 | {% block footer %} 13 | {{ super() }} 14 | 25 | {% endblock %} 26 | 27 | 
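Before the generated API reference below, a hedged smoke test for the dependency build: each extension declared in deps/setup.py above should be importable once the extensions have been compiled (e.g. via bootstrap.sh or a `build_ext` run; the exact invocation is assumed):

    # each module name comes from deps/setup.py's ext_modules list
    import scamurmur3, cardunion, pyebset, maxhash, clzf, clz4
    import rtrie, wtrie, pylru, mdb

    # murmur3_x86_32(key, seed=0) is defined in scamurmur3/scamurmur3.c
    print(scamurmur3.murmur3_x86_32("hustle"))
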
-------------------------------------------------------------------------------- /doc/api/core.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: hustle.core.marble 2 | :members: -------------------------------------------------------------------------------- /doc/api/hustle.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: hustle 2 | :members: -------------------------------------------------------------------------------- /doc/howto/cli.rst: -------------------------------------------------------------------------------- 1 | .. _cliguide: 2 | 3 | Hustle Command Line Interface (CLI) 4 | =================================== 5 | 6 | After installing Hustle, you can invoke the Hustle CLI from the installation directory like this:: 7 | 8 | bin/hustle 9 | 10 | Assuming you've installed everything and have a running and correctly configured *Disco* instance, you will get a 11 | Python prompt looking something like this:: 12 | 13 | ➜ bin git:(develop) ✗ ./hustle 14 | Loading Hustle Tables from disco://localhost 15 | impressions 16 | pixels 17 | Welcome to Hustle! Type `commands()` or `tables()` for some help, `exit()` to leave. 18 | >>> 19 | 20 | We see here that the CLI has loaded the Hustle tables called *impressions* and *pixels* from the 21 | *disco://localhost* cluster. The CLI loads these into Python's global namespace, so that these 22 | :class:`Tables ` are instantiated and bound to their table names as Python variables:: 23 | 24 | >>> schema(impressions) 25 | ad_id (int32,IX) cpm_millis (uint32) date (string,IX,PT) 26 | site_id (dict(32),IX) time (uint32,IX) token (string,IX) 27 | url (dict(32)) 28 | 29 | gives the *schema* of the *impressions* table. Doing a query is just as simple:: 30 | 31 | >>> select(impressions.ad_id, h_sum(impressions.cpm_millis), where=impressions.date == '2014-01-20') 32 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 33 | ad_id sum(cpm_millis) 34 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 35 | 30,016 1,690 36 | 30,003 925 37 | 30,019 2,023 38 | 30,024 1,511 39 | 30,009 863 40 | 30,025 3,124 41 | 30,010 2,555 42 | 30,011 2,150 43 | 30,014 4,491 44 | 45 | 46 | The CLI offers the following features over and above being a 'normal' Python REPL: 47 | - configurable command history 48 | - no *import* statements required to load Hustle functionality 49 | - auto-completion (with TAB key) of all Hustle functions, Tables, and Columns 50 | - query results (from :func:`select `) are automatically sent to *stdout* 51 | -------------------------------------------------------------------------------- /doc/howto/configure.rst: -------------------------------------------------------------------------------- 1 | ..
_configureguide: 2 | 3 | Configuring Hustle 4 | ================== 5 | 6 | Hustle has a configuration file located at:: 7 | 8 | /etc/hustle/settings.yaml 9 | 10 | which supports the following settings: 11 | 12 | ============== ============================== ============================================== 13 | Name Default Value Description 14 | ============== ============================== ============================================== 15 | server disco://localhost The Disco master node 16 | worker_class hustle.core.pipeworker.Worker The Disco Worker class 17 | dump False True will automatically print select() results 18 | nest False True will return a Table from select() 19 | partition 16 The number of partitions for restrict-select 20 | history_size 1000 The number of history entries in the CLI 21 | ============== ============================== ============================================== 22 | 23 | -------------------------------------------------------------------------------- /doc/howto/delete.rst: -------------------------------------------------------------------------------- 1 | .. _deleteguide: 2 | 3 | Deleting Data in Hustle 4 | ======================= 5 | Deleting data in Hustle is partition-oriented, which means you *can't* remove specific rows the way conventional database systems do. There are two functions for this, each with a different granularity. 6 | 7 | Delete 8 | ------ 9 | The :func:`delete() ` function *only* deletes data; it keeps the table definition. If a :class:`Table ` object is specified, all data in that table will be deleted. To delete a particular range of partitions, pass it an :class:`Expr `, for example, "impressions.date < '2014-01-01'". 10 | 11 | .. seealso:: 12 | 13 | :func:`hustle.delete` 14 | Hustle's delete statement 15 | 16 | :ref:`schemadesign` 17 | Details of the Hustle Partition 18 | 19 | Drop 20 | ---- 21 | Use the :func:`drop() ` function to delete the whole table, including data, all partitions, and the table definition. Unlike :func:`delete() `, it *only* takes a :class:`Table ` object to specify the table you want to drop. 22 | 23 | .. seealso:: 24 | 25 | :func:`hustle.drop` 26 | Hustle's drop statement 27 | -------------------------------------------------------------------------------- /doc/howto/insert.rst: -------------------------------------------------------------------------------- 1 | .. _insertguide: 2 | 3 | Inserting Data To Hustle 4 | ======================== 5 | 6 | The process of inserting data into a Hustle cluster is referred to as a *distributed* insert. It is 7 | distributed because the client machine does the heavy lifting of creating a 8 | :class:`Marble `, a self-contained, large-grained database fragment, which 9 | is then `pushed into the distributed file system DDFS `_, 10 | a relatively inexpensive HTTP operation. The write throughput to the Hustle cluster, then, is bound only 11 | by the number of machines inserting into it. 12 | 13 | Hustle currently supports `one JSON object per line `_ style input, as well as 14 | `Disco's native results format `_. 15 | 16 | Here is an example insert:: 17 | 18 | from hustle import Table, insert 19 | impressions = Table.from_tag('impressions') 20 | insert(impressions, './impressions-june-8.json', server='disco://hustle') 21 | 22 | 
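The input file here is plain newline-delimited JSON: one object per line, with keys matching the table's column names. A hypothetical two-line fragment for the *impressions* table (field names follow the schema shown in the CLI guide; the values are made up) might look like::

    {"token": "a1b2c3", "url": "http://example.com/x", "ad_id": 30010, "cpm_millis": 1250, "date": "2014-06-08", "time": 123456}
    {"token": "d4e5f6", "url": "http://example.org/y", "ad_id": 30003, "cpm_millis": 925, "date": "2014-06-08", "time": 140212}
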
Hustle provides a command-line tool for inserting data located at :code:`bin/insert`. Here is the *--help* for 23 | it:: 24 | 25 | ➜ hustle/bin > ./insert --help 26 | usage: insert [-h] [-s SERVER] [-f INFILE] [-m MAXSIZE] [-t TMPDIR] 27 | [-p PROCESSOR] [--disco-chunk] 28 | TABLE FILES [FILES ...] 29 | 30 | Hustle bulk load 31 | 32 | positional arguments: 33 | TABLE The Hustle table to insert to 34 | FILES The JSON formatted files to insert 35 | 36 | optional arguments: 37 | -h, --help show this help message and exit 38 | -s SERVER, --server SERVER 39 | DDFS server destination 40 | -f INFILE A file containing a list of all files to be inserted 41 | -m MAXSIZE, --maxsize MAXSIZE 42 | Initial size of Hustle marble 43 | -t TMPDIR, --tmpdir TMPDIR 44 | Temporary directory for Hustle marble creation 45 | -p PROCESSOR a module.function for the Hustle import preprocessor 46 | --disco-chunk Indicates whether the input files are in Disco CHUNK format 47 | 48 | .. seealso:: 49 | 50 | Page :ref:`integrationtests` 51 | Hustle's Integration Test Suite for creating and inserting to partitioned Tables. 52 | 53 | :func:`insert() function ` 54 | 55 | -------------------------------------------------------------------------------- /doc/howto/integration_tests.rst: -------------------------------------------------------------------------------- 1 | .. _integrationtests: 2 | 3 | Hustle Integration Test Suite 4 | ============================= 5 | 6 | The Hustle Integration Test suite is a good place to see non-trivial Hustle Tables created, 7 | data inserted into them, and some subsequent queries. They are located in:: 8 | 9 | hustle/integration_test 10 | 11 | To run the test suite, ensure you have installed `Nose `_ and 12 | :ref:`Hustle `. Before you run the integration tests, you will need to make sure 13 | `Disco `_ is running and that you have run the *setup.py* script once:: 14 | 15 | python hustle/integration_test/setup.py 16 | 17 | You can then execute the *nosetests* in the integration suite:: 18 | 19 | cd hustle/integration_test 20 | nosetests 21 | 22 | -------------------------------------------------------------------------------- /doc/hustle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tspurway/hustle/e62bf1269b446ea6fae23bc5698f845a2f3247c7/doc/hustle.png -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | Hustle Documentation 2 | ==================== 3 | 4 | Hustle is a distributed, column oriented, relational 5 | `OLAP Database `_. Hustle supports parallel insertions 6 | and queries over large data sets, stored on an unreliable cluster of computers. It is meant to load and query the 7 | enormous data sets typical of ad-tech, high volume web services, and other large-scale analytics applications. 8 | 9 | Hustle is a distributed database. When data is inserted into Hustle, it is replicated across a cluster to enhance 10 | availability and horizontal scalability, and to enable parallel query execution. When data is replicated on multiple nodes, 11 | your database becomes resistant to node failure because there are always multiple copies of it on the cluster. This 12 | allows you to simply add more machines to increase overall storage and to decrease query time by performing 13 | more operations in parallel. 14 | 15 | Hustle is a relational database, so, unlike other NoSQL databases, it stores its data in rows and columns with a fixed 16 | schema. This means that you must *create* Tables with a fixed number of Columns of specific data types before 17 | *inserting* data into the database. The advantage of this is that both storage and query execution can be 18 | fine-tuned to minimize both the data footprint and the query execution time.
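For example, a table resembling the *impressions* table used throughout these docs could be declared like this (a sketch; the column-declaration strings follow the syntax used in the integration test suite)::

    from hustle import Table
    impressions = Table.create('impressions',
                               columns=['wide index string token', 'trie url',
                                        'index trie site_id', 'uint cpm_millis',
                                        'index int ad_id', 'index string date',
                                        'index uint time'],
                               partition='date',
                               force=True)
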
This means that you must *create* Tables with a fixed number of Columns of specific data types, before 17 | *inserting* data into the database. The advantage of this is that both storage and query execution can be 18 | fine tuned to minimize both the data footprint and the query execution time. 19 | 20 | Hustle uses a `column oriented format `_ for storing data. This 21 | scheme is often used for very large databases, as it is more efficient for aggregation operations such as sum() and 22 | average() functions over a particular column as well as relational *joins* across tables. 23 | 24 | Although Hustle has a relational data model, it is not a SQL database. Hustle extends the Python language for 25 | its relational query facility. Let's take a look at a typical Hustle query in Python:: 26 | 27 | select(impressions.ad_id, h_sum(pixels.amount), h_count(), 28 | where=(impressions.date < '2014-01-13', pixels.date < '2014-01-13'), 29 | join=(impressions.site_id, pixels.site_id), 30 | order_by='ad_id', desc=True) 31 | 32 | which would be equivalent to the SQL query:: 33 | 34 | SELECT i.ad_id, i.site_id, sum(p.amount), count(*) 35 | FROM impressions i 36 | JOIN pixels p on p.site_id = p.site_id 37 | WHERE i.date < '2014-01-13' and p.date < '2014-01-13' 38 | ORDER BY i.ad_id DESC 39 | GROUP BY i.ad_id, i.site_id 40 | 41 | The two approaches seem equivalent, however, Python is extensible, whereas SQL is not. You can do much more 42 | with Hustle than just query data. Hustle was designed to express distributed computation over indexed data which 43 | includes, but is not limited to the classic relational *select* statement. SQL is good at queries, not as an ecosystem 44 | for general purpose data-centric distributed computation. 45 | 46 | Hustle is meant for large, distributed inserts, and has *append only* semantics. It is suited to very large *log* 47 | file style inputs, and once data is inserted, it cannot be changed. This scheme is typically suitable for 48 | distributed applications that generate large log files, with many (possibly hundreds of) thousands of events 49 | per second. Hustle has been streamlined to accept structured JSON log files as its primary input format, and to 50 | perform *distributed* inserts. A distributed insert delegates most of the database creation work to the *client*, 51 | thereby freeing up the cluster's resources and avoiding a central computational pinch point like in other *write bound* 52 | relational OLAP databases. Hustle can easily handle almost unlimited write load using this scheme. 53 | 54 | Hustle utilizes modern compression and indexing data structures and algorithms to minimize overall memory footprint 55 | and to maximize query performance. It utilizes bitmap indexes, prefix trie (dictionary) and lz4 compression, and has a 56 | very rich set of string and numeric data types of various sizes. Typically, Hustle data sets are 25% to 50% than 57 | their equivalent GZIPed JSON sources. 
45 | 46 | Hustle is meant for large, distributed inserts, and has *append only* semantics. It is suited to very large *log* 47 | file style inputs, and once data is inserted, it cannot be changed. This scheme is typically suitable for 48 | distributed applications that generate large log files, with many (possibly hundreds of) thousands of events 49 | per second. Hustle has been streamlined to accept structured JSON log files as its primary input format, and to 50 | perform *distributed* inserts. A distributed insert delegates most of the database creation work to the *client*, 51 | thereby freeing up the cluster's resources and avoiding the central computational pinch point found in other *write bound* 52 | relational OLAP databases. Hustle can easily handle almost unlimited write load using this scheme. 53 | 54 | Hustle utilizes modern compression and indexing data structures and algorithms to minimize overall memory footprint 55 | and to maximize query performance. It utilizes bitmap indexes, prefix trie (dictionary) and lz4 compression, and has a 56 | very rich set of string and numeric data types of various sizes. Typically, Hustle data sets are 25% to 50% of the size of 57 | their equivalent GZIPed JSON sources. 58 | 59 | Hustle has several auxiliary tools: 60 | 61 | * a command line interface (CLI) Python shell with auto-completion of Hustle tables and functions 62 | * a client side insert script 63 | 64 | Features 65 | -------- 66 | 67 | * column oriented - super fast queries 68 | * distributed insert - Hustle is designed for petabyte scale datasets in a distributed environment with massive write loads 69 | * compressed - bitmap indexes, lz4, and prefix trie compression 70 | * relational - join gigantic data sets 71 | * partitioned - smart shards 72 | * embarrassingly distributed (`based on Disco `_) 73 | * embarrassingly fast (`uses LMDB `_) 74 | * NoSQL - Python DSL 75 | * bulk append only semantics 76 | * highly available, horizontally scalable 77 | * REPL/CLI query interface 78 | 79 | Getting started 80 | --------------- 81 | 82 | .. toctree:: 83 | :titlesonly: 84 | 85 | start/install 86 | .. start/tutorial 87 | 88 | Hustle In Depth 89 | --------------- 90 | 91 | .. toctree:: 92 | :titlesonly: 93 | 94 | howto/integration_tests 95 | howto/configure 96 | howto/cli 97 | howto/schema 98 | howto/query 99 | howto/insert 100 | howto/delete 101 | 102 | Reference 103 | --------- 104 | 105 | .. toctree:: 106 | :titlesonly: 107 | 108 | api/hustle 109 | api/core 110 | -------------------------------------------------------------------------------- /doc/start/install.rst: -------------------------------------------------------------------------------- 1 | .. _installguide: 2 | 3 | Installing Hustle 4 | ================= 5 | 6 | Hustle is hosted on `GitHub `_ and should be cloned from that repo:: 7 | 8 | git clone git@github.com:chango/hustle.git 9 | 10 | Dependencies 11 | ------------ 12 | 13 | Hustle has the following dependencies: 14 | 15 | * you will need `Python 2.7 `_ 16 | * you will need `Cython `_ 17 | * you will need `Disco 0.5 `_ 18 | * you will need `ultrajson `_ 19 | * you will need `PyYAML `_ 20 | 21 | Installing the Hustle Client 22 | ---------------------------- 23 | 24 | In order to run Hustle, you will need to install it onto an existing *Disco v0.5* cluster. 25 | 26 | To query a Hustle/Disco cluster, you will also need to install the Hustle software on the *client* machine:: 27 | 28 | cd hustle 29 | sudo ./bootstrap.sh 30 | 31 | This will build and install Hustle on your client machine. 32 | 33 | Installing on the Cluster 34 | ------------------------- 35 | 36 | Disco is a distributed system and may have many nodes. Each of the nodes in your Disco cluster will need 37 | the Hustle dependencies installed. These can be found in the *hustle/deps* directory. The easiest way to install Hustle on 38 | your Disco slave nodes is to:: 39 | 40 | cd hustle/deps 41 | make 42 | sudo make install 43 | 44 | on **ALL** your Disco slave nodes. 45 | 46 | You may now want to go and run the :ref:`Integration Tests ` to validate your installation. 47 | -------------------------------------------------------------------------------- /doc/start/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | Hustle Tutorial 4 | =============== 5 | 6 | coming soon....
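In the meantime, a minimal end-to-end session, stitched together from the install, insert, and CLI guides (paths and table names here are illustrative), looks like::

    from hustle import Table, insert, select, h_sum
    impressions = Table.from_tag('impressions')
    insert(impressions, './impressions-june-8.json', server='disco://localhost')
    select(impressions.ad_id, h_sum(impressions.cpm_millis),
           where=impressions.date == '2014-01-20')
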
-------------------------------------------------------------------------------- /hustle/cardinality.py: -------------------------------------------------------------------------------- 1 | from hustle.core.marble import Aggregation, Column 2 | 3 | import mdb 4 | 5 | 6 | def h_cardinality(col): 7 | """Estimate the distinct-value count of a HyperLogLog (Cardunion) column.""" 8 | 9 | 10 | def _inner_default(): 11 | from cardunion import Cardunion 12 | return Cardunion(12) 13 | 14 | def _inner_hll_accumulate(a, v): 15 | a.bunion([v]) 16 | return a 17 | 18 | return Aggregation("cardinality", 19 | col, 20 | f=_inner_hll_accumulate, 21 | g=lambda a: a.count(), 22 | h=lambda a: a.dumps(), 23 | default=_inner_default, 24 | result_spec=Column('_cardinality_type', type_indicator=mdb.MDB_UINT_32)) 25 | 26 | 27 | def h_union(col): 28 | def _inner_default(): 29 | from cardunion import Cardunion 30 | return Cardunion(12) 31 | 32 | def _inner_hll_accumulate(a, v): 33 | a.bunion([v]) 34 | return a 35 | 36 | return Aggregation("union", 37 | col, 38 | f=_inner_hll_accumulate, 39 | g=lambda a, c: a.dumps(), 40 | h=lambda a: a.dumps(), 41 | default=_inner_default, 42 | result_spec=Column('_union_type', type_indicator=mdb.MDB_STR, compression_indicator=3)) 43 | 44 | 45 | def h_minhash_merge(col): 46 | def _inner_default(): 47 | from maxhash import MaxHash 48 | return MaxHash() 49 | 50 | def _inner_maxhash_accumulate(a, v): 51 | from maxhash import MaxHash 52 | a.merge(MaxHash.loads(v)) 53 | return a 54 | 55 | return Aggregation("minhash_merge", 56 | col, 57 | f=_inner_maxhash_accumulate, 58 | g=lambda a, c: a.dumps(), 59 | h=lambda a: a.dumps(), 60 | default=_inner_default, 61 | result_spec=Column('_minhash_merge_type', type_indicator=mdb.MDB_STR, compression_indicator=3)) 62 | -------------------------------------------------------------------------------- /hustle/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tspurway/hustle/e62bf1269b446ea6fae23bc5698f845a2f3247c7/hustle/core/__init__.py -------------------------------------------------------------------------------- /hustle/core/column_fn.py: -------------------------------------------------------------------------------- 1 | from hustle.core.marble import Column 2 | 3 | import mdb 4 | 5 | 6 | class ColumnFn(object): 7 | """ 8 | Decorator for column functions. 9 | 10 | Note that the new column produced by a column function inherits all properties from the 11 | original column by default. If the column function changes the data type 12 | of the original column, remember to override the corresponding indicators, 13 | i.e. type_indicator, index_indicator, rtrie_indicator, compression_indicator, 14 | and boolean. This matters when you want to store the query result back to 15 | the database.
The specific indicators are as follows: 16 | 17 | ======================== ================== 18 | type_indicator Description 19 | ======================== ================== 20 | mdb.MDB_STR String 21 | mdb.MDB_INT_8/16/32/64 Integer 22 | mdb.MDB_UINT_8/16/32/64 Unsigned Integer 23 | ======================== ================== 24 | 25 | ===================== =========== 26 | compression_indicator Description 27 | ===================== =========== 28 | 0 Trie 29 | 1 String 30 | 2 LZ4 31 | 3 Binary 32 | ===================== =========== 33 | 34 | =============== ===================== 35 | rtrie_indicator Description 36 | =============== ===================== 37 | mdb.MDB_UINT_16 16 bit Trie 38 | mdb.MDB_UINT_32 32 bit Trie (default) 39 | =============== ===================== 40 | 41 | ======= ============ 42 | boolean Description 43 | ======= ============ 44 | True Boolean Type 45 | ======= ============ 46 | 47 | =============== =============== 48 | index_indicator Description 49 | =============== =============== 50 | 1 index (default) 51 | 2 wide index 52 | =============== =============== 53 | """ 54 | def __init__(self, 55 | type_indicator=None, 56 | index_indicator=None, 57 | compression_indicator=None, 58 | rtrie_indicator=None, 59 | boolean=None): 60 | self.type_indicator = type_indicator 61 | self.index_indicator = index_indicator 62 | self.compression_indicator = compression_indicator 63 | self.rtrie_indicator = rtrie_indicator 64 | self.boolean = boolean 65 | 66 | def __call__(self, fn): 67 | def wrap(column): 68 | index_indicator = self.index_indicator if self.index_indicator is \ 69 | not None else column.index_indicator 70 | type_indicator = self.type_indicator if self.type_indicator is \ 71 | not None else column.type_indicator 72 | rtrie_indicator = self.rtrie_indicator if self.rtrie_indicator is \ 73 | not None else column.rtrie_indicator 74 | compression_indicator = self.compression_indicator if \ 75 | self.compression_indicator is not None else column.compression_indicator 76 | is_boolean = self.boolean if self.boolean is not None else column.is_boolean 77 | 78 | new_column = Column(column.name, column.table, index_indicator, 79 | column.partition, type_indicator, compression_indicator, 80 | rtrie_indicator, alias=column.alias, boolean=is_boolean, 81 | column_fn=fn) 82 | return new_column 83 | return wrap 84 | 85 | 86 | # column functions defined as follows: 87 | 88 | @ColumnFn(type_indicator=mdb.MDB_STR) 89 | def ip_ntoa(val): 90 | import socket 91 | import struct 92 | try: 93 | ip = socket.inet_ntoa(struct.pack(">L", val)) 94 | except: 95 | ip = "0.0.0.0" 96 | return ip 97 | 98 | 99 | @ColumnFn(type_indicator=mdb.MDB_UINT_32) 100 | def ip_aton(val): 101 | import socket 102 | import struct 103 | try: 104 | ip = struct.unpack(">L", socket.inet_aton(val))[0] 105 | except: 106 | ip = 0 107 | return ip 108 | 109 |
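# Usage sketch: a column function simply wraps a column inside a query. The
# example below mirrors integration_test/test_column_fn.py ('ips' is a table
# with a uint32 'ip' column):
#
#   from hustle import select, Table
#   from hustle.core.column_fn import ip_ntoa
#   ips = Table.from_tag('ips')
#   select(ips.exchange_id, ip_ntoa(ips.ip), where=ips.exchange_id == 'Adx')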
110 | @ColumnFn(type_indicator=mdb.MDB_INT_16) 111 | def year(val): 112 | """ 113 | extract YEAR from "YYYY-MM-DD". Return -1 if parsing fails. 114 | """ 115 | try: 116 | year = int(val[:4]) 117 | except: 118 | year = -1 119 | return year 120 | 121 | 122 | @ColumnFn(type_indicator=mdb.MDB_INT_8) 123 | def month(val): 124 | """ 125 | extract MONTH from "YYYY-MM-DD". Return -1 if parsing fails. 126 | """ 127 | try: 128 | month = int(val[5:7]) 129 | except: 130 | month = -1 131 | return month 132 | 133 | 134 | @ColumnFn(type_indicator=mdb.MDB_INT_8) 135 | def day(val): 136 | """ 137 | extract DAY from "YYYY-MM-DD". Return -1 if parsing fails. 138 | """ 139 | try: 140 | day = int(val[-2:]) 141 | except: 142 | day = -1 143 | return day 144 | 145 | 146 | # the old school way to write a column function 147 | # def ip_ntoa(column): 148 | # "column function for converting an integer IPv4 to a string" 149 | # import mdb 150 | # if column.type_indicator != mdb.MDB_UINT_32: 151 | # raise TypeError("Specified column should be of the uint_32 type") 152 | # new_column = Column(column.name, column.table, column.index_indicator, 153 | # column.partition, mdb.MDB_STR, compression_indicator=0, 154 | # rtrie_indicator=column.rtrie_indicator, alias=column.alias, 155 | # boolean = False, column_fn=_ip_ntoa) 156 | # return new_column 157 | -------------------------------------------------------------------------------- /hustle/core/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from disco.ddfs import DDFS 3 | from disco.core import Disco 4 | 5 | 6 | def guess_settings(): 7 | for settings_file in (os.path.expanduser('~/.hustle'), 8 | '/etc/hustle/settings.yaml'): 9 | if os.path.exists(settings_file): 10 | return settings_file 11 | return '' 12 | 13 | 14 | defaults = { 15 | 'settings_file': guess_settings(), 16 | 'server': 'disco://localhost', 17 | 'nest': False, 18 | 'dump': False, 19 | 'worker_class': 'disco.worker.classic.worker.Worker', 20 | 'partition': 16, 21 | 'history_size': 1000 22 | } 23 | 24 | overrides = {} 25 | 26 | 27 | class Settings(dict): 28 | def __init__(self, *args, **kwargs): 29 | # load the defaults 30 | super(Settings, self).update(defaults) 31 | 32 | # override with the settings file 33 | path = kwargs.get('settings_file') or self['settings_file'] 34 | if path and os.path.exists(path): 35 | try: 36 | import yaml 37 | self.update(yaml.load(open(path))) 38 | except: 39 | pass # if ya can't ya can't 40 | 41 | # final overrides 42 | super(Settings, self).update(overrides) 43 | super(Settings, self).__init__(*args, **kwargs) 44 | 45 | # set up ddfs and disco 46 | if not self['server'].startswith('disco://'): 47 | self['server'] = 'disco://' + self['server'] 48 | 49 | if 'ddfs' not in self: 50 | self['ddfs'] = DDFS(self['server']) 51 | self['server'] = Disco(self['server']) 52 | 53 | # set up worker 54 | if 'worker' not in self: 55 | worker_mod, _, worker_class = self['worker_class'].rpartition('.') 56 | mod = __import__(worker_mod, {}, {}, worker_mod) 57 | self['worker'] = getattr(mod, worker_class)() 58 | -------------------------------------------------------------------------------- /hustle/core/stat.py: -------------------------------------------------------------------------------- 1 | from disco.core import Job 2 | from disco.worker.task_io import task_input_stream 3 | from hustle.core.pipeworker import Worker, HustleStage 4 | 5 | import hustle 6 | import hustle.core 7 | import hustle.core.marble 8 | 9 | 10 | def stat_input_stream(fd, size, url, params): 11 | from disco import util 12 | from hustle.core.marble import MarbleStream 13 | 14 | try: 15 | scheme, netloc, rest = util.urlsplit(url) 16 | except Exception as e: 17 | print "Error handling stat_input_stream for %s. 
%s" % (url, e) 18 | raise e 19 | 20 | otab = None 21 | try: 22 | # print "FLurlG: %s" % url 23 | fle = util.localize(rest, disco_data=params._task.disco_data, 24 | ddfs_data=params._task.ddfs_data) 25 | # print "FLOGLE: %s" % fle 26 | otab = MarbleStream(fle) 27 | rows = otab.number_rows 28 | frows = float(rows) 29 | rval = {'_': rows, } 30 | for field, (subdb, subindexdb, _, column, _) in otab.dbs.iteritems(): 31 | if subindexdb: 32 | rval[field] = subindexdb.stat(otab.txn)['ms_entries'] / frows 33 | yield '', rval 34 | except Exception as e: 35 | print "Gibbers: %s" % e 36 | raise e 37 | finally: 38 | if otab: 39 | otab.close() 40 | 41 | 42 | class StatPipe(Job): 43 | required_modules = [ 44 | ('hustle', hustle.__file__), 45 | ('hustle.core', hustle.core.__file__), 46 | ('hustle.core.marble', hustle.core.marble.__file__)] 47 | 48 | def __init__(self, master): 49 | 50 | super(StatPipe, self).__init__(master=master, worker=Worker()) 51 | self.pipeline = [('split', 52 | HustleStage('stat', 53 | process=process_stat, 54 | input_chain=[task_input_stream, 55 | stat_input_stream]))] 56 | 57 | 58 | def process_stat(interface, state, label, inp, task): 59 | from disco import util 60 | 61 | # inp contains a set of replicas, let's force local #HACK 62 | input_processed = False 63 | for i, inp_url in inp.input.replicas: 64 | scheme, (netloc, port), rest = util.urlsplit(inp_url) 65 | if netloc == task.host: 66 | input_processed = True 67 | inp.input = inp_url 68 | break 69 | 70 | if not input_processed: 71 | raise Exception("Input %s not processed, no LOCAL resource found." 72 | % str(inp.input)) 73 | 74 | for key, value in inp: 75 | interface.output(0).add(key, value) 76 | -------------------------------------------------------------------------------- /hustle/core/util.py: -------------------------------------------------------------------------------- 1 | from disco import func 2 | from disco import util 3 | from disco.settings import DiscoSettings 4 | 5 | import collections 6 | 7 | 8 | class Peekable(object): 9 | def __init__(self, iterable): 10 | self._iterable = iter(iterable) 11 | self._cache = collections.deque() 12 | 13 | def __iter__(self): 14 | return self 15 | 16 | def _fillcache(self, n): 17 | if n is None: 18 | n = 1 19 | while len(self._cache) < n: 20 | self._cache.append(self._iterable.next()) 21 | 22 | def next(self, n=None): 23 | self._fillcache(n) 24 | if n is None: 25 | result = self._cache.popleft() 26 | else: 27 | result = [self._cache.popleft() for i in range(n)] 28 | return result 29 | 30 | def peek(self, n=None): 31 | self._fillcache(n) 32 | if n is None: 33 | result = self._cache[0] 34 | else: 35 | result = [self._cache[i] for i in range(n)] 36 | return result 37 | 38 | 39 | class SortedIterator(object): 40 | 41 | def __init__(self, inputs): 42 | ins = [Peekable(input) for input in inputs] 43 | self.collection = sorted(ins, key=self._key) 44 | 45 | def __iter__(self): 46 | return self 47 | 48 | def next(self): 49 | removes = [] 50 | reinsert = None 51 | rval = None 52 | for stream in self.collection: 53 | try: 54 | rval = stream.next() 55 | reinsert = stream 56 | break 57 | except StopIteration: 58 | removes.append(stream) 59 | 60 | if rval: 61 | for remove in removes: 62 | self.collection.remove(remove) 63 | if reinsert: 64 | self.collection.remove(reinsert) 65 | try: 66 | reinsert.peek() 67 | except: 68 | pass 69 | else: 70 | removes = [] 71 | reinsert_index = 0 72 | for stream in self.collection: 73 | try: 74 | stream.peek() 75 | if self._key(reinsert) < 
self._key(stream): 76 | break 77 | except: 78 | removes.append(stream) 79 | reinsert_index += 1 80 | self.collection.insert(reinsert_index, reinsert) 81 | for remove in removes: 82 | self.collection.remove(remove) 83 | return rval 84 | raise StopIteration 85 | 86 | def _key(self, stream): 87 | try: 88 | key, value = stream.peek() 89 | return tuple(key) 90 | except StopIteration: 91 | return tuple() 92 | 93 | 94 | def sorted_iterator(urls, 95 | reader=func.chain_reader, 96 | input_stream=(func.map_input_stream,), 97 | notifier=func.notifier, 98 | params=None, 99 | ddfs=None): 100 | 101 | from disco.worker import Input 102 | from disco.worker.classic.worker import Worker 103 | 104 | worker = Worker(map_reader=reader, map_input_stream=input_stream) 105 | settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings() 106 | 107 | inputs = [] 108 | for input in util.inputlist(urls, settings=settings): 109 | notifier(input) 110 | instream = Input(input, open=worker.opener('map', 'in', params)) 111 | if instream: 112 | inputs.append(instream) 113 | 114 | return SortedIterator(inputs) 115 | 116 | 117 | def ensure_list(val): 118 | if not isinstance(val, list): 119 | if isinstance(val, (tuple, set)): 120 | return list(val) 121 | return [val] 122 | return val 123 | -------------------------------------------------------------------------------- /integration_test/README: -------------------------------------------------------------------------------- 1 | Make sure you have Disco (0.5 or later) running locally. 2 | 3 | Make sure you've installed the ../deps correctly. 4 | 5 | Make sure you run setup.py to create all of the tables the tests will need. 6 | 7 | Run the test from within this directory using nose. 8 | 9 | Good luck, and please let us know if you are having problems or have feedback! 
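A typical session, assuming Disco is running at disco://localhost:

    cd integration_test
    python setup.py
    nosetests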
10 | 11 | tspurway AT gmail D0T com 12 | 13 | -------------------------------------------------------------------------------- /integration_test/fixtures/ip.json: -------------------------------------------------------------------------------- 1 | {"ip":3221291265,"exchange_id":"OpenX"} 2 | {"ip":2130706433,"exchange_id":"OpenX"} 3 | {"ip":3232235777,"exchange_id":"Rubycon"} 4 | {"ip":3232235777,"exchange_id":"Appnexus"} 5 | {"ip":3221291265,"exchange_id":"OpenX"} 6 | {"ip":3221291266,"exchange_id":"Rubycon"} 7 | {"ip":3232235777,"exchange_id":"OpenX"} 8 | {"ip":3221291266,"exchange_id":"Appnexus"} 9 | {"ip":3221291265,"exchange_id":"OpenX"} 10 | {"ip":3232235777,"exchange_id":"OpenX"} 11 | {"ip":3221291265,"exchange_id":"Appnexus"} 12 | {"ip":2130706433,"exchange_id":"Rubycon"} 13 | {"ip":2130706433,"exchange_id":"OpenX"} 14 | {"ip":3221291266,"exchange_id":"Rubycon"} 15 | {"ip":2130706433,"exchange_id":"Appnexus"} 16 | {"ip":3232235777,"exchange_id":"Rubycon"} 17 | {"ip":3232235777,"exchange_id":"Rubycon"} 18 | {"ip":2130706433,"exchange_id":"Adx"} 19 | {"ip":3232235777,"exchange_id":"OpenX"} 20 | {"ip":2130706433,"exchange_id":"Adx"} 21 | {"ip":2130706433,"exchange_id":"Adx"} 22 | {"ip":3232235777,"exchange_id":"Appnexus"} 23 | {"ip":3232235777,"exchange_id":"OpenX"} 24 | {"ip":2130706433,"exchange_id":"OpenX"} 25 | {"ip":2130706433,"exchange_id":"Adx"} 26 | {"ip":3232235777,"exchange_id":"Adx"} 27 | {"ip":3221291265,"exchange_id":"Appnexus"} 28 | {"ip":3232235777,"exchange_id":"OpenX"} 29 | {"ip":3232235777,"exchange_id":"Appnexus"} 30 | {"ip":3221291266,"exchange_id":"Rubycon"} 31 | {"ip":3221291265,"exchange_id":"OpenX"} 32 | {"ip":3221291266,"exchange_id":"Appnexus"} 33 | {"ip":3221291265,"exchange_id":"Rubycon"} 34 | {"ip":3221291265,"exchange_id":"Adx"} 35 | {"ip":3221291266,"exchange_id":"Adx"} 36 | {"ip":3221291265,"exchange_id":"OpenX"} 37 | {"ip":3221291265,"exchange_id":"Appnexus"} 38 | {"ip":3221291266,"exchange_id":"OpenX"} 39 | {"ip":3221291265,"exchange_id":"OpenX"} 40 | {"ip":3221291266,"exchange_id":"Adx"} 41 | {"ip":3221291266,"exchange_id":"Appnexus"} 42 | {"ip":3221291265,"exchange_id":"OpenX"} 43 | {"ip":3221291266,"exchange_id":"OpenX"} 44 | {"ip":3232235777,"exchange_id":"Appnexus"} 45 | {"ip":3232235777,"exchange_id":"Rubycon"} 46 | {"ip":2130706433,"exchange_id":"Adx"} 47 | {"ip":3232235777,"exchange_id":"Adx"} 48 | {"ip":2130706433,"exchange_id":"OpenX"} 49 | {"ip":2130706433,"exchange_id":"Appnexus"} 50 | {"ip":3232235777,"exchange_id":"Adx"} 51 | {"ip":3221291265,"exchange_id":"Appnexus"} 52 | {"ip":2130706433,"exchange_id":"Appnexus"} 53 | {"ip":3232235777,"exchange_id":"Rubycon"} 54 | {"ip":3221291265,"exchange_id":"Appnexus"} 55 | {"ip":3232235777,"exchange_id":"Adx"} 56 | {"ip":3221291266,"exchange_id":"OpenX"} 57 | {"ip":3221291265,"exchange_id":"Rubycon"} 58 | {"ip":2130706433,"exchange_id":"OpenX"} 59 | {"ip":3232235777,"exchange_id":"Adx"} 60 | {"ip":3232235777,"exchange_id":"Rubycon"} 61 | {"ip":2130706433,"exchange_id":"OpenX"} 62 | {"ip":3221291266,"exchange_id":"Rubycon"} 63 | {"ip":2130706433,"exchange_id":"Appnexus"} 64 | {"ip":3221291265,"exchange_id":"Adx"} 65 | {"ip":3232235777,"exchange_id":"Adx"} 66 | {"ip":3221291265,"exchange_id":"OpenX"} 67 | {"ip":3221291265,"exchange_id":"Appnexus"} 68 | {"ip":3221291265,"exchange_id":"Appnexus"} 69 | {"ip":3221291265,"exchange_id":"Adx"} 70 | {"ip":2130706433,"exchange_id":"Adx"} 71 | {"ip":3221291266,"exchange_id":"Appnexus"} 72 | {"ip":3221291265,"exchange_id":"Appnexus"} 73 | 
{"ip":3232235777,"exchange_id":"OpenX"} 74 | {"ip":3221291266,"exchange_id":"OpenX"} 75 | {"ip":3221291266,"exchange_id":"Rubycon"} 76 | {"ip":3221291265,"exchange_id":"OpenX"} 77 | {"ip":3221291266,"exchange_id":"Adx"} 78 | {"ip":3221291266,"exchange_id":"OpenX"} 79 | {"ip":3221291265,"exchange_id":"Adx"} 80 | {"ip":3221291265,"exchange_id":"Adx"} 81 | {"ip":2130706433,"exchange_id":"Appnexus"} 82 | {"ip":2130706433,"exchange_id":"OpenX"} 83 | {"ip":3232235777,"exchange_id":"Adx"} 84 | {"ip":3221291266,"exchange_id":"Adx"} 85 | {"ip":3221291266,"exchange_id":"Adx"} 86 | {"ip":2130706433,"exchange_id":"Rubycon"} 87 | {"ip":2130706433,"exchange_id":"Appnexus"} 88 | {"ip":3221291265,"exchange_id":"OpenX"} 89 | {"ip":3221291265,"exchange_id":"Adx"} 90 | {"ip":2130706433,"exchange_id":"Appnexus"} 91 | {"ip":3232235777,"exchange_id":"Rubycon"} 92 | {"ip":3232235777,"exchange_id":"Rubycon"} 93 | {"ip":3221291266,"exchange_id":"Appnexus"} 94 | {"ip":3232235777,"exchange_id":"Rubycon"} 95 | {"ip":3221291265,"exchange_id":"Adx"} 96 | {"ip":2130706433,"exchange_id":"OpenX"} 97 | {"ip":3221291266,"exchange_id":"Adx"} 98 | {"ip":3221291266,"exchange_id":"Adx"} 99 | {"ip":3221291266,"exchange_id":"Adx"} 100 | {"ip":3232235777,"exchange_id":"Adx"} 101 | -------------------------------------------------------------------------------- /integration_test/setup.py: -------------------------------------------------------------------------------- 1 | from hustle import Table, insert 2 | from hustle.core.settings import Settings, overrides 3 | import ujson 4 | 5 | 6 | IMPS = '__test_imps' 7 | PIXELS = '__test_pixels' 8 | PIXELS_HLL = '__test_pixels_hll' 9 | IPS = '__test_ips' 10 | 11 | 12 | def imp_process(data): 13 | from disco.util import urlsplit 14 | 15 | _, (host, _), _ = urlsplit(data['url']) 16 | if host.startswith('www.'): 17 | host = host[4:] 18 | data['site_id'] = host 19 | 20 | 21 | def insert_hll(table, file=None, streams=None, preprocess=None, 22 | maxsize=100 * 1024 * 1024, tmpdir='/tmp', decoder=ujson.decode, 23 | lru_size=10000, hll_field=None, **kwargs): 24 | from cardunion import Cardunion 25 | import os 26 | 27 | settings = Settings(**kwargs) 28 | ddfs = settings['ddfs'] 29 | 30 | def part_tag(name, partition=None): 31 | rval = "hustle:" + name 32 | if partition: 33 | rval += ':' + str(partition) 34 | return rval 35 | 36 | def hll_iter(strms): 37 | buf = {} 38 | fields = table._field_names 39 | fields.remove('hll') 40 | # fields.remove('maxhash') 41 | 42 | for stream in strms: 43 | for line in stream: 44 | try: 45 | data = decoder(line) 46 | except Exception as e: 47 | print "Exception decoding record (skipping): %s %s" % (e, line) 48 | else: 49 | if preprocess: 50 | if not preprocess(data): 51 | continue 52 | key = ujson.dumps([data[f] for f in fields]) 53 | if key not in buf: 54 | hll = Cardunion(12) 55 | buf[key] = hll 56 | else: 57 | hll = buf[key] 58 | 59 | hll.add(data[hll_field]) 60 | 61 | for key, hll in buf.iteritems(): 62 | data = dict(zip(fields, ujson.loads(key))) 63 | data['hll'] = hll.dumps() 64 | yield data 65 | 66 | if file: 67 | streams = [open(file)] 68 | lines, partition_files = table._insert([hll_iter(streams)], 69 | maxsize=maxsize, tmpdir=tmpdir, 70 | decoder=lambda x: x, lru_size=lru_size) 71 | if partition_files is not None: 72 | for part, pfile in partition_files.iteritems(): 73 | tag = part_tag(table._name, part) 74 | ddfs.push(tag, [pfile]) 75 | print 'pushed %s, %s' % (part, tag) 76 | os.unlink(pfile) 77 | return table._name, lines 78 | 79 | 80 | def 
ensure_tables(): 81 | overrides['server'] = 'disco://localhost' 82 | overrides['dump'] = False 83 | overrides['nest'] = False 84 | settings = Settings() 85 | ddfs = settings['ddfs'] 86 | 87 | imps = Table.create(IMPS, 88 | columns=['wide index string token', 'trie url', 'index trie site_id', 'uint cpm_millis', 89 | 'index int ad_id', 'index string date', 'index uint time', 'bit click', 90 | 'index bit impression', 'bit conversion'], 91 | partition='date', 92 | force=True) 93 | pixels = Table.create(PIXELS, 94 | columns=['wide index string token', 'index bit isActive', 'index trie site_id', 95 | 'uint amount', 'index int account_id', 'index trie city', 'index trie16 state', 96 | 'index int16 metro', 'string ip', 'lz4 keyword', 'index string date'], 97 | partition='date', 98 | force=True) 99 | pixel_hlls = Table.create(PIXELS_HLL, 100 | columns=['index bit isActive', 'index trie site_id', 'index int account_id', 101 | 'index trie city', 'index trie16 state', 'index string date', 102 | 'binary hll'], 103 | partition='date', 104 | force=True) 105 | ips = Table.create(IPS, 106 | columns=['index trie16 exchange_id', 'index uint32 ip'], 107 | force=True) 108 | 109 | tags = ddfs.list("hustle:%s:" % IMPS) 110 | if len(tags) == 0: 111 | # insert the files 112 | insert(imps, File='fixtures/imps.json', preprocess=imp_process) 113 | 114 | tags = ddfs.list("hustle:%s:" % PIXELS) 115 | if len(tags) == 0: 116 | # insert the files 117 | insert(pixels, File='fixtures/pixel.json') 118 | 119 | tags = ddfs.list("hustle:%s:" % IPS) 120 | if len(tags) == 0: 121 | # insert the files 122 | insert(ips, File='fixtures/ip.json') 123 | 124 | tags = ddfs.list("hustle:%s:" % PIXELS_HLL) 125 | if len(tags) == 0: 126 | # insert the files 127 | insert_hll(pixel_hlls, file='./fixtures/pixel.json', hll_field='token') 128 | 129 | 130 | if __name__ == '__main__': 131 | ensure_tables() 132 | -------------------------------------------------------------------------------- /integration_test/test_aggregation.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import select, Table, h_sum, h_count, star 3 | from setup import IMPS 4 | from hustle.core.settings import Settings, overrides 5 | 6 | 7 | class TestAggregation(unittest.TestCase): 8 | def setUp(self): 9 | overrides['server'] = 'disco://localhost' 10 | overrides['dump'] = False 11 | overrides['nest'] = False 12 | self.settings = Settings() 13 | 14 | def tearDown(self): 15 | pass 16 | 17 | def test_count(self): 18 | imps = Table.from_tag(IMPS) 19 | res = select(h_count(), where=imps) 20 | count = list(res)[0][0] 21 | res.purge() 22 | self.assertEqual(count, 200) 23 | 24 | def test_simple_aggregation(self): 25 | imps = Table.from_tag(IMPS) 26 | results = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27') 27 | 28 | sum_millis = {} 29 | for ad_id, millis in results: 30 | if ad_id not in sum_millis: 31 | sum_millis[ad_id] = [0, 0] 32 | sum_millis[ad_id][0] += millis 33 | sum_millis[ad_id][1] += 1 34 | results.purge() 35 | 36 | results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27') 37 | self.assertGreater(len(list(results)), 0) 38 | for ad_id, millis, count in results: 39 | ad_tup = sum_millis[ad_id] 40 | self.assertEqual(millis, ad_tup[0]) 41 | self.assertEqual(count, ad_tup[1]) 42 | results.purge() 43 | 44 | def test_ordered_aggregation(self): 45 | imps = Table.from_tag(IMPS) 46 | resx = select(imps.ad_id, imps.cpm_millis, where=imps.date == 
'2014-01-27') 47 | 48 | sum_millis = {} 49 | for ad_id, millis in resx: 50 | if ad_id not in sum_millis: 51 | sum_millis[ad_id] = [0, 0] 52 | sum_millis[ad_id][0] += millis 53 | sum_millis[ad_id][1] += 1 54 | 55 | results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), 56 | where=imps.date == '2014-01-27', 57 | order_by=2, 58 | limit=3, 59 | nest=True) 60 | self.assertGreater(len(list(results)), 0) 61 | lowest = 0 62 | for ad_id, millis, count in results: 63 | self.assertLessEqual(lowest, count) 64 | lowest = count 65 | ad_tup = sum_millis[ad_id] 66 | self.assertEqual(millis, ad_tup[0]) 67 | self.assertEqual(count, ad_tup[1]) 68 | self.assertEqual(len(list(results)), min(len(sum_millis), 3)) 69 | 70 | resx.purge() 71 | 72 | def test_multiple_group_bys(self): 73 | imps = Table.from_tag(IMPS) 74 | results = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22') 75 | 76 | sum_millis = {} 77 | for ad_id, dt, millis in results: 78 | key = str(ad_id) + dt 79 | if key not in sum_millis: 80 | sum_millis[key] = [0, 0] 81 | sum_millis[key][0] += millis 82 | sum_millis[key][1] += 1 83 | results.purge() 84 | 85 | results = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(), where=imps.date > '2014-01-22') 86 | self.assertGreater(len(list(results)), 0) 87 | for ad_id, dt, millis, count in results: 88 | ad_tup = sum_millis[str(ad_id) + dt] 89 | self.assertEqual(millis, ad_tup[0]) 90 | self.assertEqual(count, ad_tup[1]) 91 | results.purge() 92 | 93 | def test_nested_agg(self): 94 | imps = Table.from_tag(IMPS) 95 | results = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22') 96 | 97 | sum_millis = {} 98 | for ad_id, dt, millis in results: 99 | key = str(ad_id) + dt 100 | if key not in sum_millis: 101 | sum_millis[key] = [0, 0] 102 | sum_millis[key][0] += millis 103 | sum_millis[key][1] += 1 104 | results.purge() 105 | 106 | newtab = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(), 107 | where=imps.date > '2014-01-22', 108 | nest=True) 109 | results = select(*star(newtab), where=newtab) 110 | self.assertGreater(len(list(results)), 0) 111 | for ad_id, dt, millis, count in results: 112 | ad_tup = sum_millis[str(ad_id) + dt] 113 | self.assertEqual(millis, ad_tup[0]) 114 | self.assertEqual(count, ad_tup[1]) 115 | results.purge() 116 | 117 | def test_overflow(self): 118 | from itertools import izip 119 | 120 | imps = Table.from_tag(IMPS) 121 | fly_results = select(imps.date, h_sum(imps.impression), where=imps, order_by=imps.date) 122 | 123 | nest_tab = select(imps.date, h_sum(imps.impression), where=imps, nest=True) 124 | nest_results = select(*star(nest_tab), where=nest_tab, order_by=0) 125 | 126 | for ((fdate, fimps), (ndate, nimps)) in izip(fly_results, nest_results): 127 | self.assertEqual(fdate, ndate) 128 | self.assertEqual(fimps, nimps) 129 | nest_results.purge() 130 | 131 | -------------------------------------------------------------------------------- /integration_test/test_bool.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import select, Table, h_sum, h_count 3 | from setup import IMPS, PIXELS 4 | from hustle.core.settings import Settings, overrides 5 | 6 | 7 | class TestBool(unittest.TestCase): 8 | def setUp(self): 9 | overrides['server'] = 'disco://localhost' 10 | overrides['dump'] = False 11 | overrides['nest'] = False 12 | self.settings = Settings() 13 | 14 | def tearDown(self): 15 | pass 16 | 17 | def test_project(self): 18 | imps = 
Table.from_tag(IMPS) 19 | res = select(imps.click, imps.conversion, imps.impression, where=imps) 20 | clicks = conversions = impressions = 0 21 | for (click, conv, imp) in res: 22 | clicks += click 23 | conversions += conv 24 | impressions += imp 25 | 26 | self.assertEqual(clicks, 21) 27 | self.assertEqual(conversions, 5) 28 | self.assertEqual(impressions, 174) 29 | res.purge() 30 | 31 | def test_aggregate(self): 32 | imps = Table.from_tag(IMPS) 33 | res = select(h_sum(imps.click), h_sum(imps.conversion), h_sum(imps.impression), where=imps) 34 | 35 | (clicks, conversions, impressions) = list(res)[0] 36 | 37 | self.assertEqual(clicks, 21) 38 | self.assertEqual(conversions, 5) 39 | self.assertEqual(impressions, 174) 40 | res.purge() 41 | 42 | def test_bool_values(self): 43 | pix = Table.from_tag(PIXELS) 44 | res = select(pix.isActive, where=pix.isActive == True) 45 | actives = 0 46 | for (act, ) in res: 47 | actives += act 48 | 49 | self.assertEqual(actives, 234) 50 | res.purge() 51 | 52 | res = select(pix.isActive, where=pix.isActive == 0) 53 | actives = 0 54 | for (act, ) in res: 55 | actives += 1 56 | 57 | self.assertEqual(actives, 266) 58 | res.purge() 59 | 60 | def test_bit_values(self): 61 | pix = Table.from_tag(PIXELS) 62 | res = select(pix.isActive, where=pix.isActive == 1) 63 | actives = 0 64 | for (act, ) in res: 65 | actives += act 66 | 67 | self.assertEqual(actives, 234) 68 | res.purge() 69 | -------------------------------------------------------------------------------- /integration_test/test_cardinality.py: -------------------------------------------------------------------------------- 1 | from hustle import select, Table 2 | from setup import PIXELS_HLL 3 | from hustle.core.settings import Settings, overrides 4 | from hustle.cardinality import h_cardinality as h_hll 5 | 6 | from collections import defaultdict 7 | from operator import itemgetter 8 | 9 | import unittest 10 | import ujson 11 | 12 | 13 | HLL_ESTIMATE_ERROR = .04 14 | 15 | 16 | class TestCardinalityQuery(unittest.TestCase): 17 | def setUp(self): 18 | overrides['server'] = 'disco://localhost' 19 | overrides['dump'] = False 20 | overrides['nest'] = False 21 | self.settings = Settings() 22 | 23 | def tearDown(self): 24 | pass 25 | 26 | def checkEstimate(self, estimate, expect): 27 | self.assertAlmostEqual(estimate, expect, 28 | delta=int(HLL_ESTIMATE_ERROR * expect)) 29 | 30 | def test_cardinality_all(self): 31 | hll = Table.from_tag(PIXELS_HLL) 32 | res = select(h_hll(hll.hll), where=hll) 33 | estimate = next(iter(res))[0] 34 | tokens = set([]) 35 | with open("./fixtures/pixel.json") as f: 36 | for line in f: 37 | record = ujson.loads(line) 38 | tokens.add(record["token"]) 39 | self.checkEstimate(estimate, len(tokens)) 40 | res.purge() 41 | 42 | def test_cardinality_on_condition(self): 43 | hll = Table.from_tag(PIXELS_HLL) 44 | active_tokens = set([]) 45 | inactive_tokens = set([]) 46 | with open("./fixtures/pixel.json") as f: 47 | for line in f: 48 | record = ujson.loads(line) 49 | if record["isActive"]: 50 | active_tokens.add(record["token"]) 51 | else: 52 | inactive_tokens.add(record["token"]) 53 | res = select(h_hll(hll.hll), where=(hll.isActive == 1)) 54 | estimate = next(iter(res))[0] 55 | self.checkEstimate(estimate, len(active_tokens)) 56 | res.purge() 57 | 58 | res = select(h_hll(hll.hll), where=(hll.isActive == 0)) 59 | estimate = next(iter(res))[0] 60 | self.checkEstimate(estimate, len(inactive_tokens)) 61 | res.purge() 62 | 63 | def test_cardinality_with_order_by(self): 64 | hll = 
Table.from_tag(PIXELS_HLL) 65 | tokens_by_date = defaultdict(set) 66 | with open("./fixtures/pixel.json") as f: 67 | for line in f: 68 | record = ujson.loads(line) 69 | tokens_by_date[record["date"]].add(record["token"]) 70 | result = [(date, len(tokens)) for date, tokens in tokens_by_date.items()] 71 | 72 | # Test order by date 73 | expects = sorted(result, key=itemgetter(0), reverse=True) 74 | res = select(hll.date, h_hll(hll.hll), where=hll, order_by=0, desc=True) 75 | estimates = list(res) 76 | for i, (date, expected_cardinality) in enumerate(expects): 77 | self.assertEqual(estimates[i][0], date) 78 | self.checkEstimate(estimates[i][1], expected_cardinality) 79 | res.purge() 80 | 81 | # Test order by hll 82 | res = select(hll.date, h_hll(hll.hll), where=hll, order_by=1, desc=True) 83 | l = list(res) 84 | for i in range(len(l) - 1): 85 | self.assertTrue(l[i][1] >= l[i + 1][1]) 86 | res.purge() 87 | -------------------------------------------------------------------------------- /integration_test/test_column_fn.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import select, Table, h_max, h_min 3 | from hustle.core.column_fn import ip_ntoa 4 | from setup import IPS 5 | from hustle.core.settings import Settings, overrides 6 | 7 | 8 | class TestSimpleQuery(unittest.TestCase): 9 | def setUp(self): 10 | overrides['server'] = 'disco://localhost' 11 | overrides['dump'] = False 12 | overrides['nest'] = False 13 | self.settings = Settings() 14 | 15 | def tearDown(self): 16 | pass 17 | 18 | def test_column_fn(self): 19 | ips = Table.from_tag(IPS) 20 | res = select(ips.exchange_id, ip_ntoa(ips.ip), 21 | where=ips.exchange_id == "Adx") 22 | results = list(res) 23 | self.assertEqual(len(results), 29) 24 | res.purge() 25 | 26 | def test_column_fn_with_agg(self): 27 | ips = Table.from_tag(IPS) 28 | res = select(ips.exchange_id, h_max(ip_ntoa(ips.ip)), 29 | where=ips, order_by=(ips.exchange_id,)) 30 | results = list(res) 31 | res.purge() 32 | exchanges = [ex for ex, _ in results] 33 | ipss = [ip for _, ip in results] 34 | self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'], exchanges) 35 | self.assertListEqual(['192.168.1.1'] * 4, ipss) 36 | 37 | res = select(ips.exchange_id, h_min(ip_ntoa(ips.ip)), 38 | where=ips, order_by=(ips.exchange_id,)) 39 | results = list(res) 40 | res.purge() 41 | exchanges = [ex for ex, _ in results] 42 | ipss = [ip for _, ip in results] 43 | self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'], exchanges) 44 | self.assertListEqual(['127.0.0.1'] * 4, ipss) 45 | 46 | def test_column_fn_with_distinct(self): 47 | ips = Table.from_tag(IPS) 48 | res = select(ip_ntoa(ips.ip), 49 | where=ips.exchange_id == "Adx", order_by=(ip_ntoa(ips.ip),), 50 | distinct=True) 51 | results = list(res) 52 | res.purge() 53 | ipss = [ip[0] for ip in results] 54 | self.assertListEqual(['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'], 55 | ipss) 56 | 57 | def test_column_fn_with_nest(self): 58 | ips = Table.from_tag(IPS) 59 | res = select(ip_ntoa(ips.ip), 60 | where=ips.exchange_id == "Adx", order_by=(ip_ntoa(ips.ip),), 61 | distinct=True, nest=True) 62 | ret = select(res.ip, where=res, order_by=(res.ip,)) 63 | results = list(ret) 64 | ret.purge() 65 | ipss = [ip[0] for ip in results] 66 | self.assertListEqual(['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'], 67 | ipss) 68 | -------------------------------------------------------------------------------- /integration_test/test_drop.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import Table, insert, drop, delete, get_partitions 3 | from hustle.core.settings import Settings, overrides 4 | 5 | IMPS = '__test_drop_imps' 6 | 7 | 8 | def imp_process(data): 9 | from disco.util import urlsplit 10 | 11 | _, (host, _), _ = urlsplit(data['url']) 12 | if host.startswith('www.'): 13 | host = host[4:] 14 | data['site_id'] = host 15 | 16 | 17 | def ensure_tables(): 18 | overrides['server'] = 'disco://localhost' 19 | overrides['dump'] = False 20 | overrides['nest'] = False 21 | settings = Settings() 22 | ddfs = settings['ddfs'] 23 | 24 | imps = Table.create(IMPS, 25 | fields=['=$token', '%url', '+%site_id', '@cpm_millis', '+#ad_id', '+$date', '+@time'], 26 | partition='date', 27 | force=True) 28 | 29 | tags = ddfs.list("hustle:%s:" % IMPS) 30 | if len(tags) == 0: 31 | # insert the files 32 | insert(imps, File='fixtures/imps.json', preprocess=imp_process) 33 | return imps 34 | 35 | 36 | class TestDropTable(unittest.TestCase): 37 | def setUp(self): 38 | overrides['server'] = 'disco://localhost' 39 | overrides['dump'] = False 40 | overrides['nest'] = False 41 | self.settings = Settings() 42 | self.ddfs = self.settings['ddfs'] 43 | self.table = ensure_tables() 44 | 45 | def test_delete_all(self): 46 | delete(self.table) 47 | self.assertEqual([], get_partitions(self.table)) 48 | tags = self.ddfs.list(Table.base_tag(self.table._name)) 49 | self.assertEqual(len(tags), 1) 50 | self.assertEqual(tags[0], "hustle:__test_drop_imps") 51 | 52 | def test_delete_partial(self): 53 | delete(self.table.date >= '2014-01-13') 54 | self.assertEqual(['hustle:__test_drop_imps:2014-01-10', 55 | 'hustle:__test_drop_imps:2014-01-11', 56 | 'hustle:__test_drop_imps:2014-01-12'], 57 | get_partitions(self.table)) 58 | tags = self.ddfs.list(Table.base_tag(self.table._name)) 59 | self.assertEqual(len(tags), 4) 60 | self.assertIn("hustle:__test_drop_imps", tags) 61 | drop(self.table) 62 | with self.assertRaises(ValueError): 63 | delete(self.table.site_id == 'foobar') 64 | delete(self.table.url) 65 | 66 | def test_drop(self): 67 | drop(self.table) 68 | self.assertEqual([], get_partitions(self.table)) 69 | tags = self.ddfs.list(Table.base_tag(self.table._name)) 70 | self.assertEqual(len(tags), 0) 71 | -------------------------------------------------------------------------------- /integration_test/test_project_order.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import select, Table 3 | from setup import IMPS 4 | from hustle.core.settings import Settings, overrides 5 | 6 | 7 | class TestProjectOrder(unittest.TestCase): 8 | def setUp(self): 9 | overrides['server'] = 'disco://localhost' 10 | overrides['dump'] = False 11 | overrides['nest'] = False 12 | self.settings = Settings() 13 | 14 | def tearDown(self): 15 | pass 16 | 17 | def test_single_int_order(self): 18 | imps = Table.from_tag(IMPS) 19 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', order_by=imps.cpm_millis) 20 | lowest = 0 21 | for a, d, c in res: 22 | self.assertLessEqual(lowest, c) 23 | lowest = c 24 | res.purge() 25 | 26 | def test_combo_order(self): 27 | imps = Table.from_tag(IMPS) 28 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 29 | where=imps.date > '2014-01-21', 30 | order_by=(imps.date, imps.cpm_millis)) 31 | lowest_cpm = 0 32 | lowest_date = '2000-01-01' 33 | for a, d, c in res: 34 | if lowest_date == d: 35 | 
self.assertLessEqual(lowest_cpm, c) 36 | lowest_cpm = c 37 | else: 38 | self.assertLessEqual(lowest_date, d) 39 | lowest_date = d 40 | lowest_cpm = c 41 | res.purge() 42 | 43 | def test_combo_descending(self): 44 | imps = Table.from_tag(IMPS) 45 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 46 | where=imps.date > '2014-01-21', 47 | order_by=(imps.date, imps.cpm_millis), 48 | desc=True) 49 | highest_cpm = 1000000000 50 | highest_date = '2222-01-01' 51 | for a, d, c in res: 52 | if highest_date == d: 53 | self.assertGreaterEqual(highest_cpm, c) 54 | highest_cpm = c 55 | else: 56 | self.assertGreaterEqual(highest_date, d) 57 | highest_date = d 58 | highest_cpm = c 59 | res.purge() 60 | 61 | def test_high_limit(self): 62 | imps = Table.from_tag(IMPS) 63 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', limit=100) 64 | results = list(res) 65 | self.assertEqual(len(results), 10) 66 | res.purge() 67 | 68 | def test_low_limit(self): 69 | imps = Table.from_tag(IMPS) 70 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', limit=4) 71 | results = list(res) 72 | self.assertEqual(len(results), 4) 73 | res.purge() 74 | 75 | def test_distinct(self): 76 | imps = Table.from_tag(IMPS) 77 | res = select(imps.ad_id, imps.date, where=imps.date == '2014-01-27', distinct=True) 78 | results = list(res) 79 | self.assertEqual(len(results), 8) 80 | res.purge() 81 | 82 | def test_overall(self): 83 | imps = Table.from_tag(IMPS) 84 | res = select(imps.ad_id, imps.date, where=imps.date == '2014-01-27', distinct=True, limit=4, 85 | order_by='ad_id', desc=True) 86 | results = [a for a, d in res] 87 | self.assertEqual(len(results), 4) 88 | self.assertListEqual(results, [30019, 30018, 30017, 30015]) 89 | res.purge() 90 | -------------------------------------------------------------------------------- /integration_test/test_simple_query.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import select, Table 3 | from setup import IMPS 4 | from hustle.core.settings import Settings, overrides 5 | 6 | 7 | class TestSimpleQuery(unittest.TestCase): 8 | def setUp(self): 9 | overrides['server'] = 'disco://localhost' 10 | overrides['dump'] = False 11 | overrides['nest'] = False 12 | self.settings = Settings() 13 | 14 | def tearDown(self): 15 | pass 16 | 17 | def test_equality_on_partition(self): 18 | imps = Table.from_tag(IMPS) 19 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27') 20 | results = list(res) 21 | self.assertEqual(len(results), 10) 22 | found = next((a, d, c) for a, d, c in results if a == 30018 and d == '2014-01-27' and c == 4506) 23 | self.assertIsNotNone(found) 24 | self.assertTrue(all(d == '2014-01-27' for _, d, _ in results)) 25 | res.purge() 26 | 27 | def test_range_on_partition(self): 28 | imps = Table.from_tag(IMPS) 29 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-27') 30 | results = list(res) 31 | self.assertEqual(len(results), 20) 32 | self.assertTrue(all(d in ('2014-01-28', '2014-01-29') for _, d, _ in results)) 33 | res.purge() 34 | 35 | def test_combo_where_on_partition(self): 36 | imps = Table.from_tag(IMPS) 37 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 38 | where=((imps.date >= '2014-01-20') & (imps.ad_id == 30010))) 39 | results = list(res) 40 | self.assertEqual(len(results), 6) 41 | self.assertTrue(all(d >= '2014-01-20' and a == 30010 for a, d, _ in results)) 42 | 
res.purge() 43 | 44 | def test_combo_where_on_or_partition(self): 45 | imps = Table.from_tag(IMPS) 46 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 47 | where=((imps.date == '2014-01-21') | (imps.date == '2014-01-25') | (imps.ad_id == 30010))) 48 | results = list(res) 49 | self.assertEqual(len(results), 27) 50 | self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results)) 51 | res.purge() 52 | 53 | def test_combo_where_on_or_partition_ex(self): 54 | imps = Table.from_tag(IMPS) 55 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 56 | where=((imps.date << ['2014-01-21', '2014-01-25']) | (imps.ad_id == 30010))) 57 | results = list(res) 58 | self.assertEqual(len(results), 27) 59 | self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results)) 60 | res.purge() 61 | 62 | def test_combo_where_on_or_partition_ex1(self): 63 | imps = Table.from_tag(IMPS) 64 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 65 | where=((imps.date << ['2014-01-21', '2014-01-25']) | (imps.ad_id << [30003, 30010]))) 66 | results = list(res) 67 | self.assertEqual(len(results), 40) 68 | self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 or a == 30003 for a, d, _ in results)) 69 | res.purge() 70 | 71 | def test_combo_where_on_or_partition_ex2(self): 72 | imps = Table.from_tag(IMPS) 73 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 74 | where=((imps.date << ['2014-01-21', '2014-01-25']) & (imps.ad_id << [30003, 30010]))) 75 | results = list(res) 76 | self.assertEqual(len(results), 1) 77 | self.assertTrue(all(d == '2014-01-21' and a == 30010 for a, d, _ in results)) 78 | res.purge() 79 | 80 | def test_combo_where_on_and_partition(self): 81 | imps = Table.from_tag(IMPS) 82 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 83 | where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.ad_id == 30010))) 84 | results = list(res) 85 | self.assertEqual(len(results), 2) 86 | self.assertTrue(all(d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30010 for a, d, _ in results)) 87 | res.purge() 88 | 89 | def test_combo_where_no_partition(self): 90 | imps = Table.from_tag(IMPS) 91 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=(imps.time >= 180000)) 92 | results = list(res) 93 | print results 94 | self.assertEqual(len(results), 5) 95 | res.purge() 96 | 97 | def test_combo_where_on_mixed_partition(self): 98 | imps = Table.from_tag(IMPS) 99 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 100 | where=(((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.time > 170000)))) 101 | results = list(res) 102 | self.assertEqual(len(results), 2) 103 | self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30003) for a, d, c in results)) 104 | res.purge() 105 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | nose==1.3.7 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ujson==1.35 2 | PyYAML==3.12 3 | Cython==0.27.3 4 | -------------------------------------------------------------------------------- /settings.yaml: -------------------------------------------------------------------------------- 1 | # disco server. 
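# Address of the Disco master that Hustle submits its jobs to.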
2 | server: disco://localhost 3 | 4 | # Default Hustle worker. There is usually no need to change it. 5 | worker_class: hustle.core.pipeworker.Worker 6 | 7 | # Print query results to the terminal. If this is disabled, you need to call the 8 | # cat() function yourself to get the results. 9 | dump: False 10 | 11 | # Save query results to a Hustle table. 12 | nest: False 13 | 14 | # Number of partitions for the Hustle pipeworker; defaults to 16 if unspecified. 15 | # This number is used when assigning labels to the output of each 16 | # stage. 17 | partition: 16 18 | 19 | # Command-history size for the Hustle shell. 20 | history_size: 1000 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from setuptools import setup, find_packages 3 | 4 | 5 | if sys.version_info[:2] < (2, 6): 6 | raise RuntimeError('Requires Python 2.6 or later') 7 | 8 | VERSION = '0.2.7' 9 | 10 | setup( 11 | name='hustle', 12 | version=VERSION, 13 | description='Hustle: a data warehouse system.', 14 | keywords='hustle', 15 | author='Chango Inc.', 16 | author_email='dev@chango.com', 17 | url='http://chango.com', 18 | license='MIT License', 19 | packages=find_packages(exclude=['test', 'deps', 'examples', 'inferno']), 20 | include_package_data=True, 21 | zip_safe=False, 22 | test_suite='nose.collector', 23 | requires=['disco', 'mdb']) 24 | -------------------------------------------------------------------------------- /test/fixtures/keys: -------------------------------------------------------------------------------- 1 | _eb_cat_115280 2 | 0cd10308162c5216f2476f462ba6d0c3 3 | hobbit house being demolished due to lac 4 | money in the bank streams 5 | iphone contacts deleted how to restore 6 | montreal database administrator oracle qc 7 | 878ed9857ee90262fea180c83d3e96c5 8 | ton fatberg removed from london sewer 9 | 463d67a89afdcbc4960b4e2d45ae8c24 10 | flip flop luggage tag set of 11 | iab_sports 12 | crassula vendre montreal 13 | _eb_cat_9394 14 | tvs electronics phones cordless phones 15 | _av_2162 16 | make it diy floor pillow made from thrifted blanket 17 | 879319cd0476782a235d42e4fcb80edf 18 | iphone rumor 19 | new homes condos 20 | __cat_adx_532 21 | _av_2334 22 | _av_2334 23 | heart of the swarm patch makes the game more spectacular 24 | college football predictions the hardest places to play in 25 | -------------------------------------------------------------------------------- /test/test_lru_dict.py: -------------------------------------------------------------------------------- 1 | import mdb 2 | import os 3 | import unittest 4 | from functools import partial 5 | from hustle.core.marble import mdb_evict, mdb_fetch 6 | from pylru import LRUDict 7 | from pylru import CharLRUDict, IntLRUDict 8 | from pyebset import BitSet 9 | 10 | class TestLRUDict(unittest.TestCase): 11 | def setUp(self): 12 | pass 13 | 14 | def tearDown(self): 15 | try: 16 | os.unlink('/tmp/lru_test') 17 | os.unlink('/tmp/lru_test-lock') 18 | except OSError: 19 | pass 20 | 21 | def test_lru(self): 22 | def get(db, txn, key): 23 | try: 24 | return db.get(txn, key) 25 | except Exception: 26 | return None 27 | 28 | env = mdb.Env('/tmp/lru_test', flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR) 29 | txn = env.begin_txn() 30 | ixdb = env.open_db(txn, 'ix', flags=mdb.MDB_CREATE) 31 | 32 | lru = LRUDict.getDict(5, 33 | partial(mdb_fetch, txn=txn, ixdb=ixdb), 34 | partial(mdb_evict, txn=txn, ixdb=ixdb)) 35 |
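# capacity is 5: a sixth distinct key evicts the least-recently-used entry through mdb_evict, and misses are pulled back in through mdb_fetch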
lru.set('hello', BitSet()) 37 | lru.set('goodbye', BitSet()) 38 | lru.set('amine', BitSet()) 39 | lru.set('solution', BitSet()) 40 | lru.set('lsd', BitSet()) 41 | self.assertEqual(len(lru._getContents()), 5) 42 | 43 | lru.set('proxy', BitSet()) 44 | store = lru._getContents() 45 | self.assertNotIn('hello', store) 46 | self.assertIsNotNone(get(ixdb, txn, 'hello')) 47 | self.assertEqual(len(store), 5) 48 | 49 | bitmap = lru['hello'] 50 | store = lru._getContents() 51 | self.assertIn('hello', store) 52 | self.assertEqual(len(store), 5) 53 | self.assertIsInstance(bitmap, BitSet) 54 | self.assertIsNone(lru.get('skibiddles')) 55 | 56 | # test eviction order 57 | self.assertIsNotNone(lru.get('goodbye')) # this should 'reset' goodbye so that it won't be evicted 58 | lru.set('whammy bar', BitSet()) # amine should be evicted 59 | store = lru._getContents() 60 | self.assertNotIn('amine', store) 61 | self.assertIn('goodbye', store) 62 | 63 | txn.commit() 64 | env.close() 65 | 66 | class LruTest(unittest.TestCase): 67 | def test_basic_char(self): 68 | mdict = {} 69 | 70 | def fetch(key): 71 | try: 72 | res = mdict[key] 73 | except KeyError: 74 | return None 75 | return res 76 | 77 | def evict(key, value): 78 | mdict[key] = value 79 | 80 | l = CharLRUDict(10, fetch, evict) 81 | 82 | a = 100000 83 | b = 200000 84 | 85 | for i in range(a, b): 86 | l.set(str(i * i), i * i) 87 | 88 | for i in range(b - 1, a, -1): 89 | v = l.get(str(i * i)) 90 | self.assertEqual(i * i, v) 91 | 92 | def test_basic_int(self): 93 | mdict = {} 94 | 95 | def fetch(key): 96 | try: 97 | res = mdict[key] 98 | except KeyError: 99 | return None 100 | return res 101 | 102 | def evict(key, value): 103 | mdict[key] = value 104 | 105 | l = IntLRUDict(10, fetch, evict) 106 | 107 | a = 100000 108 | b = 200000 109 | 110 | for i in range(a, b): 111 | l.set(i * i, i * i) 112 | 113 | for i in range(b - 1, a, -1): 114 | v = l.get(i * i) 115 | self.assertEqual(i * i, v) 116 | 117 | def test_no_eviction(self): 118 | def fetch(key): 119 | return None 120 | 121 | def evict(key, value): 122 | self.fail("Nothing should be evicted: " + str(key) + " " + str(value)) 123 | 124 | l = CharLRUDict(1, fetch, evict, list) 125 | s = l["10"] 126 | self.assertListEqual(s, []) 127 | -------------------------------------------------------------------------------- /test/test_merge_wrapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle.core.pipeworker import merge_wrapper 3 | 4 | 5 | class TestMergeWrapper(unittest.TestCase): 6 | def test_up(self): 7 | a = [(('keya', 5), 22), (('lima', 9), 23), (('oebra', 21), 24), (('qeya', 5), 22), (('tima', 9), 23), (('zebra', 21), 24)] 8 | b = [(('aeya', 5), 22), (('fima', 12), 23), (('hebra', 8), 24), (('xya', 5), 22), (('yima', 12), 23), (('zzebra', 8), 24)] 9 | c = [(('beya', 5), 22), (('fliea', 9), 23), (('gray', 21), 24), (('morie', 5), 22), (('steel', 9), 23), (('yale', 21), 24)] 10 | d = [(('vera', 5), 22), (('wera', 12), 23), (('xera', 8), 24), (('yolanda', 5), 22), (('yolo', 12), 23), (('zanadu', 8), 24)] 11 | from heapq import merge 12 | res = merge(merge_wrapper(a), merge_wrapper(b), merge_wrapper(c), merge_wrapper(d)) 13 | lowest = 'aaaaaa' 14 | for k, v in res: 15 | self.assertTrue(lowest < k[0]) 16 | lowest = k[0] 17 | 18 | def test_down(self): 19 | a = [(('zebra', 21), 24), (('lima', 9), 23), (('keya', 5), 22)] 20 | b = [(('zzebra', 8), 24), (('sima', 12), 23), (('aeya', 5), 22)] 21 | from heapq import merge 22 | res = merge(merge_wrapper(a, desc=True),
merge_wrapper(b, desc=True)) 23 | highest = 'zzzzzzzzz' 24 | for k, v in res: 25 | self.assertTrue(highest > k[0]) 26 | highest = k[0] 27 | 28 | def test_nulls(self): 29 | a = [(('zebra', 21), 24), (('keya', 5), 22), ((None, 9), 23)] 30 | b = [(('zzebra', 8), 24), (('sima', 12), 23), (('aeya', 5), 22), ((None, 12), 18)] 31 | from heapq import merge 32 | res = merge(merge_wrapper(a, desc=True), merge_wrapper(b, desc=True)) 33 | highest = 'zzzzzzzzz' 34 | for k, v in res: 35 | print k, v 36 | self.assertTrue(highest >= k[0]) 37 | highest = k[0] 38 | 39 | def test_multi(self): 40 | a = [(('zebra', 12), 24), (('webra', 12), 24), (('bebra', 12), 24), (('aebra', 12), 24), (('zebra', 11), 24), (('keya', 5), 22), (('aeya', 5), 22), ] 41 | b = [(('sima', 12), 23), (('zzebra', 8), 28), (('yzebra', 8), 28), (('azebra', 8), 28), (('aeya', 5), 22)] 42 | from heapq import merge 43 | res = merge(merge_wrapper(a, sort_range=(1, 0), desc=True), merge_wrapper(b, sort_range=(1, 0), desc=True)) 44 | highest = 999999999 45 | highest_2nd = 'zzzzzzzz' 46 | same_count = 0 47 | for k, v in res: 48 | print "kev", k, v 49 | if highest == k[1]: 50 | self.assertTrue(highest_2nd >= k[0]) 51 | same_count += 1 52 | self.assertGreaterEqual(highest, k[1]) 53 | highest = k[1] 54 | highest_2nd = k[0] 55 | self.assertEqual(same_count, 8) 56 | 57 | def test_lopsided(self): 58 | a = [(('zebra', 21), 24)] 59 | b = [(('zzebra', 8), 24), (('sima', 12), 23), (('aeya', 5), 22)] 60 | from heapq import merge 61 | res = merge(merge_wrapper(a, desc=True), merge_wrapper(b, desc=True)) 62 | highest = 'zzzzzzzzz' 63 | for k, v in res: 64 | self.assertTrue(highest > k[0]) 65 | highest = k[0] 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /test/test_pipeworker.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle.core.pipeworker import sort_reader, disk_sort 3 | from StringIO import StringIO 4 | import os 5 | 6 | OUT_FILE = '/tmp/test_disk_sort' 7 | 8 | TEST_FILE = \ 9 | b"stuff\xff19\xffvalue1\x00" \ 10 | b"morestuff\xff29\xffvalue2\x00" \ 11 | b"reallylongkeyprobablylongerthanthebufferljkfdskjlkjjkjkjjjjjjjjjjjjjsfddfsfdsdfsdfsfdsfdsdfsfdsfdsdsffdsdfsdfsfdsdfsdfsdfsdfsfdsdsfdfsfdsfdsdsfdsffdsdfsdsfdsfdfsdfsfdssdfdfsdfsdfsdfsdfsfdsfdssdfdfs\xff15\xfffinalvalue\x00"\ 12 | 13 | EXPECTED = [ 14 | (["stuff", "19"], "value1"), 15 | (["morestuff", "29"], "value2"), 16 | (["reallylongkeyprobablylongerthanthebufferljkfdskjlkjjkjkjjjjjjjjjjjjjsfddfsfdsdfsdfsfdsfdsdfsfdsfdsdsffdsdfsdfsfdsdfsdfsdfsdfsfdsdsfdfsfdsfdsdsfdsffdsdfsdsfdsfdfsdfsfdssdfdfsdfsdfsdfsdfsfdsfdssdfdfs", "15"], "finalvalue"), 17 | ] 18 | 19 | RESPECTED = [ 20 | (["stuff", 1900], 'value1'), 21 | (["morestuff", 9], 'value2'), 22 | (["anymore", 290], 'value3'), 23 | (["stuff", 29], 'value4'), 24 | (["toeat", 1500], 'value5'), 25 | (["reallystuff", 15], 'finalvalue'), 26 | ] 27 | 28 | 29 | SOMENULLS = [ 30 | (["olay", 1900], 'value1'), 31 | (["morestuff", 9], 'value2'), 32 | (["anymore", 290], 'value3'), 33 | ([None, 29], 'value4'), 34 | (["toeat", 1500], 'value5'), 35 | (["reallystuff", 15], 'finalvalue'), 36 | ] 37 | 38 | 39 | class TestPipeworker(unittest.TestCase): 40 | def setUp(self): 41 | pass 42 | 43 | def _clean_ds_tmp(self): 44 | try: 45 | os.unlink(OUT_FILE) 46 | except: 47 | pass 48 | 49 | def test_sort_reader(self): 50 | for buf_size in [8, 16, 32, 64, 256, 8192]: 51 | infile = StringIO(TEST_FILE) 52 | for actual, expected in zip(sort_reader(infile, 
'test', buf_size), EXPECTED): 53 | self.assertListEqual(actual[0], expected[0]) 54 | self.assertEqual(actual[1], expected[1]) 55 | 56 | def test_simple_disk_sort(self): 57 | self._clean_ds_tmp() 58 | actual = [(key, value) for key, value in disk_sort(RESPECTED, OUT_FILE, (0, 1))] 59 | print "ACTUAL: ", actual 60 | self.assertEqual(actual[0][0][0], "anymore") 61 | self.assertEqual(actual[1][0][1], 9) 62 | self.assertEqual(actual[2][0][0], "reallystuff") 63 | self.assertEqual(actual[3][1], ()) # tests secondary sorting 64 | 65 | def test_positional_disk_sort(self): 66 | self._clean_ds_tmp() 67 | actual = [(key, value) for key, value in disk_sort(RESPECTED, OUT_FILE, [1])] 68 | print "ACTUAL: ", actual 69 | self.assertEqual(actual[0][0][0], "morestuff") 70 | self.assertEqual(actual[1][0][1], 15) 71 | self.assertEqual(actual[2][0][0], "stuff") 72 | self.assertEqual(actual[3][1], ()) 73 | self.assertEqual(actual[5][1], ()) 74 | 75 | def test_nulls(self): 76 | self._clean_ds_tmp() 77 | actual = [(key, value) for key, value in disk_sort(SOMENULLS, OUT_FILE, [0])] 78 | print "ACTUAL: ", actual 79 | self.assertEqual(actual[0][0][0], None) 80 | 81 | -------------------------------------------------------------------------------- /test/test_query_checker.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle.core.marble import Marble, check_query 3 | 4 | 5 | _FIELDS = ("+@4id", "+*name", "+$date", "+%2genre", "+@2rating", "artist", "@4quantity") 6 | _PARTITIONS = "date" 7 | _FIELDS_SELL = ("+@4id", "+@4item_id", "+$date", "@4store_id", "@4quantity", "$price") 8 | 9 | 10 | class TestChecker(unittest.TestCase): 11 | def setUp(self): 12 | self.albums = Marble(name="Albums", 13 | fields=_FIELDS, 14 | partition=_PARTITIONS) 15 | self.transaction = Marble(name="Transaction", 16 | fields=_FIELDS_SELL, 17 | partition=_PARTITIONS) 18 | self.single_where = [(self.albums.rating > 3)] 19 | self.multi_wheres = [(self.albums.rating > 3) & (self.albums.id == 1000)] 20 | self.cross_wheres = [self.albums.rating > 3, self.transaction.id == 1000] 21 | self.single_select = [self.albums.name] 22 | self.multi_select = [self.albums.name, self.albums.date, self.albums.rating] 23 | self.cross_select = [self.albums.name, self.albums.artist, 24 | self.transaction.store_id, self.transaction.price] 25 | self.order_by = [self.albums.quantity, self.albums.rating] 26 | self.join = [self.albums.id, self.transaction.item_id] 27 | self.join_invalid = [self.albums.id, self.transaction.price] 28 | self.join_invalid_1 = [self.albums.id, self.albums.id] 29 | self.join_invalid_2 = [self.albums.id, self.transaction.price] 30 | self.limit_single = 100 31 | self.limit_single_invalid = -100 32 | 33 | def test_select_clauses(self): 34 | # test empty select 35 | with self.assertRaises(ValueError): 36 | check_query([], 37 | [], 38 | self.order_by, 39 | None, 40 | self.single_where) 41 | # test duplicate select 42 | with self.assertRaises(ValueError): 43 | check_query(self.single_select + self.single_select, 44 | [], 45 | self.order_by, 46 | None, 47 | self.single_where) 48 | self.assertTrue(check_query(self.single_select, [], [], 49 | None, self.single_where)) 50 | 51 | def test_where_clauses(self): 52 | # should raise if a single table shows up in multi-wheres 53 | # should raise if where and select are from different tables 54 | with self.assertRaises(ValueError): 55 | check_query(self.single_select, 56 | [], 57 | [], 58 | self.order_by, 59 | [self.transaction.id == 1000]) 60 |
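# the positive case: a select and where drawn from the same table must validate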
self.assertTrue(check_query(self.single_select, [], [], 61 | None, self.single_where)) 62 | 63 | def test_join(self): 64 | # test join with single table 65 | with self.assertRaises(ValueError): 66 | check_query(self.single_select, 67 | self.join, 68 | [], 69 | None, 70 | self.single_where) 71 | 72 | # test invalid join 73 | with self.assertRaises(ValueError): 74 | check_query(self.single_select, 75 | self.join_invalid, 76 | [], 77 | None, 78 | self.cross_wheres) 79 | 80 | # test invalid join 81 | with self.assertRaises(ValueError): 82 | check_query(self.single_select, 83 | self.join_invalid_1, 84 | [], 85 | None, 86 | self.cross_wheres) 87 | 88 | # test invalid join 89 | with self.assertRaises(ValueError): 90 | check_query(self.single_select, 91 | self.join_invalid_2, 92 | [], 93 | None, 94 | self.cross_wheres) 95 | self.assertTrue(check_query(self.single_select, 96 | self.join, [], None, self.cross_wheres)) 97 | 98 | def test_order_by(self): 99 | # should raise if select columns don't contain the order column 100 | with self.assertRaises(ValueError): 101 | check_query(self.single_select, 102 | [], 103 | self.order_by, 104 | None, 105 | self.single_where) 106 | self.assertTrue(check_query(self.single_select, [], [self.albums.name], 107 | None, self.single_where)) 108 | 109 | def test_limit(self): 110 | with self.assertRaises(ValueError): 111 | check_query(self.single_select, 112 | [], 113 | [], 114 | self.limit_single_invalid, 115 | self.single_where) 116 | 117 | self.assertTrue( 118 | check_query(self.single_select, 119 | [], 120 | [], 121 | self.limit_single, 122 | self.single_where)) 123 | 124 | def test_full_query(self): 125 | self.assertTrue( 126 | check_query( 127 | self.cross_select, 128 | self.join, 129 | self.single_select, 130 | self.limit_single, 131 | self.cross_wheres)) 132 | -------------------------------------------------------------------------------- /test/test_rtrie.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | import rtrie 4 | import mdb 5 | from wtrie import Trie 6 | 7 | class TestRTrie(unittest.TestCase): 8 | def test_rtrie_in_memory(self): 9 | 10 | s = unicode(u'séllsink').encode('utf-8') 11 | #print "HELLSINK: %s" % s 12 | 13 | t = Trie() 14 | self.assertEqual(t.add('hello'), 1) 15 | self.assertEqual(t.add('hell'), 2) 16 | self.assertEqual(t.add('hello'), 1) 17 | self.assertEqual(t.add('hellothere'), 3) 18 | self.assertEqual(t.add('good'), 4) 19 | self.assertEqual(t.add('goodbye'), 5) 20 | self.assertEqual(t.add('hello'), 1) 21 | self.assertEqual(t.add('hellsink'), 6) 22 | self.assertEqual(t.add(s), 7) 23 | t.print_it() 24 | 25 | nodes, kids, _ = t.serialize() 26 | nodeaddr, nodelen = nodes.buffer_info() 27 | kidaddr, kidlen = kids.buffer_info() 28 | print "LENS %s %s" % (nodelen, kidlen) 29 | 30 | for i in range(8): 31 | val = rtrie.value_for_vid(nodeaddr, kidaddr, i) 32 | print "Value", i, val 33 | 34 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hello'), 1) 35 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hell'), 2) 36 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'goodbye'), 5) 37 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hellsink'), 6) 38 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hellothere'), 3) 39 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'good'), 4) 40 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, s), 7) 41 | self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 
'notthere')) 42 | self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'h')) 43 | self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'he')) 44 | self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'hel')) 45 | self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'hells')) 46 | 47 | def test_rtrie_in_mdb(self): 48 | t = Trie() 49 | self.assertEqual(t.add('hello'), 1) 50 | self.assertEqual(t.add('hell'), 2) 51 | self.assertEqual(t.add('hello'), 1) 52 | self.assertEqual(t.add('hellothere'), 3) 53 | self.assertEqual(t.add('good'), 4) 54 | self.assertEqual(t.add('goodbye'), 5) 55 | self.assertEqual(t.add('hello'), 1) 56 | self.assertEqual(t.add('hellsink'), 6) 57 | 58 | nodes, kids, _ = t.serialize() 59 | nodeaddr, nodelen = nodes.buffer_info() 60 | kidaddr, kidlen = kids.buffer_info() 61 | try: 62 | env = mdb.Env('/tmp/test_rtrie', flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR) 63 | txn = env.begin_txn() 64 | db = env.open_db(txn, name='_meta_', flags=mdb.MDB_CREATE) 65 | db.put_raw(txn, 'nodes', nodeaddr, nodelen) 66 | db.put_raw(txn, 'kids', kidaddr, kidlen) 67 | 68 | n, ns = db.get_raw(txn, 'nodes') 69 | k, ks = db.get_raw(txn, 'kids') 70 | txn.commit() 71 | env.close() 72 | 73 | env = mdb.Env('/tmp/test_rtrie', flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR) 74 | txn = env.begin_txn() 75 | db = env.open_db(txn, name='_meta_') 76 | 77 | n, ns = db.get_raw(txn, 'nodes') 78 | k, ks = db.get_raw(txn, 'kids') 79 | self.assertEqual(rtrie.vid_for_value(n, k, 'hello'), 1) 80 | self.assertEqual(rtrie.vid_for_value(n, k, 'hell'), 2) 81 | self.assertEqual(rtrie.vid_for_value(n, k, 'goodbye'), 5) 82 | self.assertEqual(rtrie.vid_for_value(n, k, 'hellsink'), 6) 83 | self.assertEqual(rtrie.vid_for_value(n, k, 'hellothere'), 3) 84 | self.assertEqual(rtrie.vid_for_value(n, k, 'good'), 4) 85 | self.assertIsNone(rtrie.vid_for_value(n, k, 'notthere')) 86 | 87 | txn.commit() 88 | env.close() 89 | finally: 90 | import os 91 | os.unlink('/tmp/test_rtrie') 92 | os.unlink('/tmp/test_rtrie-lock') 93 | 94 | -------------------------------------------------------------------------------- /test/test_stress_wtrie.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from wtrie import Trie 4 | from rtrie import value_for_vid, vid_for_value 5 | 6 | 7 | pwd = os.getcwd() 8 | if os.path.basename(pwd) != 'test': 9 | fixture = os.path.join(pwd, 'test/fixtures/keys') 10 | else: 11 | fixture = os.path.join(pwd, 'fixtures/keys') 12 | 13 | 14 | class TestStressWTrie(unittest.TestCase): 15 | def test_stress_wtrie(self): 16 | ktrie = Trie() 17 | strie = Trie() 18 | etrie = Trie() 19 | 20 | keywords = {} 21 | search_terms = {} 22 | exchange_ids = {} 23 | 24 | with open(fixture) as f: 25 | for data in f: 26 | for word in data.split(' '): 27 | vid = ktrie.add(word) 28 | actual_vid = keywords.get(word) 29 | if actual_vid is not None: 30 | self.assertEqual(vid, actual_vid) 31 | else: 32 | keywords[word] = vid 33 | 34 | vid = strie.add(data) 35 | actual_vid = search_terms.get(data) 36 | if actual_vid is not None: 37 | self.assertEqual(vid, actual_vid) 38 | else: 39 | search_terms[data] = vid 40 | 41 | nodes, kids, nodelen = etrie.serialize() 42 | naddr, nlen = nodes.buffer_info() 43 | kaddr, klen = kids.buffer_info() 44 | #summarize(naddr, kaddr, nodelen) 45 | #print_it(naddr, kaddr) 46 | 47 | for dc, vid in exchange_ids.iteritems(): 48 | rvid = etrie.add(dc) 49 | self.assertEqual(vid, rvid) 50 | 51 | print dc, vid 52 | value = 
value_for_vid(naddr, kaddr, vid) 53 | self.assertEqual(dc, value) 54 | if dc != value: 55 | print " dc=%s adc=%s" % (dc, value) 56 | 57 | avid = vid_for_value(naddr, kaddr, dc) 58 | #print "vid=%s avid=%s" % (vid, avid) 59 | self.assertEqual(vid, avid) 60 | -------------------------------------------------------------------------------- /test/test_table.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import Table 3 | 4 | class TestTable(unittest.TestCase): 5 | def test_create_syntax(self): 6 | full_columns = ['wide index uint32 x', 'index string y', 'int16 z', 'lz4 a', 'trie32 b', 'binary c'] 7 | full_fields = ['=@4x', '+$y', '#2z', '*a', '%4b', '&c'] 8 | fields = Table.parse_column_specs(full_columns) 9 | self.assertListEqual(fields, full_fields) 10 | 11 | default_columns = ['wide index x', 'index int y', 'uint z', 'trie b', 'c'] 12 | default_fields = ['=x', '+#y', '@z', '%b', 'c'] 13 | fields = Table.parse_column_specs(default_columns) 14 | self.assertListEqual(fields, default_fields) 15 | 16 | def test_create_errors(self): 17 | self.assertRaises(ValueError, Table.parse_column_specs, ['wide wide index x']) 18 | self.assertRaises(ValueError, Table.parse_column_specs, ['index wide x']) 19 | self.assertRaises(ValueError, Table.parse_column_specs, ['index blah16 x']) 20 | self.assertRaises(ValueError, Table.parse_column_specs, ['uint24 x']) 21 | -------------------------------------------------------------------------------- /test/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle.core.util import SortedIterator 3 | 4 | class TestSortedIterator(unittest.TestCase): 5 | 6 | def test_merges_sorted_inputs(self): 7 | data = [ 8 | [ 9 | ((1, 1), 'some_value'), 10 | ((1, 2), 'some_value'), 11 | ((1, 3), 'some_value') 12 | ], 13 | [ 14 | ((1, 100), 'some_value'), 15 | ((1, 200), 'some_value'), 16 | ((1, 300), 'some_value') 17 | ], 18 | [ 19 | ((1, 10), 'some_value'), 20 | ((1, 20), 'some_value'), 21 | ((1, 30), 'some_value') 22 | ], 23 | [ 24 | ((1, 4), 'some_value'), 25 | ((1, 40), 'some_value'), 26 | ((1, 400), 'some_value') 27 | ] 28 | ] 29 | sorted_iterator = SortedIterator(data) 30 | expected = [ 31 | ((1, 1), 'some_value'), 32 | ((1, 2), 'some_value'), 33 | ((1, 3), 'some_value'), 34 | ((1, 4), 'some_value'), 35 | ((1, 10), 'some_value'), 36 | ((1, 20), 'some_value'), 37 | ((1, 30), 'some_value'), 38 | ((1, 40), 'some_value'), 39 | ((1, 100), 'some_value'), 40 | ((1, 200), 'some_value'), 41 | ((1, 300), 'some_value'), 42 | ((1, 400), 'some_value')] 43 | self.assertListEqual(list(sorted_iterator), expected) 44 | 45 | def test_assumes_individual_inputs_are_already_sorted(self): 46 | data = [ 47 | [ 48 | ((2, 1), 'some_value'), 49 | ((1, 1), 'some_value'), 50 | ], 51 | [ 52 | ((4, 1), 'some_value'), 53 | ((3, 1), 'some_value'), 54 | ] 55 | ] 56 | sorted_iterator = SortedIterator(data) 57 | expected = [ 58 | ((2, 1), 'some_value'), 59 | ((1, 1), 'some_value'), 60 | ((4, 1), 'some_value'), 61 | ((3, 1), 'some_value')] 62 | self.assertListEqual(list(sorted_iterator), expected) 63 | 64 | def test_handles_duplicates(self): 65 | data = [ 66 | [ 67 | ((1, 1), 'some_value'), 68 | ((1, 2), 'some_value'), 69 | ], 70 | [ 71 | ((1, 1), 'some_value'), 72 | ((1, 2), 'some_value'), 73 | ((1, 3), 'some_value'), 74 | ], 75 | [ 76 | ((1, 3), 'some_value'), 77 | ] 78 | ] 79 | sorted_iterator = SortedIterator(data) 80 | expected = [ 81 | ((1, 1), 'some_value'), 82 | 
((1, 1), 'some_value'), 83 | ((1, 2), 'some_value'), 84 | ((1, 2), 'some_value'), 85 | ((1, 3), 'some_value'), 86 | ((1, 3), 'some_value')] 87 | self.assertListEqual(list(sorted_iterator), expected) 88 | 89 | def test_handles_empty_input(self): 90 | data = [ 91 | [((1, 1), 'some_value')], 92 | [], # <----- empty input 93 | [((2, 1), 'some_value')], 94 | ] 95 | sorted_iterator = SortedIterator(data) 96 | expected = [ 97 | ((1, 1), 'some_value'), 98 | ((2, 1), 'some_value')] 99 | self.assertListEqual(list(sorted_iterator), expected) 100 | -------------------------------------------------------------------------------- /test/test_wtrie.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import unittest 3 | from wtrie import Trie 4 | 5 | 6 | class TestWTrie(unittest.TestCase): 7 | def test_wtrie(self): 8 | t = Trie() 9 | self.assertEqual(t.add('hello'), 1) 10 | self.assertEqual(t.add('hell'), 2) 11 | self.assertEqual(t.add('hello'), 1) 12 | self.assertEqual(t.add('hellothere'), 3) 13 | self.assertEqual(t.add('good'), 4) 14 | self.assertEqual(t.add('goodbye'), 5) 15 | self.assertEqual(t.add('hello'), 1) 16 | self.assertEqual(t.add('hellsink'), 6) 17 | self.assertEqual(t.add(''), 0) 18 | 19 | # nodes = t.nodes 20 | # t.print_it() 21 | 22 | key, sz, pt = t.node_at_path() 23 | self.assertEqual(sz, 2) 24 | 25 | key, sz, pt = t.node_at_path(104) 26 | self.assertEqual(key, 'hell') 27 | self.assertEqual(pt, 0) 28 | self.assertEqual(sz, 2, 'actual %s' % sz) 29 | 30 | key2, sz, pt = t.node_at_path(104, 111) 31 | self.assertEqual(key2, 'o', 'actual %s' % key) 32 | self.assertEqual(pt, 2) 33 | self.assertEqual(sz, 1) 34 | 35 | key, sz, pt = t.node_at_path(104, 111, 116) 36 | self.assertEqual(key, 'there') 37 | self.assertEqual(pt, 1) 38 | self.assertEqual(sz, 0) 39 | 40 | n, k, _ = t.serialize() 41 | self.assertEqual(len(n), 7 * 4, "actual %d" % len(n)) 42 | self.assertEqual(len(k), 100, "actual %d" % len(k)) 43 | # print "sqork: %s" % t.kid_space 44 | 45 | print 'nodes', n 46 | print 'kids', k 47 | 48 | unpacked = struct.unpack_from("7I", n, 0) 49 | expected = (0x02000000, 0x01000010, 0x0200000b, 0x00000013, 0x01000004, 0x00000008, 0x00000016) 50 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 51 | 52 | unpacked = struct.unpack_from("IH2I", k, 0) 53 | expected = (0, 0, 0x67000004, 0x68000002) 54 | self.assertEqual(unpacked, expected, unpacked) 55 | 56 | unpacked = struct.unpack_from("IH4cI", k, 16) 57 | expected = (0x0000, 0x0004, 'g', 'o', 'o', 'd', 0x62000005) 58 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 59 | 60 | unpacked = struct.unpack_from("IH3c", k, 32) 61 | expected = (0x0004, 0x0003, 'b', 'y', 'e') 62 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 63 | 64 | unpacked = struct.unpack_from("IH4c2I", k, 44) 65 | expected = (0x0000, 0x0004, 'h', 'e', 'l', 'l', 0x6f000001, 0x73000006) 66 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 67 | 68 | unpacked = struct.unpack_from("IHcI", k, 64) 69 | expected = (0x0002, 1, 'o', 0x74000003) 70 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 71 | 72 | unpacked = struct.unpack_from("IH5c", k, 76) 73 | expected = (0x0001, 0x0005, 't', 'h', 'e', 'r', 'e') 74 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 75 | 76 | unpacked = struct.unpack_from("IH4c", k, 88) 77 | expected = (0x0002, 0x0004, 's', 'i', 'n', 'k') 78 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 79 
| --------------------------------------------------------------------------------
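The tests above collectively exercise Hustle's public API: Table.create/from_tag, insert, select with where/order_by/limit/distinct, delete/drop, and result purging. A minimal end-to-end sketch stitched together from exactly those calls; the table name example_imps is invented for illustration, while the server address, field spec, and fixture path are copied from the tests above:

from hustle import Table, insert, select, drop
from hustle.core.settings import Settings, overrides

# point Hustle at a local Disco cluster, as every test's setUp does
overrides['server'] = 'disco://localhost'
overrides['dump'] = False
overrides['nest'] = False
settings = Settings()

# create a date-partitioned table with the same field spec as the tests
imps = Table.create('example_imps',
                    fields=['=$token', '%url', '+%site_id', '@cpm_millis',
                            '+#ad_id', '+$date', '+@time'],
                    partition='date',
                    force=True)
insert(imps, File='fixtures/imps.json')

# query it: a partition filter plus ordering, then clean up the result
res = select(imps.ad_id, imps.cpm_millis,
             where=imps.date == '2014-01-27',
             order_by=imps.cpm_millis)
for ad_id, cpm in res:
    print ad_id, cpm
res.purge()
drop(imps)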