├── .gitignore ├── .travis.yml ├── CHANGELOG ├── LICENSE ├── README.md ├── bin ├── __init__.py ├── hustle └── insert ├── bootstrap.sh ├── deps ├── .gitignore ├── AUTHORS.txt ├── INSTALL.txt ├── LICENSE.txt ├── Makefile ├── cardunion │ ├── cardunion.pyx │ └── test │ │ └── cardunion_test.py ├── libebset │ ├── LICENSE │ ├── boolarray.h │ ├── ewah.h │ ├── ewahutil.h │ ├── pyebset.pyx │ ├── runninglengthword.h │ └── test │ │ └── pyebset_test.py ├── liblmdb │ ├── COPYRIGHT │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── cmdb.pxd │ ├── db.pyx │ ├── lmdb.h │ ├── mdb.c │ ├── mdb_copy.1 │ ├── mdb_copy.c │ ├── mdb_stat.1 │ ├── mdb_stat.c │ ├── midl.c │ ├── midl.h │ ├── setup.py │ └── test │ │ ├── test_cursor.py │ │ ├── test_db.py │ │ ├── test_greater_failure.py │ │ ├── test_intdb.py │ │ ├── test_intintdb.py │ │ ├── test_intreader.py │ │ ├── test_reader.py │ │ └── test_strintdb.py ├── liblru │ ├── clru.h │ └── pylru.pyx ├── liblz4 │ ├── clz4.c │ ├── lz4.c │ ├── lz4.h │ ├── lz4hc.c │ └── lz4hc.h ├── liblzf │ ├── LICENSE │ ├── clzf.c │ ├── lzf.h │ ├── lzfP.h │ ├── lzf_c.c │ └── lzf_d.c ├── librtrie │ ├── main.c │ ├── pyrtrie.c │ ├── rtrie.c │ └── rtrie.h ├── libwtrie │ ├── test │ │ └── test_wtrie.py │ └── wtrie.pyx ├── maxhash │ ├── maxhash.pyx │ └── test │ │ └── maxhash_test.py ├── scamurmur3 │ ├── murmur3.c │ ├── murmur3.h │ └── scamurmur3.c └── setup.py ├── doc ├── .gitignore ├── Makefile ├── _static │ └── hustle.png ├── _templates │ └── layout.html ├── api │ ├── core.rst │ └── hustle.rst ├── conf.py ├── howto │ ├── cli.rst │ ├── configure.rst │ ├── delete.rst │ ├── insert.rst │ ├── integration_tests.rst │ ├── query.rst │ └── schema.rst ├── hustle.png ├── index.rst └── start │ ├── install.rst │ └── tutorial.rst ├── hustle ├── __init__.py ├── cardinality.py └── core │ ├── __init__.py │ ├── column_fn.py │ ├── marble.py │ ├── pipeline.py │ ├── pipeworker.py │ ├── settings.py │ ├── stat.py │ └── util.py ├── integration_test ├── README ├── fixtures │ ├── imps.json │ ├── ip.json │ └── pixel.json ├── setup.py ├── test_aggregation.py ├── test_bool.py ├── test_cardinality.py ├── test_column_fn.py ├── test_drop.py ├── test_join.py ├── test_project_order.py └── test_simple_query.py ├── requirements-dev.txt ├── requirements.txt ├── settings.yaml ├── setup.py └── test ├── fixtures └── keys ├── test_column.py ├── test_expression.py ├── test_lru_dict.py ├── test_marble.py ├── test_merge_wrapper.py ├── test_pipeline.py ├── test_pipeworker.py ├── test_query_checker.py ├── test_rtrie.py ├── test_stress_wtrie.py ├── test_table.py ├── test_util.py └── test_wtrie.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | *.a 8 | 9 | # Distribution / packaging 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | doc/_build 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | .idea 24 | .ropeproject 25 | 26 | # Installer logs 27 | pip-log.txt 28 | pip-delete-this-directory.txt 29 | 30 | # Vim 31 | *.swp 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | group: deprecated-2017Q4 4 | 5 | language: python 6 | notifications: 7 | email: false 8 | 9 | python: 10 | - "2.7" 11 | 12 | virtualenv: 13 | system_site_packages: true 14 | 15 | before_install: 16
| - pushd . 17 | - sudo apt-get update -qq 18 | - sudo apt-get install -y curl erlang python-dev 19 | - git clone https://github.com/discoproject/disco.git /tmp/disco 20 | - cd /tmp/disco && git checkout develop && sudo make install 21 | - cd lib && pip install . 22 | - popd 23 | 24 | install: 25 | - sudo bash ./bootstrap.sh 26 | - sudo pip install -r requirements-dev.txt 27 | 28 | script: nosetests test/ 29 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 0.2.7 (Nov 7 2014) 2 | Fixed issues in libebset for 32-bit architectures 3 | 4 | 0.2.6 (July 29 2014) 5 | Optimized #48. Making "limit" run faster 6 | Added column functions. Being able to do column transforms, e.g. ip_aton, 7 | ip_ntoa... 8 | Added aggregate function h_combine. Being able to combine multiple values 9 | with similar keys 10 | Added feature #66. Making query results purgeable 11 | Added "server" option for the Hustle CLI 12 | 13 | 0.2.5 (June 27 2014) 14 | Fixed issue #50. BooleanIx didn't handle missing values 15 | Added feature #11. Tag the query result in DDFS 16 | Added feature #49. Auto purge query related data if applicable 17 | Added feature to set the max cores that could be used by a query 18 | Added feature to enable profiling for a query 19 | Fixed the preprocess function of insert to support filtering 20 | Added option for the insert function to leave the marble for other uses 21 | 22 | 0.2.4 (May 29 2014) 23 | Fixed issue #44. BooleanIx kept an invalid txn, crashed the insert 24 | Upgraded the exception handling to let Disco retry the failed task 25 | 26 | 0.2.3 (May 16 2014) 27 | Fixed issue #39. Partition can't handle integer types 28 | Fixed issue in the Future class to be compatible with cat() 29 | Added feature #41. Support partition filters 30 | Optimized IO performance, sped up tight loops 31 | 32 | 0.2.2 (April 30 2014) 33 | Fixed issue #34. Wrong data types for aggregation columns 34 | Added decoder for CSV-like files 35 | Optimized performance for aggregation queries 36 | 37 | 0.2.1 (April 22 2014) 38 | Fixed wrong argument of mget in MarbleStream 39 | Fixed the client of EWAHBoolArray to check the index faster 40 | Fixed the name collision of nested tables 41 | Added PyYAML to the setup.py 42 | Unified 'dump' and 'edump' into the new function 'cat' 43 | 44 | 0.2.0 (April 21 2014) 45 | Added feature #11. Serialization of Hustle tables 46 | Added feature #15. Compression of partition data 47 | Added feature #21. Optimization for in-stage combine 48 | Added feature #22. Boolean data type 49 | Added feature #25. Support for csv_decoder 50 | Added feature #27. Support for HyperLogLog and MinHash aggregation 51 | Added feature #32. Optimization for duplicate adjacent values 52 | Added functionality to get statistical information of Hustle tables 53 | Fixed issue #26. Can't insert data with a huge number of partitions 54 | Known issues to fix: #34 55 | 56 | 0.1.3 (March 19 2014) 57 | Added mget() for the lmdb Python client 58 | Improved the hustle_input_stream by using mget() 59 | Upgraded EWAHBoolArray from upstream 60 | Fixed a bug in libebset. len() is inaccurate on inverted bitsets 61 | 62 | 0.1.2 (March 13 2014) 63 | Added feature #16. Alias column or aggregate functions 64 | Added feature #18. Non-blocking select() 65 | Fixed #17 66 | 67 | 0.1.1 (March 12 2014) 68 | Added feature #8. 'delete' and 'drop' table 69 | Added feature #9.
Query supports 'in' and 'not in' 70 | Simplified 'join' clause #10 71 | Fixed #19 72 | 73 | 0.1.0 (March 6 2014) 74 | Initial release 75 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is the MIT license: http://www.opensource.org/licenses/mit-license.php 2 | 3 | Copyright (c) 2014 Chango Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | software and associated documentation files (the "Software"), to deal in the Software 7 | without restriction, including without limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 9 | to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or 12 | substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | DEALINGS IN THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Hustle](doc/_static/hustle.png) 2 | 3 | A column-oriented, embarrassingly distributed, relational event database. 4 | 5 | Features 6 | -------- 7 | 8 | * column-oriented - super fast queries 9 | * events - write-only semantics 10 | * distributed insert - designed for petabyte-scale distributed datasets with massive write loads 11 | * compressed - bitmap indexes, lz4, and prefix trie compression 12 | * relational - join gigantic data sets 13 | * partitioned - smart shards 14 | * embarrassingly distributed ([based on Disco](http://discoproject.org/)) 15 | * embarrassingly fast ([uses LMDB](http://symas.com/mdb/)) 16 | * NoSQL - Python DSL 17 | * bulk append-only semantics 18 | * highly available, horizontally scalable 19 | * REPL/CLI query interface 20 | 21 | Example Query 22 | ------------- 23 | 24 | ``` 25 | select(impressions.ad_id, impressions.date, h_sum(pix.amount), h_count(), 26 | where=((impressions.date < '2014-01-13') & (impressions.ad_id == 30010), 27 | pix.date < '2014-01-13'), 28 | join=(impressions.site_id, pix.site_id), 29 | order_by=impressions.date) 30 | ``` 31 | 32 | 33 | Installation 34 | ------------ 35 | 36 | After cloning this repo, here are some considerations: 37 | 38 | * you will need Python 2.7 or higher - note that it *probably* won't work on 2.6 (has to do with pickling lambdas...)
39 | * you need to install Disco 0.5 and its dependencies - get that working first 40 | * you need to install Hustle and its 'deps' as follows: 41 | 42 | ``` 43 | cd hustle 44 | sudo ./bootstrap.sh 45 | ``` 46 | 47 | Please refer to the [Installation Guide](http://tspurway.github.io/hustle/start/install.html) for more details 48 | 49 | Documentation 50 | ------------- 51 | 52 | [Hustle User Guide](http://tspurway.github.io/hustle/) 53 | 54 | [Hustle Mailing List](http://groups.google.com/group/hustle-users) 55 | 56 | Credits 57 | ------- 58 | 59 | Special thanks to the following open-source projects: 60 | 61 | * [EWAHBoolArray](https://github.com/lemire/EWAHBoolArray) 62 | * [disco](http://discoproject.org/) 63 | * [liblmdb](http://symas.com/mdb/) 64 | * [lz4](https://code.google.com/p/lz4/) 65 | * [ultrajson](https://github.com/esnme/ultrajson) 66 | 67 | [![Build Status](https://travis-ci.org/tspurway/hustle.svg?branch=master)](https://travis-ci.org/tspurway/hustle) 68 | -------------------------------------------------------------------------------- /bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tspurway/hustle/e62bf1269b446ea6fae23bc5698f845a2f3247c7/bin/__init__.py -------------------------------------------------------------------------------- /bin/insert: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | if __name__ == '__main__': 4 | from argparse import ArgumentParser 5 | from disco.func import disco_input_stream 6 | from disco.comm import open_url 7 | from hustle import Table, insert 8 | from hustle.core.marble import csv_decoder 9 | from functools import partial 10 | 11 | parser = ArgumentParser(prog='insert', description="Hustle bulk load") 12 | 13 | parser.add_argument( 14 | "-s", 15 | "--server", 16 | dest="server", 17 | help="DDFS server destination", 18 | default='disco://localhost' 19 | ) 20 | 21 | parser.add_argument( 22 | "-f", 23 | dest="infile", 24 | help="A file containing a list of all files to be inserted", 25 | ) 26 | 27 | parser.add_argument( 28 | "-m", 29 | "--maxsize", 30 | dest="maxsize", 31 | help="Initial size of the Hustle marble", 32 | default=1024*1024*1024 33 | ) 34 | 35 | parser.add_argument( 36 | "-t", 37 | "--tmpdir", 38 | dest="tmpdir", 39 | help="Temporary directory for Hustle marble creation", 40 | default='/tmp' 41 | ) 42 | 43 | parser.add_argument( 44 | "-p", 45 | dest="processor", 46 | help="A module.function for the Hustle import preprocessor", 47 | ) 48 | 49 | parser.add_argument( 50 | "--disco-chunk", 51 | dest="disco_chunk", 52 | help="Indicates whether the input files are in Disco CHUNK format", 53 | default=False, 54 | action='store_true' 55 | ) 56 | 57 | parser.add_argument( 58 | "--csv-fields", 59 | dest="csv_fields", 60 | help="Assume input files are CSV with the comma-separated list of fields provided", 61 | ) 62 | 63 | parser.add_argument( 64 | "--delimiter", 65 | dest="delimiter", 66 | help="For CSV input, this is the delimiter", 67 | default=',' 68 | ) 69 | 70 | parser.add_argument( 71 | "table", 72 | metavar='TABLE', 73 | type=str, 74 | help="The Hustle table to insert to", 75 | ) 76 | 77 | parser.add_argument( 78 | "files", 79 | metavar='FILES', 80 | type=str, 81 | nargs="+", 82 | help="The files to insert", 83 | ) 84 | 85 | options = parser.parse_args() 86 | 87 | tab = Table.from_tag(options.table, server=options.server) 88 | 89 | if options.infile: 90 | fd = open(options.infile) 91 | files =
[line.strip() for line in fd]  # strip newlines so open_url gets clean paths 92 | else: 93 | files = options.files 94 | 95 | decoder = None 96 | if options.csv_fields: 97 | decoder = partial(csv_decoder, fieldnames=options.csv_fields.split(options.delimiter)) 98 | 99 | preproc = None 100 | if options.processor: 101 | spec = options.processor.split('.') 102 | modname = '.'.join(spec[:-1]) 103 | funcname = spec[-1] 104 | mod = __import__(modname, fromlist=[funcname]) 105 | preproc = getattr(mod, funcname) 106 | 107 | streams = [] 108 | for f in files: 109 | s = open_url(f) 110 | if options.disco_chunk: 111 | s = disco_input_stream(s, None, None) 112 | streams.append(s) 113 | 114 | insert(tab, streams=streams, preprocess=preproc, 115 | maxsize=int(options.maxsize), tmpdir=options.tmpdir, server=options.server, 116 | lru_size=25000) 117 | -------------------------------------------------------------------------------- /bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | PYTHON=python 6 | CONFIG_DIR=/etc/hustle 7 | BIN_DIR=/usr/local/bin 8 | WORK_DIR=${PWD} 9 | FROM=${WORK_DIR}/settings.yaml 10 | DEST=${CONFIG_DIR}/settings.yaml 11 | 12 | pip install -r requirements.txt 13 | cd deps && make install 14 | cd ${WORK_DIR} && pip install . 15 | cp bin/hustle ${BIN_DIR}/hustle 16 | 17 | if [[ ! -d ${CONFIG_DIR} ]]; then 18 | mkdir ${CONFIG_DIR} 19 | elif [[ -f ${DEST} ]]; then 20 | read -p "Settings file already exists, overwrite it? [Yes/No]: " rc; 21 | if [[ "$rc" =~ [Nn][Oo] ]]; then 22 | exit 0 23 | fi 24 | fi 25 | cp ${FROM} ${DEST} 26 | -------------------------------------------------------------------------------- /deps/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | cardunion/cardunion.c 3 | libwtrie/wtrie.c 4 | maxhash/maxhash.c 5 | liblru/pylru.cpp 6 | liblmdb/db.c 7 | liblmdb/mdb_copy 8 | liblmdb/mdb_stat 9 | libebset/pyebset.cpp 10 | *.pyc 11 | *.o 12 | *.a 13 | *.so 14 | *.swp 15 | -------------------------------------------------------------------------------- /deps/AUTHORS.txt: -------------------------------------------------------------------------------- 1 | Tim Spurway 2 | Nan Jiang 3 | Shayan Pooya 4 | -------------------------------------------------------------------------------- /deps/INSTALL.txt: -------------------------------------------------------------------------------- 1 | INSTALL 2 | ======= 3 | 4 | 1. Install C Library 5 | -------------------- 6 | make 7 | sudo make install 8 | 9 | 2. Install Python Library 10 | ------------------------- 11 | sudo python setup.py install 12 | -------------------------------------------------------------------------------- /deps/LICENSE.txt: -------------------------------------------------------------------------------- 1 | This is the MIT license: http://www.opensource.org/licenses/mit-license.php 2 | 3 | Copyright (c) 2013 Chango Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | software and associated documentation files (the "Software"), to deal in the Software 7 | without restriction, including without limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 9 | to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or 12 | substantial portions of the Software.
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | DEALINGS IN THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /deps/Makefile: -------------------------------------------------------------------------------- 1 | # Top-level Makefile; liblmdb is the only thing that needs to be compiled 2 | 3 | RM = rm -rf 4 | PYTHON = python 5 | LMDB = liblmdb 6 | 7 | default: all 8 | 9 | .DEFAULT: 10 | cd $(LMDB) && $(MAKE) $@ 11 | $(PYTHON) setup.py build 12 | 13 | install: 14 | cd $(LMDB) && $(MAKE) $@ 15 | $(PYTHON) setup.py install 16 | 17 | .PHONY: clean 18 | 19 | clean: 20 | cd $(LMDB) && $(MAKE) $@ 21 | $(RM) build 22 | -------------------------------------------------------------------------------- /deps/cardunion/test/cardunion_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from math import sqrt 3 | from cardunion import Cardunion 4 | 5 | 6 | class TestCardunion(unittest.TestCase): 7 | def setUp(self): 8 | self.log2m = 12 9 | self.error = 1.04 / sqrt(2 ** self.log2m) 10 | 11 | def test_mid_range_with_strings(self): 12 | self.execute(10000, self.log2m, self.error) 13 | 14 | def test_long_range_with_strings(self): 15 | self.execute(100000, self.log2m, self.error) 16 | 17 | def test_low_range_with_strings(self): 18 | self.execute(100, self.log2m, self.error) 19 | 20 | def execute(self, set_size, m, p): 21 | hll = Cardunion(m) 22 | for i in range(set_size): 23 | hll.add(str(i)) 24 | 25 | estimate = hll.count() 26 | error = abs(estimate / float(set_size) - 1) 27 | self.assertLess(error, p) 28 | 29 | def test_with_duplicates(self): 30 | hll = Cardunion(self.log2m) 31 | set_size = 100000 32 | for i in range(set_size): 33 | if i % 3: 34 | hll.add(str(i + 1)) 35 | else: 36 | hll.add(str(i)) 37 | 38 | estimate = hll.count() 39 | expected = set_size * 2.0 / 3.0 40 | error = abs(estimate / float(expected) - 1) 41 | self.assertLess(error, self.error) 42 | 43 | def test_with_heavy_duplicates(self): 44 | hll = Cardunion(self.log2m) 45 | set_size = 100000 46 | for i in range(set_size): 47 | if i % 2 or i < set_size / 2: 48 | hll.add(str(1)) 49 | else: 50 | hll.add(str(i)) 51 | 52 | estimate = hll.count() 53 | expected = set_size * 1.0 / 4.0 54 | error = abs(estimate / float(expected) - 1) 55 | self.assertLess(error, self.error) 56 | 57 | def test_dumps(self): 58 | hll = Cardunion(self.log2m) 59 | hll_copy = Cardunion(self.log2m) 60 | for i in range(10000): 61 | hll.add(str(i)) 62 | 63 | hll_copy.loads(hll.dumps()) 64 | self.assertEqual(hll.count(), hll_copy.count()) 65 | 66 | def test_sparse_dumps(self): 67 | hll = Cardunion(self.log2m) 68 | hll_copy = Cardunion(self.log2m) 69 | for i in range(500): 70 | hll.add(str(i)) 71 | 72 | hll_copy.loads(hll.dumps()) 73 | self.assertEqual(hll.count(), hll_copy.count()) 74 | 75 | def test_union(self): 76 | hll = Cardunion(self.log2m) 77 | hll_1 = Cardunion(self.log2m) 78 | for i in range(10000): 79 | hll.add(str(i)) 80 | for i in range(10000, 20000): 81 | hll_1.add(str(i)) 82 | 83 | hll.union([hll_1]) 84 | estimate = hll.count() 85 | error =
abs(estimate / float(20000) - 1) 86 | self.assertLess(error, self.error) 87 | 88 | def test_bunion(self): 89 | hll = Cardunion(self.log2m) 90 | hll_1 = Cardunion(self.log2m) 91 | hll_2 = Cardunion(self.log2m) 92 | for i in range(10000): 93 | hll.add(str(i)) 94 | for i in range(10000, 20000): 95 | hll_1.add(str(i)) 96 | for i in range(20000, 30000): 97 | hll_2.add(str(i)) 98 | 99 | hll.bunion([hll_1.dumps(), hll_2.dumps()]) 100 | estimate = hll.count() 101 | error = abs(estimate / float(30000) - 1) 102 | self.assertLess(error, self.error) 103 | 104 | def test_intersect(self): 105 | """Since there is no theoretical error bound for intersection, 106 | we use the 3-sigma rule instead. 107 | """ 108 | hll = Cardunion() 109 | hll_1 = Cardunion() 110 | for i in range(10000): 111 | hll.add(str(i)) 112 | for i in range(5000, 15000): 113 | hll_1.add(str(i)) 114 | 115 | estimate, error, _ = Cardunion.intersect([hll_1, hll]) 116 | print estimate, error 117 | self.assertTrue(5000 - 3 * error <= estimate <= 5000 + 3 * error) 118 | 119 | def test_intersect_big_small(self): 120 | hll = Cardunion() 121 | hll_1 = Cardunion() 122 | for i in range(50): 123 | hll.add(str(i)) 124 | for i in range(1, 100000): 125 | hll_1.add(str(i)) 126 | 127 | estimate, error, _ = Cardunion.intersect([hll_1, hll]) 128 | print estimate, error 129 | self.assertTrue(50 - 3 * error <= estimate <= 50 + 3 * error) 130 | 131 | def test_intersect_a_few(self): 132 | hll = Cardunion() 133 | hll_1 = Cardunion() 134 | hll_2 = Cardunion() 135 | for i in range(5000): 136 | hll.add(str(i)) 137 | for i in range(1, 100000): 138 | hll_1.add(str(i)) 139 | for i in range(25, 1000): 140 | hll_2.add(str(i)) 141 | 142 | estimate, error, _ = Cardunion.intersect([hll_2, hll_1, hll]) 143 | print estimate, error 144 | self.assertTrue(975 - 3 * error <= estimate <= 975 + 3 * error) 145 | 146 | def test_intersect_a_lot(self): 147 | hlls = [] 148 | actual = 100000 149 | nset = 10 150 | for i in range(nset): 151 | hll = Cardunion() 152 | for j in range(actual): 153 | hll.add(str(i * 5000 + j)) 154 | hlls.append(hll) 155 | 156 | estimate, error, _ = Cardunion.intersect(hlls) 157 | print estimate, error 158 | self.assertTrue(actual - (nset - 1) * 5000 - 3 * error 159 | <= estimate <= actual - (nset - 1) * 5000 + 3 * error) 160 | 161 | def test_nonzero_counters(self): 162 | h = Cardunion() 163 | h.update_counter(1, 2) 164 | h.update_counter(3, 4) 165 | h.update_counter(5, 8) 166 | self.assertEqual(list(h.nonzero_counters), [(1, 2), (3, 4), (5, 8)]) 167 | -------------------------------------------------------------------------------- /deps/libebset/pyebset.pyx: -------------------------------------------------------------------------------- 1 | from libc.stdint cimport uint64_t, uint32_t 2 | from cython.operator cimport dereference as deref, preincrement as inc 3 | from libcpp.vector cimport vector 4 | from libcpp.string cimport string 5 | 6 | IF UNAME_MACHINE != "x86_64": 7 | ctypedef uint32_t uint_t 8 | ELSE: 9 | ctypedef uint64_t uint_t 10 | 11 | cdef extern from "<ostream>" namespace "std": 12 | cdef cppclass ostream[T]: 13 | pass 14 | 15 | cdef extern from "<sstream>" namespace "std": 16 | cdef cppclass stringstream: 17 | stringstream() except + 18 | string str() 19 | ostream write(char *, size_t) 20 | 21 | cdef extern from "<algorithm>" namespace "std": 22 | cdef bint binary_search(vector[size_t].iterator, 23 | vector[size_t].iterator, 24 | uint_t&) 25 | 26 | cdef extern from "ewah.h": 27 | cdef cppclass EWAHBoolArray[T]: 28 | EWAHBoolArray() nogil except + 29 | bint set(size_t i)
nogil 30 | bint get(size_t i) nogil 31 | void logicaland(EWAHBoolArray&, EWAHBoolArray&) nogil 32 | void logicalor(EWAHBoolArray&, EWAHBoolArray&) nogil 33 | void logicalnot(EWAHBoolArray&) nogil 34 | size_t sizeInBytes() nogil 35 | void write(stringstream &, bint) nogil 36 | void read(stringstream &, bint) nogil 37 | vector[size_t] toArray() nogil 38 | size_t numberOfOnes() nogil 39 | bint operator==(EWAHBoolArray&) nogil 40 | bint operator!=(EWAHBoolArray&) nogil 41 | size_t sizeInBits() nogil 42 | void reset() nogil 43 | void inplace_logicalnot() nogil 44 | 45 | 46 | cdef class BitSet: 47 | cdef EWAHBoolArray[uint_t] *thisptr 48 | cdef vector[size_t] indexes 49 | cdef bint updated 50 | 51 | def __cinit__(self): 52 | self.thisptr = new EWAHBoolArray[uint_t]() 53 | self.updated = True 54 | 55 | def __dealloc__(self): 56 | del self.thisptr 57 | 58 | def __setitem__(self, key, value): 59 | if value: 60 | self.set(key) 61 | 62 | def __getitem__(self, key): 63 | return self.thisptr.get(key) 64 | 65 | def set(self, size_t i): 66 | if self.thisptr.set(i): 67 | self.updated = True 68 | return True 69 | else: 70 | return False 71 | 72 | def get(self, size_t i): 73 | return self.thisptr.get(i) 74 | 75 | def dumps(self): 76 | cdef stringstream s 77 | 78 | self.thisptr.write(s, True) 79 | return s.str() 80 | 81 | def loads(self, s): 82 | cdef stringstream ss 83 | 84 | ss.write(s, len(s)) 85 | self.thisptr.read(ss, True) 86 | self.updated = True 87 | return 88 | 89 | def size_in_bytes(self): 90 | return self.thisptr.sizeInBytes() 91 | 92 | def size_in_bits(self): 93 | return self.thisptr.sizeInBits() 94 | 95 | def reset(self): 96 | self.thisptr.reset() 97 | 98 | cpdef BitSet land(self, BitSet other): 99 | cdef BitSet s = BitSet() 100 | 101 | self.thisptr.logicaland(deref(other.thisptr), deref(s.thisptr)) 102 | return s 103 | 104 | def __and__(self, other): 105 | return self.land(other) 106 | 107 | cpdef BitSet lor(self, BitSet other): 108 | cdef BitSet s = BitSet() 109 | 110 | self.thisptr.logicalor(deref(other.thisptr), deref(s.thisptr)) 111 | return s 112 | 113 | def __or__(self, other): 114 | return self.lor(other) 115 | 116 | cpdef BitSet lnot(self): 117 | cdef BitSet s = BitSet() 118 | 119 | self.thisptr.logicalnot(deref(s.thisptr)) 120 | return s 121 | 122 | def lnot_inplace(self): 123 | self.thisptr.inplace_logicalnot() 124 | self.updated = True 125 | return self 126 | 127 | def __richcmp__(BitSet l, BitSet r, int op): 128 | cdef bint e 129 | 130 | if op == 2: 131 | e = (deref(l.thisptr) == deref(r.thisptr)) 132 | elif op == 3: 133 | e = (deref(l.thisptr) != deref(r.thisptr)) 134 | else: 135 | raise AttributeError("Unsupported operators.") 136 | return e 137 | 138 | def __invert__(self): 139 | return self.lnot() 140 | 141 | def __iter__(self): 142 | cdef vector[size_t] v = self.thisptr.toArray() 143 | cdef size_t i 144 | 145 | IF UNAME_SYSNAME == "Linux": 146 | cdef vector[uint_t].iterator it = v.begin() 147 | while it != v.end(): 148 | i = deref(it) 149 | yield i 150 | inc(it) 151 | ELSE: 152 | # the clang compiler on Mac OS X reports an error for the code above 153 | return (i for i in v) 154 | 155 | def __len__(self): 156 | return self.thisptr.numberOfOnes() 157 | 158 | def __str__(self): 159 | return self.dumps() 160 | 161 | def __contains__(self, size_t v): 162 | if self.updated or (self.indexes.size() == 0): 163 | self.indexes = self.thisptr.toArray() 164 | self.updated = False 165 | 166 | return binary_search(self.indexes.begin(), self.indexes.end(), v) 167 |
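# ---------------------------------------------------------------------------
# Hedged usage sketch (an illustrative addition, not part of the original
# module): exercising the BitSet API defined above, mirroring the semantics
# asserted in pyebset_test.py. The __main__ guard never fires on import.
if __name__ == "__main__":
    a = BitSet()
    a.set(0); a.set(4); a.set(8)     # set() returns False for an already-set bit
    b = BitSet()
    b.set(4); b.set(9)

    print list(a & b)                # logical AND -> [4]
    print list(a | b)                # logical OR  -> [0, 4, 8, 9]
    print list(~a)                   # logical NOT, bounded by the highest set bit
    print len(a), 4 in a, 5 in a     # numberOfOnes() and binary-search membership

    s = a.dumps()                    # serialize to a byte string ...
    c = BitSet()
    c.loads(s)                       # ... and restore an equal bitset
    assert c == a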
-------------------------------------------------------------------------------- /deps/libebset/test/pyebset_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pyebset import BitSet 3 | 4 | 5 | class BitSetTest(unittest.TestCase): 6 | """Tests for the BitSet wrapper.""" 7 | 8 | def test_set(self): 9 | b = BitSet() 10 | self.assertTrue(b.set(0)) 11 | self.assertTrue(b.set(1)) 12 | self.assertTrue(b.set(2)) 13 | self.assertTrue(b.set(3)) 14 | self.assertFalse(b.set(1)) 15 | 16 | def test_dumps_loads(self): 17 | b = BitSet() 18 | self.assertTrue(b.set(0)) 19 | self.assertTrue(b.set(1)) 20 | self.assertTrue(b.set(4)) 21 | self.assertTrue(b.set(8)) 22 | self.assertTrue(b.set(16)) 23 | s = BitSet() 24 | s.loads(b.dumps()) 25 | self.assertEqual(b, s) 26 | 27 | def test_logical_ops(self): 28 | b = BitSet() 29 | b.set(0) 30 | b.set(1) 31 | b.set(4) 32 | b.set(8) 33 | b.set(16) 34 | bb = BitSet() 35 | bb.set(0) 36 | bb.set(1) 37 | bb.set(4) 38 | bb.set(9) 39 | cc = BitSet() 40 | cc.set(0) 41 | cc.set(1) 42 | cc.set(4) 43 | cc.set(8) 44 | cc.set(9) 45 | cc.set(16) 46 | dd = BitSet() 47 | dd.set(0) 48 | dd.set(1) 49 | dd.set(4) 50 | ee = BitSet() 51 | ee.set(2) 52 | ee.set(3) 53 | 54 | la = b & bb 55 | lo = b | bb 56 | ln = ~ dd 57 | ll = ~ ln 58 | self.assertEqual(lo, cc) 59 | self.assertNotEqual(la, dd) 60 | self.assertEqual(list(ln), list(ee)) 61 | self.assertEqual(len(b), 5) 62 | self.assertEqual(len(bb), 4) 63 | self.assertEqual(len(cc), 6) 64 | self.assertEqual(len(dd), 3) 65 | self.assertEqual(len(ee), 2) 66 | self.assertEqual(len(la), 3) 67 | self.assertEqual(len(lo), 6) 68 | self.assertEqual(len(ln), 2) 69 | self.assertEqual(len(ll), 3) 70 | 71 | def test_logical_not(self): 72 | b = BitSet() 73 | b.set(0) 74 | b.set(1) 75 | b.set(8) 76 | b.set(9) 77 | c = ~b 78 | # test that logical not doesn't generate any numbers greater 79 | # than 9 in this case 80 | self.assertEqual(list(c), [2, 3, 4, 5, 6, 7]) 81 | d = ~c 82 | self.assertListEqual(list(d), [0, 1, 8, 9]) 83 | 84 | def test_logical_not_1(self): 85 | b = BitSet() 86 | b.set(0) 87 | b.set(1) 88 | b.set(7) 89 | b.set(8) 90 | c = ~b 91 | # test that logical not doesn't generate any numbers greater 92 | # than 8 in this case 93 | self.assertEqual(list(c), [2, 3, 4, 5, 6]) 94 | d = ~c 95 | self.assertListEqual(list(d), [0, 1, 7, 8]) 96 | 97 | def test_generator(self): 98 | b = BitSet() 99 | b.set(1) 100 | b.set(4) 101 | b.set(10) 102 | b.set(100000) 103 | b.set(12323131) 104 | self.assertEqual(list(b), [1, 4, 10, 100000, 12323131]) 105 | 106 | def test_contains(self): 107 | b = BitSet() 108 | b.set(1) 109 | b.set(4) 110 | b.set(10) 111 | b.set(100000) 112 | b.set(12323131) 113 | for i in [1, 4, 10, 100000, 12323131]: 114 | self.assertTrue(i in b) 115 | for i in [2, 3, 5, 6, 1232312]: 116 | self.assertTrue(i not in b) 117 | 118 | def test_eq_ne(self): 119 | b = BitSet() 120 | b.set(1) 121 | b.set(2) 122 | bb = BitSet() 123 | bb.set(1) 124 | bb.set(2) 125 | cc = BitSet() 126 | cc.set(2) 127 | cc.set(3) 128 | self.assertTrue(b == bb) 129 | self.assertTrue(bb != cc) 130 | -------------------------------------------------------------------------------- /deps/liblmdb/COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyright 2011-2013 Howard Chu, Symas Corp. 2 | All rights reserved.
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted only as authorized by the OpenLDAP 6 | Public License. 7 | 8 | A copy of this license is available in the file LICENSE in the 9 | top-level directory of the distribution or, alternatively, at 10 | <http://www.OpenLDAP.org/license.html>. 11 | 12 | OpenLDAP is a registered trademark of the OpenLDAP Foundation. 13 | 14 | Individual files and/or contributed packages may be copyright by 15 | other parties and/or subject to additional restrictions. 16 | 17 | This work also contains materials derived from public sources. 18 | 19 | Additional information about OpenLDAP can be obtained at 20 | <http://www.OpenLDAP.org/>. 21 | -------------------------------------------------------------------------------- /deps/liblmdb/LICENSE: -------------------------------------------------------------------------------- 1 | The OpenLDAP Public License 2 | Version 2.8, 17 August 2003 3 | 4 | Redistribution and use of this software and associated documentation 5 | ("Software"), with or without modification, are permitted provided 6 | that the following conditions are met: 7 | 8 | 1. Redistributions in source form must retain copyright statements 9 | and notices, 10 | 11 | 2. Redistributions in binary form must reproduce applicable copyright 12 | statements and notices, this list of conditions, and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution, and 15 | 16 | 3. Redistributions must contain a verbatim copy of this document. 17 | 18 | The OpenLDAP Foundation may revise this license from time to time. 19 | Each revision is distinguished by a version number. You may use 20 | this Software under terms of this license revision or under the 21 | terms of any subsequent revision of the license. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE OPENLDAP FOUNDATION AND ITS 24 | CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, 25 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 26 | AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 27 | SHALL THE OPENLDAP FOUNDATION, ITS CONTRIBUTORS, OR THE AUTHOR(S) 28 | OR OWNER(S) OF THE SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT, 29 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 30 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 31 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 32 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 34 | ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 | POSSIBILITY OF SUCH DAMAGE. 36 | 37 | The names of the authors and copyright holders must not be used in 38 | advertising or otherwise to promote the sale, use or other dealing 39 | in this Software without specific, written prior permission. Title 40 | to copyright in this Software shall at all times remain with copyright 41 | holders. 42 | 43 | OpenLDAP is a registered trademark of the OpenLDAP Foundation. 44 | 45 | Copyright 1999-2003 The OpenLDAP Foundation, Redwood City, 46 | California, USA. All Rights Reserved. Permission to copy and 47 | distribute verbatim copies of this document is granted. 48 | -------------------------------------------------------------------------------- /deps/liblmdb/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for liblmdb (Lightning memory-mapped database library).
2 | 3 | ######################################################################## 4 | # Configuration. The compiler options must enable threaded compilation. 5 | # 6 | # Preprocessor macros (for CPPFLAGS) of interest... 7 | # Note that the defaults should already be correct for most 8 | # platforms; you should not need to change any of these. 9 | # Read their descriptions in mdb.c if you do: 10 | # 11 | # - MDB_USE_POSIX_SEM 12 | # - MDB_DSYNC 13 | # - MDB_FDATASYNC 14 | # - MDB_USE_PWRITEV 15 | # 16 | # There may be other macros in mdb.c of interest. You should 17 | # read mdb.c before changing any of them. 18 | # 19 | CC = gcc 20 | W = -W -Wall -Wno-unused-parameter -Wbad-function-cast 21 | THREADS = -pthread 22 | OPT = -O2 -g 23 | CFLAGS = $(THREADS) $(OPT) $(W) $(XCFLAGS) 24 | LDLIBS = 25 | SOLIBS = 26 | prefix = /usr/local 27 | 28 | ######################################################################## 29 | 30 | IHDRS = lmdb.h 31 | ILIBS = liblmdb.a liblmdb.so 32 | IPROGS = mdb_stat mdb_copy 33 | IDOCS = mdb_stat.1 mdb_copy.1 34 | PROGS = $(IPROGS) 35 | all: $(ILIBS) $(PROGS) 36 | 37 | install: $(ILIBS) $(IPROGS) $(IHDRS) 38 | for f in $(IPROGS); do cp $$f $(DESTDIR)$(prefix)/bin; done 39 | for f in $(ILIBS); do cp $$f $(DESTDIR)$(prefix)/lib; done 40 | for f in $(IHDRS); do cp $$f $(DESTDIR)$(prefix)/include; done 41 | 42 | clean: 43 | rm -rf $(PROGS) *.[ao] *.so *~ testdb 44 | 45 | test: all 46 | mkdir testdb 47 | ./mtest && ./mdb_stat testdb 48 | 49 | liblmdb.a: mdb.o midl.o 50 | ar rs $@ mdb.o midl.o 51 | 52 | liblmdb.so: mdb.o midl.o 53 | # $(CC) $(LDFLAGS) -pthread -shared -Wl,-Bsymbolic -o $@ mdb.o midl.o $(SOLIBS) 54 | $(CC) $(LDFLAGS) -pthread -shared -o $@ mdb.o midl.o $(SOLIBS) 55 | 56 | mdb_stat: mdb_stat.o liblmdb.a 57 | mdb_copy: mdb_copy.o liblmdb.a 58 | 59 | mdb.o: mdb.c lmdb.h midl.h 60 | $(CC) $(CFLAGS) -fPIC $(CPPFLAGS) -c mdb.c 61 | 62 | midl.o: midl.c midl.h 63 | $(CC) $(CFLAGS) -fPIC $(CPPFLAGS) -c midl.c 64 | 65 | %: %.o 66 | $(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ 67 | 68 | %.o: %.c lmdb.h 69 | $(CC) $(CFLAGS) $(CPPFLAGS) -c $< 70 | -------------------------------------------------------------------------------- /deps/liblmdb/README.md: -------------------------------------------------------------------------------- 1 | Requires: 2 | ======= 3 | - liblmdb 4 | - Python 2.7 (that is all I have tested with) 5 | - Cython 6 | 7 | Install: 8 | ======= 9 | ``` 10 | $ sudo python setup.py install 11 | ``` 12 | 13 | Usage 14 | ===== 15 | 16 | Using Writer and Reader 17 | ----------------------- 18 | 19 | >>> import mdb 20 | >>> writer = mdb.Writer('/tmp/mdbtest') 21 | >>> writer.put('foo', 'bar') 22 | >>> writer.mput({"key": "value", "egg": "spam"}) 23 | >>> writer.close() 24 | >>> reader = mdb.Reader('/tmp/mdbtest') 25 | >>> reader.get('foo') 26 | >>> for key, value in reader.iteritems(): 27 | ... print key, value 28 | >>> reader.close() 29 | 30 | Using Integer Key 31 | ----------------- 32 | >>> writer = mdb.Writer('/tmp/mdbtest', dup=True, int_key=True) 33 | >>> writer.put(1, 'foo') 34 | >>> writer.put(1, 'bar') # append a duplicate key 35 | >>> writer.close() 36 | >>> reader = mdb.DupReader('/tmp/mdbtest', int_key=True) 37 | >>> for v in reader.get(1): 38 | ...
print v 39 | >>> reader.close() 40 | 41 | Using Low-level Stuff 42 | --------------------- 43 | >>> env = mdb.Env('/tmp/mdbtest') 44 | >>> txn = env.begin_txn() 45 | >>> db = env.open_db(txn) 46 | >>> db.put(txn, 'hi', 'assinine') 47 | >>> txn.commit() 48 | >>> txn = env.begin_txn() 49 | >>> print '"%s"' % db.get(txn, 'hi') # --> assinine 50 | >>> txn.close() 51 | >>> db.close() 52 | >>> env.close() 53 | -------------------------------------------------------------------------------- /deps/liblmdb/cmdb.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from 'lmdb.h': 2 | cdef enum: 3 | # env creation flags 4 | MDB_FIXEDMAP = 0x01 5 | MDB_NOSUBDIR = 0x4000 6 | MDB_NOSYNC = 0x10000 7 | MDB_RDONLY = 0x20000 8 | MDB_NOMETASYNC = 0x40000 9 | MDB_WRITEMAP = 0x80000 10 | MDB_MAPASYNC = 0x100000 11 | MDB_NOTLS = 0x200000 12 | MDB_NOLOCK = 0x400000 13 | MDB_NORDAHEAD = 0x800000 14 | 15 | # db open flags 16 | MDB_REVERSEKEY = 0x02 17 | MDB_DUPSORT = 0x04 18 | MDB_INTEGERKEY = 0x08 19 | MDB_DUPFIXED = 0x10 20 | MDB_INTEGERDUP = 0x20 21 | MDB_REVERSEDUP = 0x40 22 | MDB_CREATE = 0x40000 23 | 24 | # write flags 25 | MDB_NOOVERWRITE = 0x10 26 | MDB_NODUPDATA = 0x20 27 | MDB_CURRENT = 0x40 28 | MDB_RESERVE = 0x10000 29 | MDB_APPEND = 0x20000 30 | MDB_APPENDDUP = 0x40000 31 | MDB_MULTIPLE = 0x80000 32 | 33 | # MDB Return Values 34 | MDB_SUCCESS = 0 35 | MDB_KEYEXIST = -30799 36 | MDB_NOTFOUND = -30798 37 | MDB_PAGE_NOTFOUND = -30797 38 | MDB_CORRUPTED = -30796 39 | MDB_PANIC = -30795 40 | MDB_VERSION_MISMATCH = -30794 41 | MDB_INVALID = -30793 42 | MDB_MAP_FULL = -30792 43 | MDB_DBS_FULL = -30791 44 | MDB_READERS_FULL = -30790 45 | MDB_TLS_FULL = -30789 46 | MDB_TXN_FULL = -30788 47 | MDB_CURSOR_FULL = -30787 48 | MDB_PAGE_FULL = -30786 49 | MDB_MAP_RESIZED = -30785 50 | MDB_INCOMPATIBLE = -30784 51 | MDB_BAD_RSLOT = -30783 52 | MDB_BAD_TXN = -30782 53 | MDB_BAD_VALSIZE = -30781 54 | MDB_LAST_ERRCODE = MDB_BAD_VALSIZE 55 | 56 | cdef enum Cursor_Op: 57 | # cursor operations 58 | MDB_FIRST = 0 59 | MDB_FIRST_DUP = 1 60 | MDB_GET_BOTH = 2 61 | MDB_GET_BOTH_RANGE = 3 62 | MDB_GET_CURRENT = 4 63 | MDB_GET_MULTIPLE = 5 64 | MDB_LAST = 6 65 | MDB_LAST_DUP = 7 66 | MDB_NEXT = 8 67 | MDB_NEXT_DUP = 9 68 | MDB_NEXT_MULTIPLE = 10 69 | MDB_NEXT_NODUP = 11 70 | MDB_PREV = 12 71 | MDB_PREV_DUP = 13 72 | MDB_PREV_NODUP = 14 73 | MDB_SET = 15 74 | MDB_SET_KEY = 16 75 | MDB_SET_RANGE = 17 76 | 77 | ctypedef struct MDB_txn: 78 | pass 79 | 80 | ctypedef struct MDB_env: 81 | pass 82 | 83 | ctypedef struct MDB_cursor: 84 | pass 85 | 86 | ctypedef unsigned int MDB_dbi 87 | 88 | ctypedef struct MDB_val: 89 | size_t mv_size 90 | void *mv_data 91 | 92 | ctypedef struct MDB_stat: 93 | unsigned int ms_psize 94 | unsigned int ms_depth 95 | size_t ms_branch_pages 96 | size_t ms_leaf_pages 97 | size_t ms_overflow_pages 98 | size_t ms_entries 99 | 100 | ctypedef struct MDB_envinfo: 101 | void *me_mapaddr 102 | size_t me_mapsize 103 | size_t me_last_pgno 104 | size_t me_last_txnid 105 | unsigned int me_maxreaders 106 | unsigned int me_numreaders 107 | 108 | ctypedef int (*MDB_cmp_func)(const MDB_val *a, const MDB_val *b) 109 | 110 | char *mdb_strerror(int err) 111 | int mdb_env_create(MDB_env **env) 112 | int mdb_env_open(MDB_env *env, char *path, unsigned int flags, unsigned int mode) 113 | int mdb_env_copy(MDB_env *env, char *path) 114 | int mdb_env_stat(MDB_env *env, MDB_stat *stat) 115 | int mdb_env_info(MDB_env *env, MDB_envinfo *stat) 116 | int mdb_env_sync(MDB_env *env, int force) 
117 | int mdb_env_set_flags(MDB_env *env, unsigned int flags, int onoff) 118 | int mdb_env_get_flags(MDB_env *env, unsigned int *flags) 119 | int mdb_env_get_path(MDB_env *env, char **path) 120 | int mdb_env_set_mapsize(MDB_env *env, size_t size) 121 | int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) 122 | int mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) 123 | void mdb_env_close(MDB_env *env) 124 | 125 | int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **txn) 126 | int mdb_txn_commit(MDB_txn *txn) 127 | void mdb_txn_abort(MDB_txn *txn) 128 | void mdb_txn_reset(MDB_txn *txn) 129 | int mdb_txn_renew(MDB_txn *txn) 130 | 131 | int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func cmp) 132 | int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func cmp) 133 | int mdb_dbi_open(MDB_txn *txn, char *name, unsigned int flags, MDB_dbi *dbi) 134 | void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) 135 | int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat) 136 | int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int delete) 137 | int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) 138 | int mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned int flags) 139 | int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) 140 | 141 | int mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor) 142 | void mdb_cursor_close(MDB_cursor *cursor) 143 | int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *cursor) 144 | int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, unsigned int op) 145 | int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, unsigned int flags) 146 | int mdb_cursor_del(MDB_cursor *cursor, unsigned int flags) 147 | int mdb_cursor_count(MDB_cursor *cursor, size_t *countp) 148 | -------------------------------------------------------------------------------- /deps/liblmdb/mdb_copy.1: -------------------------------------------------------------------------------- 1 | .TH MDB_COPY 1 "2012/12/12" "LMDB 0.9.5" 2 | .\" Copyright 2012 Howard Chu, Symas Corp. All Rights Reserved. 3 | .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 4 | .SH NAME 5 | mdb_copy \- LMDB environment copy tool 6 | .SH SYNOPSIS 7 | .B mdb_copy 8 | .I srcpath\ [dstpath] 9 | .SH DESCRIPTION 10 | The 11 | .B mdb_copy 12 | utility copies an LMDB environment. The environment can 13 | be copied regardless of whether it is currently in use. 14 | No lockfile is created, since it gets recreated at need. 15 | 16 | If 17 | .I dstpath 18 | is specified it must be the path of an empty directory 19 | for storing the backup. Otherwise, the backup will be 20 | written to stdout. 21 | 22 | .SH DIAGNOSTICS 23 | Exit status is zero if no errors occur. 24 | Errors result in a non-zero exit status and 25 | a diagnostic message being written to standard error. 26 | .SH CAVEATS 27 | This utility can trigger significant file size growth if run 28 | in parallel with write transactions, because pages which they 29 | free during copying cannot be reused until the copy is done. 30 | .SH "SEE ALSO" 31 | .BR mdb_stat (1) 32 | .SH AUTHOR 33 | Howard Chu of Symas Corporation 34 | -------------------------------------------------------------------------------- /deps/liblmdb/mdb_copy.c: -------------------------------------------------------------------------------- 1 | /* mdb_copy.c - memory-mapped database backup tool */ 2 | /* 3 | * Copyright 2012 Howard Chu, Symas Corp. 4 | * All rights reserved. 
5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted only as authorized by the OpenLDAP 8 | * Public License. 9 | * 10 | * A copy of this license is available in the file LICENSE in the 11 | * top-level directory of the distribution or, alternatively, at 12 | * <http://www.OpenLDAP.org/license.html>. 13 | */ 14 | #ifdef _WIN32 15 | #include <windows.h> 16 | #define MDB_STDOUT GetStdHandle(STD_OUTPUT_HANDLE) 17 | #else 18 | #define MDB_STDOUT 1 19 | #endif 20 | #include <stdio.h> 21 | #include <stdlib.h> 22 | #include <signal.h> 23 | #include "lmdb.h" 24 | 25 | static void 26 | sighandle(int sig) 27 | { 28 | } 29 | 30 | int main(int argc,char * argv[]) 31 | { 32 | int rc; 33 | MDB_env *env; 34 | char *envname = argv[1]; 35 | 36 | if (argc<2 || argc>3) { 37 | fprintf(stderr, "usage: %s srcpath [dstpath]\n", argv[0]); 38 | exit(EXIT_FAILURE); 39 | } 40 | 41 | #ifdef SIGPIPE 42 | signal(SIGPIPE, sighandle); 43 | #endif 44 | #ifdef SIGHUP 45 | signal(SIGHUP, sighandle); 46 | #endif 47 | signal(SIGINT, sighandle); 48 | signal(SIGTERM, sighandle); 49 | 50 | rc = mdb_env_create(&env); 51 | 52 | rc = mdb_env_open(env, envname, MDB_RDONLY, 0); 53 | if (rc) { 54 | printf("mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); 55 | } else { 56 | if (argc == 2) 57 | rc = mdb_env_copyfd(env, MDB_STDOUT); 58 | else 59 | rc = mdb_env_copy(env, argv[2]); 60 | if (rc) 61 | printf("mdb_env_copy failed, error %d %s\n", rc, mdb_strerror(rc)); 62 | } 63 | mdb_env_close(env); 64 | 65 | return rc ? EXIT_FAILURE : EXIT_SUCCESS; 66 | } 67 | -------------------------------------------------------------------------------- /deps/liblmdb/mdb_stat.1: -------------------------------------------------------------------------------- 1 | .TH MDB_STAT 1 "2012/12/12" "LMDB 0.9.5" 2 | .\" Copyright 2012 Howard Chu, Symas Corp. All Rights Reserved. 3 | .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 4 | .SH NAME 5 | mdb_stat \- LMDB environment status tool 6 | .SH SYNOPSIS 7 | .B mdb_stat 8 | .BR \ envpath 9 | [\c 10 | .BR \-e ] 11 | [\c 12 | .BR \-f [ f [ f ]]] 13 | [\c 14 | .BR \-n ] 15 | [\c 16 | .BR \-r [ r ]] 17 | [\c 18 | .BR \-a \ | 19 | .BI \-s \ subdb\fR] 20 | .SH DESCRIPTION 21 | The 22 | .B mdb_stat 23 | utility displays the status of an LMDB environment. 24 | .SH OPTIONS 25 | .TP 26 | .BR \-e 27 | Display information about the database environment. 28 | .TP 29 | .BR \-f 30 | Display information about the environment freelist. 31 | If \fB\-ff\fP is given, summarize each freelist entry. 32 | If \fB\-fff\fP is given, display the full list of page IDs in the freelist. 33 | .TP 34 | .BR \-n 35 | Display the status of an LMDB database which does not use subdirectories. 36 | .TP 37 | .BR \-r 38 | Display information about the environment reader table. 39 | Shows the process ID, thread ID, and transaction ID for each active 40 | reader slot. The process ID and transaction ID are in decimal, the 41 | thread ID is in hexadecimal. The transaction ID is displayed as "-" 42 | if the reader does not currently have a read transaction open. 43 | If \fB\-rr\fP is given, check for stale entries in the reader 44 | table and clear them. The reader table will be printed again 45 | after the check is performed. 46 | .TP 47 | .BR \-a 48 | Display the status of all of the subdatabases in the environment. 49 | .TP 50 | .BR \-s \ subdb 51 | Display the status of a specific subdatabase. 52 | .SH DIAGNOSTICS 53 | Exit status is zero if no errors occur.
54 | Errors result in a non-zero exit status and 55 | a diagnostic message being written to standard error. 56 | .SH "SEE ALSO" 57 | .BR mdb_copy (1) 58 | .SH AUTHOR 59 | Howard Chu of Symas Corporation 60 | -------------------------------------------------------------------------------- /deps/liblmdb/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | from Cython.Distutils import build_ext 4 | 5 | __version__ = '0.4.0' 6 | 7 | setup( 8 | name = "mdb", 9 | version = __version__, 10 | description = 'Python client for MDB-Lightning', 11 | cmdclass = {'build_ext': build_ext}, 12 | author = 'Chango Inc.', 13 | keywords=['mdb-lightning', 'mdb', 'lmdb', 'key-value store'], 14 | license='MIT', 15 | ext_modules = [Extension("mdb", ["db.pyx", ], 16 | libraries=["lmdb"], 17 | library_dirs=["/usr/local/lib"], 18 | include_dirs=["/usr/local/include"], 19 | runtime_library_dirs=["/usr/local/lib"])] 20 | ) 21 | -------------------------------------------------------------------------------- /deps/liblmdb/test/test_cursor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import mdb 3 | from unittest import TestCase 4 | 5 | 6 | class TestCursor(TestCase): 7 | 8 | def setUp(self): 9 | import os 10 | import errno 11 | self.path = './testdbm' 12 | try: 13 | os.makedirs(self.path) 14 | except OSError as e: 15 | if e.errno == errno.EEXIST and os.path.isdir(self.path): 16 | pass 17 | else: 18 | raise 19 | self.env = mdb.Env(self.path, max_dbs=8) 20 | self.txn = self.env.begin_txn() 21 | self.db = self.env.open_db(self.txn, 'test_cursor') 22 | self.db.drop(self.txn, 0) 23 | self.txn.commit() 24 | self.txn = self.env.begin_txn() 25 | 26 | def tearDown(self): 27 | import shutil 28 | self.txn.commit() 29 | self.db.close() 30 | self.env.close() 31 | shutil.rmtree(self.path) 32 | 33 | def test_put(self): 34 | # all keys must be sorted 35 | cursor = mdb.Cursor(self.txn, self.db) 36 | cursor.put('foo', 'bar', mdb.MDB_APPENDDUP) 37 | self.assertEqual(cursor.get('foo'), ('foo', 'bar')) 38 | 39 | def test_put_unicode(self): 40 | # all keys must be sorted 41 | cursor = mdb.Cursor(self.txn, self.db) 42 | cursor.put('fΩo', 'b∑r', mdb.MDB_APPENDDUP) 43 | self.assertEqual(cursor.get('fΩo'), ('fΩo', 'b∑r')) 44 | 45 | def test_put_duplicate(self): 46 | # all values must be sorted as well 47 | cursor = mdb.Cursor(self.txn, self.db) 48 | cursor.put('foo', 'bar', mdb.MDB_APPENDDUP) 49 | cursor.put('foo', 'bar1', mdb.MDB_APPENDDUP) 50 | self.assertEqual(cursor.count_dups(), 2) 51 | self.assertEqual(cursor.get('foo'), ('foo', 'bar')) 52 | while 1: 53 | key, value = cursor.get(op=mdb.MDB_NEXT_DUP) 54 | if not key: 55 | break 56 | self.assertEqual((key, value), ('foo', 'bar1')) 57 | 58 | def test_delete_by_key(self): 59 | cursor = mdb.Cursor(self.txn, self.db) 60 | cursor.put('delete', 'done', mdb.MDB_APPENDDUP) 61 | cursor.put('delete', 'done1', mdb.MDB_APPENDDUP) 62 | key, value = cursor.get('delete') 63 | cursor.delete(mdb.MDB_NODUPDATA) 64 | self.assertEqual(cursor.get('delete'), (None, None)) 65 | 66 | def test_delete_by_key_value(self): 67 | cursor = mdb.Cursor(self.txn, self.db) 68 | cursor.put('delete', 'done', mdb.MDB_APPENDDUP) 69 | cursor.put('delete', 'done1', mdb.MDB_APPENDDUP) 70 | key, value = cursor.get('delete') 71 | cursor.delete() 72 | self.assertEqual(cursor.get('delete'), ('delete', 'done1')) 73 | 74 | def
test_delete_by_key_value_1(self): 75 | cursor = mdb.Cursor(self.txn, self.db) 76 | cursor.put('delete', 'done', mdb.MDB_APPENDDUP) 77 | cursor.put('delete', 'done1', mdb.MDB_APPENDDUP) 78 | cursor.put('delete', 'done2', mdb.MDB_APPENDDUP) 79 | key, value = cursor.get('delete', 'done2', op=mdb.MDB_NEXT_DUP) 80 | cursor.delete() 81 | self.assertEqual(cursor.get('delete'), ('delete', 'done')) 82 | -------------------------------------------------------------------------------- /deps/liblmdb/test/test_greater_failure.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import mdb 3 | from unittest import TestCase 4 | 5 | 6 | class TestGreaterFailure(TestCase): 7 | 8 | def setUp(self): 9 | import os 10 | import errno 11 | self.path = './testdbmi' 12 | try: 13 | os.makedirs(self.path) 14 | except OSError as e: 15 | if e.errno == errno.EEXIST and os.path.isdir(self.path): 16 | pass 17 | else: 18 | raise 19 | self.env = mdb.Env(self.path, mapsize=1 * mdb.MB, max_dbs=8) 20 | 21 | def tearDown(self): 22 | import shutil 23 | self.env.close() 24 | shutil.rmtree(self.path) 25 | 26 | def drop_mdb(self): 27 | txn = self.env.begin_txn() 28 | db = self.env.open_db(txn, 'test_db', 29 | flags=mdb.MDB_CREATE|mdb.MDB_DUPSORT|mdb.MDB_INTEGERKEY, 30 | key_inttype=mdb.MDB_INT_32) 31 | db.drop(txn, 0) 32 | txn.commit() 33 | db.close() 34 | 35 | def test_intstr_greater_failure(self): 36 | # all keys must be sorted 37 | txn = self.env.begin_txn() 38 | db = self.env.open_db(txn, 'test_db', 39 | flags=mdb.MDB_CREATE|mdb.MDB_INTEGERKEY|mdb.MDB_DUPSORT, 40 | key_inttype=mdb.MDB_INT_32) 41 | db.put(txn, 184504, 'bar1') 42 | db.put(txn, 184031, 'bar2') 43 | db.put(txn, 145248, 'bar3') 44 | db.put(txn, 84131, 'bar4') 45 | db.put(txn, 3869, 'bar5') 46 | db.put(txn, 124034, 'bar6') 47 | db.put(txn, 90752, 'bar7') 48 | db.put(txn, 48288, 'bar8') 49 | db.put(txn, 97573, 'bar9') 50 | db.put(txn, 18455, 'bar0') 51 | 52 | txn.commit() 53 | txn = self.env.begin_txn() 54 | res = list(db.get_gt(txn, 50000)) 55 | self.assertEqual(len(res), 7) 56 | res = list(db.get_gt(txn, 84131)) 57 | self.assertEqual(len(res), 6) 58 | res = list(db.get_ge(txn, 84131)) 59 | self.assertEqual(len(res), 7) 60 | txn.commit() 61 | db.close() 62 | 63 | def test_intint_greater_failure(self): 64 | # all keys must be sorted 65 | txn = self.env.begin_txn() 66 | db = self.env.open_db(txn, 'test_db', 67 | flags=mdb.MDB_CREATE|mdb.MDB_INTEGERKEY|mdb.MDB_DUPSORT|mdb.MDB_INTEGERDUP, 68 | key_inttype=mdb.MDB_INT_32) 69 | db.put(txn, 184504, 1) 70 | db.put(txn, 184031, 2) 71 | db.put(txn, 145248, 3) 72 | db.put(txn, 84131, 4) 73 | db.put(txn, 3869, 5) 74 | db.put(txn, 124034, 6) 75 | db.put(txn, 90752, 7) 76 | db.put(txn, 48288, 8) 77 | db.put(txn, 97573, 9) 78 | db.put(txn, 18455, 0) 79 | 80 | txn.commit() 81 | txn = self.env.begin_txn() 82 | res = list(db.get_gt(txn, 50000)) 83 | self.assertEqual(len(res), 7) 84 | res = list(db.get_gt(txn, 84131)) 85 | self.assertEqual(len(res), 6) 86 | res = list(db.get_ge(txn, 84131)) 87 | self.assertEqual(len(res), 7) 88 | txn.commit() 89 | db.close() 90 | 91 | def test_strstr_greater_failure(self): 92 | # all keys must be sorted 93 | txn = self.env.begin_txn() 94 | db = self.env.open_db(txn, 'test_db', 95 | flags=mdb.MDB_CREATE) 96 | db.put(txn, 'holy', 'bar1') 97 | db.put(txn, 'smolly', 'bar2') 98 | db.put(txn, 'abacus', 'bar3') 99 | db.put(txn, 'dreadlock', 'bar4') 100 | db.put(txn, 'inno', 'bar5') 101 | db.put(txn, 'db', 'bar6') 102 | db.put(txn, 'idiotic', 'bar7') 103 |
db.put(txn, 'idioms', 'bar8') 104 | 105 | txn.commit() 106 | txn = self.env.begin_txn() 107 | res = list(db.get_gt(txn, 'grover')) 108 | self.assertEqual(len(res), 5) 109 | res = list(db.get_gt(txn, 'db')) 110 | self.assertEqual(len(res), 6) 111 | res = list(db.get_ge(txn, 'db')) 112 | self.assertEqual(len(res), 7) 113 | txn.commit() 114 | db.close() 115 | 116 | def test_strint_greater_failure(self): 117 | # all keys must be sorted 118 | txn = self.env.begin_txn() 119 | db = self.env.open_db(txn, 'test_db', 120 | flags=mdb.MDB_CREATE|mdb.MDB_INTEGERDUP) 121 | db.put(txn, 'holy', 1) 122 | db.put(txn, 'smolly', 2) 123 | db.put(txn, 'abacus', 3) 124 | db.put(txn, 'dreadlock', 4) 125 | db.put(txn, 'inno', 5) 126 | db.put(txn, 'db', 6) 127 | db.put(txn, 'idiotic', 7) 128 | db.put(txn, 'idioms', 8) 129 | 130 | txn.commit() 131 | txn = self.env.begin_txn() 132 | res = list(db.get_gt(txn, 'grover')) 133 | self.assertEqual(len(res), 5) 134 | res = list(db.get_gt(txn, 'db')) 135 | self.assertEqual(len(res), 6) 136 | res = list(db.get_ge(txn, 'db')) 137 | self.assertEqual(len(res), 7) 138 | txn.commit() 139 | db.close() 140 | 141 | -------------------------------------------------------------------------------- /deps/liblmdb/test/test_intreader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | from mdb import Writer, Reader, DupReader 4 | 5 | 6 | class TestReaderWriter(TestCase): 7 | def setUp(self): 8 | pass 9 | 10 | def tearDown(self): 11 | import shutil 12 | try: 13 | shutil.rmtree('./test_rw') 14 | except OSError: 15 | pass 16 | try: 17 | shutil.rmtree('./test_rw_dup') 18 | except OSError: 19 | pass 20 | 21 | def test_reader_and_writer(self): 22 | writer = Writer('./test_rw', int_key=True) 23 | writer.drop() 24 | writer.put(1234, 'bar') 25 | writer.put(5678, 'spam') 26 | reader = Reader('./test_rw', int_key=True) 27 | value = reader.get(1234) 28 | self.assertEqual(value, 'bar') 29 | value = reader.get(5678) 30 | self.assertEqual(value, 'spam') 31 | 32 | def test_dup_reader_and_writer(self): 33 | def key_value_gen(): 34 | for i in range(3): 35 | yield 789, "value%d" % (i * i) 36 | writer = Writer('./test_rw_dup', int_key=True, dup=True) 37 | writer.drop() 38 | writer.put(123, 'bar') 39 | writer.put(456, 'spam') 40 | writer.mput({123: "bar1", 456: "spam1"}) 41 | writer.mput(key_value_gen()) 42 | reader = DupReader('./test_rw_dup', int_key=True) 43 | values = reader.get(123) 44 | self.assertEqual(list(values), ['bar', 'bar1']) 45 | values = reader.get(456) 46 | self.assertEqual(list(values), ['spam', 'spam1']) 47 | values = reader.get(789) 48 | self.assertEqual(list(values), ['value0', 'value1', 'value4']) 49 | -------------------------------------------------------------------------------- /deps/liblmdb/test/test_reader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from unittest import TestCase 3 | from mdb import Writer, Reader, DupReader 4 | from ujson import dumps, loads 5 | 6 | 7 | class TestReaderWriter(TestCase): 8 | def setUp(self): 9 | pass 10 | 11 | def tearDown(self): 12 | import shutil 13 | try: 14 | shutil.rmtree('./test_rw') 15 | except OSError: 16 | pass 17 | try: 18 | shutil.rmtree('./test_rw_dup') 19 | except OSError: 20 | pass 21 | 22 | def test_reader_and_writer(self): 23 | writer = Writer('./test_rw', encode_fn=dumps) 24 | writer.drop() 25 | writer.put('foo', 'bar') 26 | writer.put('egg', 'spam') 27 | reader =
Reader('./test_rw', decode_fn=loads) 28 | value = reader.get('foo') 29 | self.assertEqual(value, 'bar') 30 | value = reader.get('egg') 31 | self.assertEqual(value, 'spam') 32 | 33 | def test_dup_reader_and_writer(self): 34 | def key_value_gen(): 35 | for i in range(3): 36 | yield 'fixed', "value%d" % (i * i) 37 | writer = Writer('./test_rw_dup', dup=True, 38 | encode_fn=dumps) 39 | writer.drop() 40 | writer.put('foo', 'bar') 41 | writer.put('egg', 'spam') 42 | writer.mput({"foo": "bar1", "egg": "spam1"}) 43 | writer.mput(key_value_gen()) 44 | reader = DupReader('./test_rw_dup', 45 | decode_fn=loads) 46 | values = reader.get('foo') 47 | self.assertEqual(list(values), ['bar', 'bar1']) 48 | values = reader.get('egg') 49 | self.assertEqual(list(values), ['spam', 'spam1']) 50 | values = reader.get('fixed') 51 | self.assertEqual(list(values), ['value0', 'value1', 'value4']) 52 | -------------------------------------------------------------------------------- /deps/liblru/clru.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013, Chango Inc. 3 | * 4 | * Based on the 'LRU cache implementation in C++' article by Tim Day. 5 | * Copyright (c) 2010-2011, Tim Day 6 | * 7 | * Permission to use, copy, modify, and/or distribute this software for any 8 | * purpose with or without fee is hereby granted, provided that the above 9 | * copyright notice and this permission notice appear in all copies. 10 | * 11 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 12 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 14 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
18 | */ 19 | 20 | #ifndef _CLRU_ 21 | #define _CLRU_ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | struct cmp_str { 30 | bool operator()(const char *a, const char *b) const { 31 | return strcmp(a, b) < 0; 32 | } 33 | }; 34 | 35 | typedef void (*CharFunc)(char *, void *); 36 | 37 | class CharLRU 38 | { 39 | public: 40 | typedef std::list CharList; 41 | typedef std::map Map; 42 | CharLRU(CharFunc fetch, CharFunc evict, size_t c, void *cookie) 43 | :_fetch(fetch), _evict(evict), _capacity(c), _cookie(cookie) 44 | { 45 | assert(_capacity != 0); 46 | } 47 | 48 | void evictall() { 49 | unsigned long count = charlist.size(); 50 | for (unsigned long i = 0; i < count; i++) { 51 | evict(); 52 | } 53 | } 54 | 55 | void set(char *k) { 56 | touch(k, false); 57 | } 58 | 59 | void get(char *k) { 60 | touch(k, true); 61 | } 62 | private: 63 | void touch(char *_k, bool shouldFetch) { 64 | const Map::iterator it =locationMap.find(_k); 65 | 66 | if (it == locationMap.end()) { 67 | char *k = strdup(_k); 68 | assert(k != NULL); 69 | assert(strlen(_k) == strlen(k)); 70 | if (shouldFetch) { 71 | _fetch(k, _cookie); 72 | } 73 | insert(k); 74 | } else { 75 | charlist.splice(charlist.end(), charlist, it->second); 76 | } 77 | } 78 | 79 | void insert(char *k) { 80 | if (locationMap.size()==_capacity) 81 | evict(); 82 | CharList::iterator it =charlist.insert(charlist.end(), k); 83 | locationMap.insert(std::make_pair(k, it)); 84 | } 85 | 86 | void evict() { 87 | assert(!charlist.empty()); 88 | 89 | const Map::iterator it =locationMap.find(charlist.front()); 90 | assert(it != locationMap.end()); 91 | 92 | char *k = it->first; 93 | assert(k != NULL); 94 | _evict(k, _cookie); 95 | locationMap.erase(it); 96 | charlist.pop_front(); 97 | free(k); 98 | } 99 | 100 | CharFunc _fetch, _evict; 101 | const size_t _capacity; 102 | void *_cookie; 103 | 104 | CharList charlist; 105 | Map locationMap; 106 | }; 107 | 108 | typedef void (*IntFunc)(long, void *); 109 | class IntLRU 110 | { 111 | public: 112 | typedef std::list IntList; 113 | typedef std::map Map; 114 | IntLRU(IntFunc fetch, IntFunc evict, size_t c, void *cookie) 115 | :_fetch(fetch), _evict(evict), _capacity(c), _cookie(cookie) 116 | { 117 | assert(_capacity != 0); 118 | } 119 | 120 | void evictall() { 121 | unsigned long count = longlist.size(); 122 | for (unsigned long i = 0; i < count; i++) { 123 | evict(); 124 | } 125 | } 126 | 127 | void set(long k) { 128 | touch(k, false); 129 | } 130 | 131 | void get(long k) { 132 | touch(k, true); 133 | } 134 | private: 135 | void touch(long k, bool shouldFetch) { 136 | const Map::iterator it = locationMap.find(k); 137 | 138 | if (it == locationMap.end()) { 139 | if (shouldFetch) { 140 | _fetch(k, _cookie); 141 | } 142 | insert(k); 143 | } else { 144 | longlist.splice(longlist.end(), longlist, it->second); 145 | } 146 | } 147 | 148 | void insert(long k) { 149 | if (locationMap.size()==_capacity) 150 | evict(); 151 | IntList::iterator it =longlist.insert(longlist.end(), k); 152 | locationMap.insert(std::make_pair(k, it)); 153 | } 154 | 155 | void evict() { 156 | assert(!longlist.empty()); 157 | 158 | const Map::iterator it =locationMap.find(longlist.front()); 159 | assert(it != locationMap.end()); 160 | 161 | long k = it->first; 162 | _evict(k, _cookie); 163 | locationMap.erase(it); 164 | longlist.pop_front(); 165 | } 166 | 167 | IntFunc _fetch, _evict; 168 | const size_t _capacity; 169 | void *_cookie; 170 | 171 | IntList longlist; 172 | Map locationMap; 173 | }; 174 | #endif 175 | 
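The Cython wrapper that follows (pylru.pyx) exposes this LRU to Python as CharLRUDict/IntLRUDict, usually obtained via the LRUDict.getDict() factory. A minimal usage sketch, assuming the extension is built as `pylru` per deps/setup.py; the backing store and its loader/writer callbacks here are hypothetical stand-ins:

    from pylru import LRUDict

    backing = {}                  # hypothetical slow store the cache fronts

    def fetch(key):               # called by get() on a cache miss
        return backing.get(key)

    def evict(key, value):        # called with (key, value) when an entry is pushed out
        backing[key] = value

    cache = LRUDict.getDict(max_size=2, fetch=fetch, evict=evict)
    cache['a'] = 1
    cache['b'] = 2
    cache['c'] = 3                # capacity reached: evict('a', 1) fires before 'c' is inserted
    cache.evictAll()              # flush the remaining entries back to `backing`
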
-------------------------------------------------------------------------------- /deps/liblru/pylru.pyx: -------------------------------------------------------------------------------- 1 | import sys 2 | import traceback 3 | from collections import defaultdict 4 | 5 | cdef extern from "clru.h": 6 | ctypedef void (*CharFetch)(char *, void *) 7 | ctypedef void (*CharEvict)(char *, void *) 8 | 9 | cdef cppclass CharLRU: 10 | CharLRU(CharFetch f, CharEvict e, size_t size, void *self) 11 | void get(char *k) 12 | void set(char *k) 13 | void evictall() 14 | 15 | cdef void _charfetch(char *key, void *_self): 16 | self = _self 17 | try: 18 | res = self._Fetch(key) 19 | if res is not None: 20 | self.kv[key] = res 21 | except Exception, e: 22 | print >> sys.stderr, traceback.format_exc() 23 | print >> sys.stderr, "Exception: %s" % str(e) 24 | 25 | cdef void _charevict(char *key, void *_self): 26 | self = _self 27 | try: 28 | try: 29 | val = self.kv[key] 30 | except KeyError: 31 | # key has no value 32 | return 33 | if val is None: 34 | print key, self.kv 35 | 36 | self._Evict(key, val) 37 | except Exception, e: 38 | print >> sys.stderr, traceback.format_exc() 39 | print >> sys.stderr, "Exception: %s" % str(e) 40 | del self.kv[key] 41 | 42 | cdef class CharLRUDict(object): 43 | cdef CharLRU *store 44 | cdef dict kv 45 | cdef object _Fetch, _Evict 46 | 47 | property kv: 48 | def __get__(self): 49 | return self.kv 50 | 51 | property _Fetch: 52 | def __get__(self): 53 | return self._Fetch 54 | 55 | property _Evict: 56 | def __get__(self): 57 | return self._Evict 58 | 59 | def __cinit__(self, max_size=None, fetch=None, evict=None, factory=object): 60 | self.kv = defaultdict(factory) 61 | self._Fetch = fetch 62 | self._Evict = evict 63 | self.store = new CharLRU(_charfetch, _charevict, max_size, self) 64 | if self.store is NULL: 65 | raise MemoryError() 66 | 67 | def __dealloc__(self): 68 | if self.store is not NULL: 69 | del self.store 70 | 71 | def set(self, char *key, value): 72 | self.kv[key] = value 73 | self.store.set(key) 74 | 75 | def __setitem__(self, char *key, value): 76 | self.set(key, value) 77 | 78 | def get(self, char *key): 79 | self.store.get(key) 80 | return self.kv.get(key, None) 81 | 82 | def __getitem__(self, char *key): 83 | self.store.get(key) 84 | return self.kv[key] 85 | 86 | def evictAll(self): 87 | self.store.evictall() 88 | 89 | def _getContents(self): 90 | from copy import copy 91 | return copy(self.kv) 92 | 93 | 94 | cdef extern from "clru.h": 95 | ctypedef void (*IntFetch)(long, void *) 96 | ctypedef void (*IntEvict)(long, void *) 97 | 98 | cdef cppclass IntLRU: 99 | IntLRU(IntFetch f, IntEvict e, size_t size, void *self) 100 | void get(long k) 101 | void set(long k) 102 | void evictall() 103 | 104 | cdef void _intfetch(long key, void *_self): 105 | self = _self 106 | try: 107 | res = self._Fetch(key) 108 | if res is not None: 109 | self.kv[key] = res 110 | except Exception, e: 111 | print >> sys.stderr, traceback.format_exc() 112 | print >> sys.stderr, "Exception: %s" % str(e) 113 | 114 | cdef void _intevict(long key, void *_self): 115 | self = _self 116 | try: 117 | try: 118 | val = self.kv[key] 119 | except KeyError: 120 | # key has no value 121 | return 122 | if val is None: 123 | print key, self.kv 124 | print >> sys.stderr, key, self.kv 125 | self._Evict(key, val) 126 | except Exception, e: 127 | print >> sys.stderr, traceback.format_exc() 128 | print >> sys.stderr, "Exception: %s" % str(e) 129 | del self.kv[key] 130 | 131 | cdef class IntLRUDict(object): 132 | cdef 
IntLRU *store 133 | cdef dict kv 134 | cdef object _Fetch, _Evict 135 | 136 | property kv: 137 | def __get__(self): 138 | return self.kv 139 | 140 | property _Fetch: 141 | def __get__(self): 142 | return self._Fetch 143 | 144 | property _Evict: 145 | def __get__(self): 146 | return self._Evict 147 | 148 | 149 | def __cinit__(self, size_t max_size, object fetch=None, object evict=None, factory=object): 150 | self.kv = defaultdict(factory) 151 | self._Fetch = fetch 152 | self._Evict = evict 153 | self.store = new IntLRU(_intfetch, _intevict, max_size, self) 154 | if self.store is NULL: 155 | raise MemoryError() 156 | 157 | def __dealloc__(self): 158 | if self.store is not NULL: 159 | del self.store 160 | 161 | def set(self, long key, value): 162 | self.kv[key] = value 163 | self.store.set(key) 164 | 165 | def __setitem__(self, key, value): 166 | self.set(key, value) 167 | 168 | def get(self, long key): 169 | self.store.get(key) 170 | return self.kv.get(key, None) 171 | 172 | def __getitem__(self, long key): 173 | self.store.get(key) 174 | return self.kv[key] 175 | 176 | def evictAll(self): 177 | self.store.evictall() 178 | 179 | def _getContents(self): 180 | from copy import copy 181 | return copy(self.kv) 182 | 183 | cdef class LRUDict(object): 184 | @classmethod 185 | def getDict(cls, max_size=None, fetch=None, evict=None, isInt=False, factory=object): 186 | if isInt: 187 | return IntLRUDict(max_size, fetch, evict, factory) 188 | else: 189 | return CharLRUDict(max_size, fetch, evict, factory) 190 | -------------------------------------------------------------------------------- /deps/liblz4/lz4hc.h: -------------------------------------------------------------------------------- 1 | /* 2 | LZ4 HC - High Compression Mode of LZ4 3 | Header File 4 | Copyright (C) 2011-2013, Yann Collet. 5 | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are 9 | met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of conditions and the following disclaimer 15 | in the documentation and/or other materials provided with the 16 | distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | You can contact the author at : 31 | - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html 32 | - LZ4 source repository : http://code.google.com/p/lz4/ 33 | */ 34 | #pragma once 35 | 36 | 37 | #if defined (__cplusplus) 38 | extern "C" { 39 | #endif 40 | 41 | 42 | int LZ4_compressHC (const char* source, char* dest, int inputSize); 43 | /* 44 | LZ4_compressHC : 45 | return : the number of bytes in compressed buffer dest 46 | or 0 if compression fails. 47 | note : destination buffer must be already allocated. 48 | To avoid any problem, size it to handle worst cases situations (input data not compressible) 49 | Worst case size evaluation is provided by function LZ4_compressBound() (see "lz4.h") 50 | */ 51 | 52 | int LZ4_compressHC_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); 53 | /* 54 | LZ4_compress_limitedOutput() : 55 | Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. 56 | If it cannot achieve it, compression will stop, and result of the function will be zero. 57 | This function never writes outside of provided output buffer. 58 | 59 | inputSize : Max supported value is 1 GB 60 | maxOutputSize : is maximum allowed size into the destination buffer (which must be already allocated) 61 | return : the number of output bytes written in buffer 'dest' 62 | or 0 if compression fails. 63 | */ 64 | 65 | 66 | /* Note : 67 | Decompression functions are provided within LZ4 source code (see "lz4.h") (BSD license) 68 | */ 69 | 70 | 71 | /* Advanced Functions */ 72 | 73 | void* LZ4_createHC (const char* inputBuffer); 74 | int LZ4_compressHC_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize); 75 | int LZ4_compressHC_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize); 76 | char* LZ4_slideInputBufferHC (void* LZ4HC_Data); 77 | int LZ4_freeHC (void* LZ4HC_Data); 78 | 79 | /* 80 | These functions allow the compression of dependent blocks, where each block benefits from prior 64 KB within preceding blocks. 81 | In order to achieve this, it is necessary to start creating the LZ4HC Data Structure, thanks to the function : 82 | 83 | void* LZ4_createHC (const char* inputBuffer); 84 | The result of the function is the (void*) pointer on the LZ4HC Data Structure. 85 | This pointer will be needed in all other functions. 86 | If the pointer returned is NULL, then the allocation has failed, and compression must be aborted. 87 | The only parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer. 88 | The input buffer must be already allocated, and size at least 192KB. 89 | 'inputBuffer' will also be the 'const char* source' of the first block. 90 | 91 | All blocks are expected to lay next to each other within the input buffer, starting from 'inputBuffer'. 92 | To compress each block, use either LZ4_compressHC_continue() or LZ4_compressHC_limitedOutput_continue(). 93 | Their behavior are identical to LZ4_compressHC() or LZ4_compressHC_limitedOutput(), 94 | but require the LZ4HC Data Structure as their first argument, and check that each block starts right after the previous one. 95 | If next block does not begin immediately after the previous one, the compression will fail (return 0). 96 | 97 | When it's no longer possible to lay the next block after the previous one (not enough space left into input buffer), a call to : 98 | char* LZ4_slideInputBufferHC(void* LZ4HC_Data); 99 | must be performed. 
It will typically copy the latest 64KB of input at the beginning of input buffer. 100 | Note that, for this function to work properly, minimum size of an input buffer must be 192KB. 101 | ==> The memory position where the next input data block must start is provided as the result of the function. 102 | 103 | Compression can then resume, using LZ4_compressHC_continue() or LZ4_compressHC_limitedOutput_continue(), as usual. 104 | 105 | When compression is completed, a call to LZ4_freeHC() will release the memory used by the LZ4HC Data Structure. 106 | */ 107 | 108 | 109 | #if defined (__cplusplus) 110 | } 111 | #endif 112 | -------------------------------------------------------------------------------- /deps/liblzf/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2000-2008 Marc Alexander Lehmann 2 | 3 | Redistribution and use in source and binary forms, with or without modifica- 4 | tion, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 14 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- 15 | CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- 17 | CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 18 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 19 | OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 20 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- 21 | ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 22 | OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | Alternatively, the contents of this file may be used under the terms of 25 | the GNU General Public License ("GPL") version 2 or any later version, 26 | in which case the provisions of the GPL are applicable instead of 27 | the above. If you wish to allow the use of your version of this file 28 | only under the terms of the GPL and not to allow others to use your 29 | version of this file under the BSD license, indicate your decision 30 | by deleting the provisions above and replace them with the notice 31 | and other provisions required by the GPL. If you do not delete the 32 | provisions above, a recipient may use your version of this file under 33 | either the BSD or the GPL. 34 | -------------------------------------------------------------------------------- /deps/liblzf/lzf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2000-2008 Marc Alexander Lehmann 3 | * 4 | * Redistribution and use in source and binary forms, with or without modifica- 5 | * tion, are permitted provided that the following conditions are met: 6 | * 7 | * 1. Redistributions of source code must retain the above copyright notice, 8 | * this list of conditions and the following disclaimer. 9 | * 10 | * 2. 
Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 15 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- 16 | * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 17 | * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- 18 | * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 20 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 21 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- 22 | * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 23 | * OF THE POSSIBILITY OF SUCH DAMAGE. 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * the GNU General Public License ("GPL") version 2 or any later version, 27 | * in which case the provisions of the GPL are applicable instead of 28 | * the above. If you wish to allow the use of your version of this file 29 | * only under the terms of the GPL and not to allow others to use your 30 | * version of this file under the BSD license, indicate your decision 31 | * by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL. If you do not delete the 33 | * provisions above, a recipient may use your version of this file under 34 | * either the BSD or the GPL. 35 | */ 36 | 37 | #ifndef LZF_H 38 | #define LZF_H 39 | 40 | /*********************************************************************** 41 | ** 42 | ** lzf -- an extremely fast/free compression/decompression-method 43 | ** http://liblzf.plan9.de/ 44 | ** 45 | ** This algorithm is believed to be patent-free. 46 | ** 47 | ***********************************************************************/ 48 | 49 | #define LZF_VERSION 0x0105 /* 1.5, API version */ 50 | 51 | /* 52 | * Compress in_len bytes stored at the memory block starting at 53 | * in_data and write the result to out_data, up to a maximum length 54 | * of out_len bytes. 55 | * 56 | * If the output buffer is not large enough or any error occurs return 0, 57 | * otherwise return the number of bytes used, which might be considerably 58 | * more than in_len (but less than 104% of the original size), so it 59 | * makes sense to always use out_len == in_len - 1), to ensure _some_ 60 | * compression, and store the data uncompressed otherwise (with a flag, of 61 | * course. 62 | * 63 | * lzf_compress might use different algorithms on different systems and 64 | * even different runs, thus might result in different compressed strings 65 | * depending on the phase of the moon or similar factors. However, all 66 | * these strings are architecture-independent and will result in the 67 | * original data when decompressed using lzf_decompress. 68 | * 69 | * The buffers must not be overlapping. 70 | * 71 | * If the option LZF_STATE_ARG is enabled, an extra argument must be 72 | * supplied which is not reflected in this header file. Refer to lzfP.h 73 | * and lzf_c.c. 
74 | * 75 | */ 76 | unsigned int 77 | lzf_compress (const void *const in_data, unsigned int in_len, 78 | void *out_data, unsigned int out_len); 79 | 80 | /* 81 | * Decompress data compressed with some version of the lzf_compress 82 | * function and stored at location in_data and length in_len. The result 83 | * will be stored at out_data up to a maximum of out_len characters. 84 | * 85 | * If the output buffer is not large enough to hold the decompressed 86 | * data, a 0 is returned and errno is set to E2BIG. Otherwise the number 87 | * of decompressed bytes (i.e. the original length of the data) is 88 | * returned. 89 | * 90 | * If an error in the compressed data is detected, a zero is returned and 91 | * errno is set to EINVAL. 92 | * 93 | * This function is very fast, about as fast as a copying loop. 94 | */ 95 | unsigned int 96 | lzf_decompress (const void *const in_data, unsigned int in_len, 97 | void *out_data, unsigned int out_len); 98 | 99 | #endif 100 | 101 | -------------------------------------------------------------------------------- /deps/liblzf/lzfP.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2000-2007 Marc Alexander Lehmann 3 | * 4 | * Redistribution and use in source and binary forms, with or without modifica- 5 | * tion, are permitted provided that the following conditions are met: 6 | * 7 | * 1. Redistributions of source code must retain the above copyright notice, 8 | * this list of conditions and the following disclaimer. 9 | * 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 15 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- 16 | * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 17 | * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- 18 | * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 20 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 21 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- 22 | * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 23 | * OF THE POSSIBILITY OF SUCH DAMAGE. 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * the GNU General Public License ("GPL") version 2 or any later version, 27 | * in which case the provisions of the GPL are applicable instead of 28 | * the above. If you wish to allow the use of your version of this file 29 | * only under the terms of the GPL and not to allow others to use your 30 | * version of this file under the BSD license, indicate your decision 31 | * by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL. If you do not delete the 33 | * provisions above, a recipient may use your version of this file under 34 | * either the BSD or the GPL. 35 | */ 36 | 37 | #ifndef LZFP_h 38 | #define LZFP_h 39 | 40 | #define STANDALONE 1 /* at the moment, this is ok. 
*/ 41 | 42 | #ifndef STANDALONE 43 | # include "lzf.h" 44 | #endif 45 | 46 | /* 47 | * Size of hashtable is (1 << HLOG) * sizeof (char *) 48 | * decompression is independent of the hash table size 49 | * the difference between 15 and 14 is very small 50 | * for small blocks (and 14 is usually a bit faster). 51 | * For a low-memory/faster configuration, use HLOG == 13; 52 | * For best compression, use 15 or 16 (or more, up to 23). 53 | */ 54 | #ifndef HLOG 55 | # define HLOG 16 56 | #endif 57 | 58 | /* 59 | * Sacrifice very little compression quality in favour of compression speed. 60 | * This gives almost the same compression as the default code, and is 61 | * (very roughly) 15% faster. This is the preferred mode of operation. 62 | */ 63 | #ifndef VERY_FAST 64 | # define VERY_FAST 1 65 | #endif 66 | 67 | /* 68 | * Sacrifice some more compression quality in favour of compression speed. 69 | * (roughly 1-2% worse compression for large blocks and 70 | * 9-10% for small, redundant, blocks and >>20% better speed in both cases) 71 | * In short: when in need for speed, enable this for binary data, 72 | * possibly disable this for text data. 73 | */ 74 | #ifndef ULTRA_FAST 75 | # define ULTRA_FAST 0 76 | #endif 77 | 78 | /* 79 | * Unconditionally aligning does not cost very much, so do it if unsure 80 | */ 81 | #ifndef STRICT_ALIGN 82 | # define STRICT_ALIGN !(defined(__i386) || defined (__amd64)) 83 | #endif 84 | 85 | /* 86 | * You may choose to pre-set the hash table (might be faster on some 87 | * modern cpus and large (>>64k) blocks, and also makes compression 88 | * deterministic/repeatable when the configuration otherwise is the same). 89 | */ 90 | #ifndef INIT_HTAB 91 | # define INIT_HTAB 0 92 | #endif 93 | 94 | /* 95 | * Avoid assigning values to errno variable? for some embedding purposes 96 | * (linux kernel for example), this is necessary. NOTE: this breaks 97 | * the documentation in lzf.h. 98 | */ 99 | #ifndef AVOID_ERRNO 100 | # define AVOID_ERRNO 0 101 | #endif 102 | 103 | /* 104 | * Whether to pass the LZF_STATE variable as argument, or allocate it 105 | * on the stack. For small-stack environments, define this to 1. 106 | * NOTE: this breaks the prototype in lzf.h. 107 | */ 108 | #ifndef LZF_STATE_ARG 109 | # define LZF_STATE_ARG 0 110 | #endif 111 | 112 | /* 113 | * Whether to add extra checks for input validity in lzf_decompress 114 | * and return EINVAL if the input stream has been corrupted. This 115 | * only shields against overflowing the input buffer and will not 116 | * detect most corrupted streams. 117 | * This check is not normally noticeable on modern hardware 118 | * (<1% slowdown), but might slow down older cpus considerably. 119 | */ 120 | #ifndef CHECK_INPUT 121 | # define CHECK_INPUT 1 122 | #endif 123 | 124 | /*****************************************************************************/ 125 | /* nothing should be changed below */ 126 | 127 | typedef unsigned char u8; 128 | 129 | typedef const u8 *LZF_STATE[1 << (HLOG)]; 130 | 131 | #if !STRICT_ALIGN 132 | /* for unaligned accesses we need a 16 bit datatype. 
*/ 133 | # include 134 | # if USHRT_MAX == 65535 135 | typedef unsigned short u16; 136 | # elif UINT_MAX == 65535 137 | typedef unsigned int u16; 138 | # else 139 | # undef STRICT_ALIGN 140 | # define STRICT_ALIGN 1 141 | # endif 142 | #endif 143 | 144 | #if ULTRA_FAST 145 | # if defined(VERY_FAST) 146 | # undef VERY_FAST 147 | # endif 148 | #endif 149 | 150 | #if INIT_HTAB 151 | # ifdef __cplusplus 152 | # include 153 | # else 154 | # include 155 | # endif 156 | #endif 157 | 158 | #endif 159 | 160 | -------------------------------------------------------------------------------- /deps/liblzf/lzf_d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2000-2007 Marc Alexander Lehmann 3 | * 4 | * Redistribution and use in source and binary forms, with or without modifica- 5 | * tion, are permitted provided that the following conditions are met: 6 | * 7 | * 1. Redistributions of source code must retain the above copyright notice, 8 | * this list of conditions and the following disclaimer. 9 | * 10 | * 2. Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED 15 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- 16 | * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 17 | * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- 18 | * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 20 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 21 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- 22 | * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 23 | * OF THE POSSIBILITY OF SUCH DAMAGE. 24 | * 25 | * Alternatively, the contents of this file may be used under the terms of 26 | * the GNU General Public License ("GPL") version 2 or any later version, 27 | * in which case the provisions of the GPL are applicable instead of 28 | * the above. If you wish to allow the use of your version of this file 29 | * only under the terms of the GPL and not to allow others to use your 30 | * version of this file under the BSD license, indicate your decision 31 | * by deleting the provisions above and replace them with the notice 32 | * and other provisions required by the GPL. If you do not delete the 33 | * provisions above, a recipient may use your version of this file under 34 | * either the BSD or the GPL. 
35 | */ 36 | 37 | #include "lzfP.h" 38 | 39 | #if AVOID_ERRNO 40 | # define SET_ERRNO(n) 41 | #else 42 | # include 43 | # define SET_ERRNO(n) errno = (n) 44 | #endif 45 | 46 | /* 47 | #if (__i386 || __amd64) && __GNUC__ >= 3 48 | # define lzf_movsb(dst, src, len) \ 49 | asm ("rep movsb" \ 50 | : "=D" (dst), "=S" (src), "=c" (len) \ 51 | : "0" (dst), "1" (src), "2" (len)); 52 | #endif 53 | */ 54 | 55 | unsigned int 56 | lzf_decompress (const void *const in_data, unsigned int in_len, 57 | void *out_data, unsigned int out_len) 58 | { 59 | u8 const *ip = (const u8 *)in_data; 60 | u8 *op = (u8 *)out_data; 61 | u8 const *const in_end = ip + in_len; 62 | u8 *const out_end = op + out_len; 63 | 64 | do 65 | { 66 | unsigned int ctrl = *ip++; 67 | 68 | if (ctrl < (1 << 5)) /* literal run */ 69 | { 70 | ctrl++; 71 | 72 | if (op + ctrl > out_end) 73 | { 74 | SET_ERRNO (E2BIG); 75 | return 0; 76 | } 77 | 78 | #if CHECK_INPUT 79 | if (ip + ctrl > in_end) 80 | { 81 | SET_ERRNO (EINVAL); 82 | return 0; 83 | } 84 | #endif 85 | 86 | #ifdef lzf_movsb 87 | lzf_movsb (op, ip, ctrl); 88 | #else 89 | do 90 | *op++ = *ip++; 91 | while (--ctrl); 92 | #endif 93 | } 94 | else /* back reference */ 95 | { 96 | unsigned int len = ctrl >> 5; 97 | 98 | u8 *ref = op - ((ctrl & 0x1f) << 8) - 1; 99 | 100 | #if CHECK_INPUT 101 | if (ip >= in_end) 102 | { 103 | SET_ERRNO (EINVAL); 104 | return 0; 105 | } 106 | #endif 107 | if (len == 7) 108 | { 109 | len += *ip++; 110 | #if CHECK_INPUT 111 | if (ip >= in_end) 112 | { 113 | SET_ERRNO (EINVAL); 114 | return 0; 115 | } 116 | #endif 117 | } 118 | 119 | ref -= *ip++; 120 | 121 | if (op + len + 2 > out_end) 122 | { 123 | SET_ERRNO (E2BIG); 124 | return 0; 125 | } 126 | 127 | if (ref < (u8 *)out_data) 128 | { 129 | SET_ERRNO (EINVAL); 130 | return 0; 131 | } 132 | 133 | #ifdef lzf_movsb 134 | len += 2; 135 | lzf_movsb (op, ref, len); 136 | #else 137 | *op++ = *ref++; 138 | *op++ = *ref++; 139 | 140 | do 141 | *op++ = *ref++; 142 | while (--len); 143 | #endif 144 | } 145 | } 146 | while (ip < in_end); 147 | 148 | return op - (u8 *)out_data; 149 | } 150 | 151 | -------------------------------------------------------------------------------- /deps/librtrie/main.c: -------------------------------------------------------------------------------- 1 | d#include 2 | #include "rtrie.h" 3 | 4 | uint8_t nodes[] = {0, 0, 0, 2, 16, 0, 0, 1, 11, 0, 0, 2, 19, 0, 0, 0, 4, 0, 0, 1, 8, 0, 0, 0, 22, 0, 0, 0}; 5 | uint8_t kids[] = {0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 103, 2, 0, 0, 104, 0, 0, 0, 0, 4, 0, 103, 111, 111, 100, 6 | 0, 0, 5, 0, 0, 98, 4, 0, 0, 0, 3, 0, 98, 121, 101, 0, 0, 0, 0, 0, 0, 0, 4, 0, 104, 101, 108, 108, 0, 7 | 0, 1, 0, 0, 111, 6, 0, 0, 115, 2, 0, 0, 0, 1, 0, 111, 0, 3, 0, 0, 116, 1, 0, 0, 0, 5, 0, 116, 104, 101, 8 | 114, 101, 0, 2, 0, 0, 0, 4, 0, 115, 105, 110, 107, 0, 0}; 9 | 10 | int main(int argc, const char * argv[]) 11 | { 12 | char result[256]; 13 | size_t rlen; 14 | uint32_t *node32s = (uint32_t*)nodes; 15 | uint32_t *kid32s = (uint32_t*)kids; 16 | 17 | printf("testing value_for_vid()...\n"); 18 | int x = value_for_vid(node32s, kid32s, 3, result, &rlen); 19 | result[rlen] = '\0'; 20 | printf("Done rval=%d %s\n", x, result); 21 | x = value_for_vid(node32s, kid32s, 1, result, &rlen); 22 | result[rlen] = '\0'; 23 | printf("Done rval=%d %s\n", x, result); 24 | x = value_for_vid(node32s, kid32s, 4, result, &rlen); 25 | result[rlen] = '\0'; 26 | printf("Done rval=%d %s\n", x, result); 27 | 28 | printf("testing vid_for_value()...\n"); 29 | uint32_t vid=0; 30 | x = vid_for_value(node32s, 
kid32s, "hello", 5, &vid); 31 | printf("Done vid=%d %d\n", vid, x); 32 | 33 | x = vid_for_value(node32s, kid32s, "hellothere", 10, &vid); 34 | printf("Done vid=%d %d\n", vid, x); 35 | 36 | x = vid_for_value(node32s, kid32s, "good", 4, &vid); 37 | printf("Done vid=%d %d\n", vid, x); 38 | 39 | x = vid_for_value(node32s, kid32s, "dung", 4, &vid); 40 | printf("Done vid=%d %d\n", vid, x); 41 | 42 | return 0; 43 | } 44 | 45 | -------------------------------------------------------------------------------- /deps/librtrie/pyrtrie.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Python.h" 6 | #include "rtrie.h" 7 | 8 | 9 | #if PY_MAJOR_VERSION >= 3 10 | #define PYSTR_CREATE PyBytes_FromStringAndSize 11 | #else 12 | #define PYSTR_CREATE PyString_FromStringAndSize 13 | #endif 14 | 15 | #if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN) 16 | typedef int Py_ssize_t; 17 | #define PY_SSIZE_T_MAX INT_MAX 18 | #define PY_SSIZE_T_MIN INT_MIN 19 | #endif 20 | 21 | 22 | static PyObject * 23 | py_print_it(PyObject *self, PyObject *args) 24 | { 25 | uint64_t node_ptr, kid_ptr; 26 | uint32_t *nodes; 27 | uint32_t *kids; 28 | 29 | if (!PyArg_ParseTuple(args, "K|K", &node_ptr, &kid_ptr)) 30 | return NULL; 31 | 32 | nodes = (uint32_t *)node_ptr; 33 | kids = (uint32_t *)kid_ptr; 34 | 35 | print_it(nodes, kids); 36 | Py_XINCREF(Py_None); 37 | return Py_None; 38 | } 39 | 40 | static PyObject * 41 | py_summarize(PyObject *self, PyObject *args) 42 | { 43 | uint64_t node_ptr, kid_ptr; 44 | uint32_t size; 45 | uint32_t *nodes; 46 | uint32_t *kids; 47 | 48 | if (!PyArg_ParseTuple(args, "K|K|I", &node_ptr, &kid_ptr, &size)) 49 | return NULL; 50 | 51 | nodes = (uint32_t *)node_ptr; 52 | kids = (uint32_t *)kid_ptr; 53 | 54 | summarize(nodes, kids, size); 55 | Py_XINCREF(Py_None); 56 | return Py_None; 57 | } 58 | 59 | static PyObject * 60 | py_value_for_vid(PyObject *self, PyObject *args) 61 | { 62 | uint64_t node_ptr, kid_ptr; 63 | uint32_t vid; 64 | uint32_t *nodes; 65 | uint32_t *kids; 66 | char res[8092]; 67 | size_t rlen; 68 | 69 | if (!PyArg_ParseTuple(args, "K|K|I", &node_ptr, &kid_ptr, &vid)) 70 | return NULL; 71 | 72 | nodes = (uint32_t *)node_ptr; 73 | kids = (uint32_t *)kid_ptr; 74 | 75 | if (!value_for_vid(nodes, kids, vid, res, &rlen)) { 76 | return PYSTR_CREATE(res, rlen); 77 | } 78 | Py_XINCREF(Py_None); 79 | return Py_None; 80 | } 81 | 82 | //TODO: these routines should return 0 on not found 83 | // int vid_for_value(uint32_t *nodes, uint32_t *kids, char *key, uint16_t key_len, uint32_t *vid); 84 | static PyObject * 85 | py_vid_for_value(PyObject *self, PyObject *args) 86 | { 87 | uint64_t node_ptr, kid_ptr; 88 | uint32_t vid; 89 | uint32_t *nodes; 90 | uint32_t *kids; 91 | char *key; 92 | Py_ssize_t key_len; 93 | 94 | if (!PyArg_ParseTuple(args, "K|K|s#", &node_ptr, &kid_ptr, &key, &key_len)) 95 | return NULL; 96 | 97 | nodes = (uint32_t *)node_ptr; 98 | kids = (uint32_t *)kid_ptr; 99 | 100 | if (!vid_for_value(nodes, kids, key, key_len, &vid)) { 101 | return PyInt_FromLong((long)vid); 102 | } 103 | Py_XINCREF(Py_None); 104 | return Py_None; 105 | } 106 | 107 | 108 | PyDoc_STRVAR(module_doc, "Python wrapper for the rtrie."); 109 | 110 | static PyMethodDef rtrie_methods[] = { 111 | 112 | {"value_for_vid", py_value_for_vid, METH_VARARGS, 113 | "Get Value based on VID"}, 114 | {"vid_for_value", py_vid_for_value, METH_VARARGS, 115 | "Get VID based on Value"}, 116 | {"print_it", py_print_it, METH_VARARGS, 117 | 
"Print rtrie"}, 118 | {"summarize", py_summarize, METH_VARARGS, 119 | "Summarize rtrie"}, 120 | 121 | {NULL, NULL, 0, NULL} 122 | }; 123 | 124 | 125 | #if PY_MAJOR_VERSION <= 2 126 | 127 | extern PyMODINIT_FUNC 128 | initrtrie(void) 129 | { 130 | PyObject *m; 131 | 132 | m = Py_InitModule3("rtrie", rtrie_methods, module_doc); 133 | 134 | if (m == NULL) 135 | return; 136 | PyModule_AddStringConstant(m, "__version__", MODULE_VERSION); 137 | } 138 | 139 | #else 140 | 141 | /* Python 3.x */ 142 | 143 | static PyModuleDef rtrie_module = { 144 | PyModuleDef_HEAD_INIT, 145 | "rtrie", 146 | module_doc, 147 | -1, 148 | rtrie_methods, 149 | NULL, 150 | NULL, 151 | NULL, 152 | NULL 153 | }; 154 | 155 | extern PyMODINIT_FUNC 156 | PyInit_rtrie(void) 157 | { 158 | PyObject *m; 159 | 160 | m = PyModule_Create(&rtrie_module); 161 | if (m == NULL) 162 | goto finally; 163 | PyModule_AddStringConstant(m, "__version__", MODULE_VERSION); 164 | 165 | finally: 166 | return m; 167 | } 168 | 169 | #endif 170 | -------------------------------------------------------------------------------- /deps/librtrie/rtrie.c: -------------------------------------------------------------------------------- 1 | #include "stdint.h" 2 | #include "string.h" 3 | #include "stdio.h" 4 | 5 | static char *_val_for_vid(uint32_t *nodes, uint32_t *kids, uint32_t vid, char *rval) { 6 | char *curr = rval; 7 | uint32_t node = nodes[vid]; 8 | uint32_t kid_offset = (uint32_t )0x00ffffff & node; 9 | uint32_t parent = kids[kid_offset++]; 10 | 11 | if (parent) { 12 | curr = _val_for_vid(nodes, kids, parent, rval); 13 | } 14 | 15 | uint16_t *radix = (uint16_t *)&kids[kid_offset]; 16 | char *radix_chars = (char *)(radix + 1); 17 | memcpy(curr, radix_chars, *radix); 18 | curr += *radix; 19 | return curr; 20 | } 21 | 22 | 23 | int value_for_vid(uint32_t *nodes, uint32_t *kids, uint32_t vid, char *result, size_t *rlen) { 24 | char *end = _val_for_vid(nodes, kids, vid, result); 25 | *rlen = end - result; 26 | return 0; 27 | } 28 | 29 | 30 | static uint32_t _find_binary(uint32_t *knodes, uint8_t kid_len, unsigned char selector) { 31 | int lower = 0; 32 | int upper = kid_len - 1; 33 | 34 | while (lower <= upper) { 35 | int mid = lower + ((upper - lower) / 2); 36 | 37 | // unpack the node - the high order byte is the selector for those children 38 | uint32_t knode = knodes[mid]; 39 | unsigned char rselect = (unsigned char) (knode >> 24); 40 | uint32_t node = ((uint32_t)0x00ffffff) & knode; 41 | 42 | if (rselect == selector) { 43 | return node; 44 | } 45 | else if (rselect < selector) { 46 | lower = mid + 1; 47 | } 48 | else{ 49 | upper = mid - 1; 50 | } 51 | } 52 | return 0; 53 | } 54 | 55 | 56 | static uint32_t _vid_for_value(uint32_t *nodes, uint32_t *kids, uint32_t vid, char *key, uint16_t key_len) { 57 | if (!key_len) { 58 | return vid; 59 | } 60 | 61 | uint32_t node = nodes[vid]; 62 | uint8_t kid_len = (uint8_t)(node >> 24); 63 | uint32_t kid_offset = ((uint32_t )0x00ffffff & node) + 1; 64 | uint16_t *radix = (uint16_t *)&kids[kid_offset]; 65 | uint16_t radix_len = *radix; 66 | uint16_t i; 67 | 68 | // we need to compare the radix to the key 69 | if (radix_len <= key_len) { 70 | char *radix_chars = (char *)(radix + 1); 71 | for (i = 0; i < radix_len; i++) { 72 | if (radix_chars[i] != key[i]) { 73 | return 0; 74 | } 75 | } 76 | 77 | // did we find the VID? 
78 | if (radix_len == key_len) { 79 | return vid; 80 | } 81 | 82 | // we have a matching radix, take the 'rest' of the key and match with it's children 83 | char *selector = key + radix_len; 84 | uint16_t selector_len = key_len - radix_len; 85 | uint16_t width = 2 + radix_len; 86 | kid_offset += width / 4; 87 | if (width % 4) { 88 | kid_offset++; 89 | } 90 | uint32_t *knodes = kids + kid_offset; 91 | uint32_t knode = _find_binary(knodes, kid_len, (unsigned char)(*selector)); 92 | if (knode) { 93 | return _vid_for_value(nodes, kids, knode, selector, selector_len); 94 | } 95 | } 96 | return 0; 97 | 98 | } 99 | 100 | int vid_for_value(uint32_t *nodes, uint32_t *kids, char *key, uint16_t key_len, uint32_t *vid) { 101 | uint32_t node = _vid_for_value(nodes, kids, 0, key, key_len); 102 | if (node) { 103 | *vid = node; 104 | return 0; 105 | } 106 | return -1; 107 | } 108 | 109 | 110 | static void _print_it(uint32_t *nodes, uint32_t *kids, uint32_t curr_node, unsigned char selector, int depth) { 111 | int i; 112 | uint32_t node = nodes[curr_node]; 113 | uint8_t kid_len = (uint8_t)(node >> 24); 114 | uint32_t kid_offset = (uint32_t )0x00ffffff & node; 115 | uint32_t *kid = kids + kid_offset; 116 | uint32_t parent = kid[0]; 117 | uint16_t radix_len = ((uint16_t *)kid)[2]; 118 | for(i = 0; i < depth; ++i) { 119 | printf(" "); 120 | } 121 | if(radix_len > 0) { 122 | char *radix = ((char *)kid) + 6; 123 | printf("%d '%.*s' ", radix_len, radix_len, radix); 124 | } 125 | else { 126 | printf(" "); 127 | } 128 | printf("%d(%d) '%c'(0x%x) - %d\n", curr_node, parent, selector, selector, kid_len); 129 | 130 | // pad 131 | uint32_t child_offset = 6 + radix_len; 132 | child_offset += (4 - (child_offset % 4)) % 4; 133 | child_offset /= 4; 134 | 135 | // process kids 136 | uint32_t *children = kid + child_offset; 137 | for(i = 0; i < kid_len; ++i) { 138 | uint32_t child = children[i]; 139 | unsigned char sel = (unsigned char)(child >> 24); 140 | uint32_t new_node = child & 0x00ffffff; 141 | _print_it(nodes, kids, new_node, sel, depth + 1); 142 | } 143 | } 144 | 145 | void print_it(uint32_t *nodes, uint32_t *kids) { 146 | _print_it(nodes, kids, 0, '\0', 0); 147 | } 148 | 149 | 150 | void summarize(uint32_t *nodes, uint32_t *kids, int num_nodes) { 151 | int i; 152 | printf("Summarize nodes=%p kids=%p num_nodes=%d\n", nodes, kids, num_nodes); 153 | for(i = 0; i < num_nodes; ++i) { 154 | uint32_t node = nodes[i]; 155 | uint8_t kid_len = (uint8_t)(node >> 24); 156 | uint32_t kid_offset = (uint32_t )0x00ffffff & node; 157 | uint32_t *kid = kids + kid_offset; 158 | uint32_t parent = kid[0]; 159 | uint16_t radix_len = ((uint16_t *)kid)[2]; 160 | 161 | printf("%d %d | %d ", kid_len, kid_offset, parent); 162 | if(radix_len > 0) { 163 | char *radix = ((char *)kid) + 6; 164 | printf("%d '%.*s'\n", radix_len, radix_len, radix); 165 | } 166 | else { 167 | printf("0 ''\n"); 168 | } 169 | //printf("%d %d(%d) - %d, %d\n", i, node, parent, kid_len, kid_offset); 170 | } 171 | } -------------------------------------------------------------------------------- /deps/librtrie/rtrie.h: -------------------------------------------------------------------------------- 1 | #ifndef _RTRIE_H_ 2 | #define _RTRIE_H_ 3 | 4 | #include "stdint.h" 5 | #include "stddef.h" 6 | 7 | int value_for_vid(uint32_t *nodes, uint32_t *kids, uint32_t vid, char *result, size_t *rlen); 8 | int vid_for_value(uint32_t *nodes, uint32_t *kids, char *key, uint16_t key_len, uint32_t *vid); 9 | void print_it(uint32_t *nodes, uint32_t *kids); 10 | void summarize(uint32_t 
*nodes, uint32_t *kids, int num_nodes); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /deps/libwtrie/test/test_wtrie.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import unittest 3 | from wtrie import Trie 4 | 5 | 6 | class TestWTrie(unittest.TestCase): 7 | def test_wtrie(self): 8 | t = Trie() 9 | self.assertEqual(t.add('hello'), 1) 10 | self.assertEqual(t.add('hell'), 2) 11 | self.assertEqual(t.add('hello'), 1) 12 | self.assertEqual(t.add('hellothere'), 3) 13 | self.assertEqual(t.add('good'), 4) 14 | self.assertEqual(t.add('goodbye'), 5) 15 | self.assertEqual(t.add('hello'), 1) 16 | self.assertEqual(t.add('hellsink'), 6) 17 | self.assertEqual(t.add(''), 0) 18 | 19 | # nodes = t.nodes 20 | # t.print_it() 21 | 22 | key, sz, pt = t.node_at_path() 23 | self.assertEqual(sz, 2) 24 | 25 | key, sz, pt = t.node_at_path(104) 26 | self.assertEqual(key, 'hell') 27 | self.assertEqual(pt, 0) 28 | self.assertEqual(sz, 2, 'actual %s' % sz) 29 | 30 | key2, sz, pt = t.node_at_path(104, 111) 31 | self.assertEqual(key2, 'o', 'actual %s' % key) 32 | self.assertEqual(pt, 2) 33 | self.assertEqual(sz, 1) 34 | 35 | key, sz, pt = t.node_at_path(104, 111, 116) 36 | self.assertEqual(key, 'there') 37 | self.assertEqual(pt, 1) 38 | self.assertEqual(sz, 0) 39 | 40 | n, k, _ = t.serialize() 41 | self.assertEqual(len(n), 7 * 4, "actual %d" % len(n)) 42 | self.assertEqual(len(k), 100, "actual %d" % len(k)) 43 | # print "sqork: %s" % t.kid_space 44 | 45 | print 'nodes', n 46 | print 'kids', k 47 | 48 | unpacked = struct.unpack_from("7I", n, 0) 49 | expected = (0x02000000, 0x01000010, 0x0200000b, 0x00000013, 0x01000004, 0x00000008, 0x00000016) 50 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 51 | 52 | unpacked = struct.unpack_from("IH2I", k, 0) 53 | expected = (0, 0, 0x67000004, 0x68000002) 54 | self.assertEqual(unpacked, expected, unpacked) 55 | 56 | unpacked = struct.unpack_from("IH4cI", k, 16) 57 | expected = (0x0000, 0x0004, 'g', 'o', 'o', 'd', 0x62000005) 58 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 59 | 60 | unpacked = struct.unpack_from("IH3c", k, 32) 61 | expected = (0x0004, 0x0003, 'b', 'y', 'e') 62 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 63 | 64 | unpacked = struct.unpack_from("IH4c2I", k, 44) 65 | expected = (0x0000, 0x0004, 'h', 'e', 'l', 'l', 0x6f000001, 0x73000006) 66 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 67 | 68 | unpacked = struct.unpack_from("IHcI", k, 64) 69 | expected = (0x0002, 1, 'o', 0x74000003) 70 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 71 | 72 | unpacked = struct.unpack_from("IH5c", k, 76) 73 | expected = (0x0001, 0x0005, 't', 'h', 'e', 'r', 'e') 74 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 75 | 76 | unpacked = struct.unpack_from("IH4c", k, 88) 77 | expected = (0x0002, 0x0004, 's', 'i', 'n', 'k') 78 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 79 | -------------------------------------------------------------------------------- /deps/maxhash/test/maxhash_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from maxhash import MinHeap, MaxHash 3 | 4 | 5 | class TestMinHeap(unittest.TestCase): 6 | def test_pop(self): 7 | m = MinHeap(1024, (3, 5, 2, 1)) 8 | self.assertEqual(m.pop(), 1) 9 | self.assertEqual(m.pop(), 2) 10 | self.assertEqual(m.pop(), 
3) 11 | self.assertEqual(m.pop(), 5) 12 | 13 | def test_push(self): 14 | m = MinHeap(1024, ()) 15 | m.push(1) 16 | m.push(3) 17 | m.push(2) 18 | self.assertEqual(m.pop(), 1) 19 | self.assertEqual(m.pop(), 2) 20 | self.assertEqual(m.pop(), 3) 21 | m.push(1) 22 | m.push(3) 23 | m.push(2) 24 | m.push(4) 25 | m.push(6) 26 | m.push(5) 27 | self.assertEqual(m.pop(), 1) 28 | self.assertEqual(m.pop(), 2) 29 | self.assertEqual(m.pop(), 3) 30 | self.assertEqual(m.pop(), 4) 31 | self.assertEqual(m.pop(), 5) 32 | self.assertEqual(m.pop(), 6) 33 | 34 | def test_nlargest(self): 35 | m = MinHeap(1024, [1, 2, 3, 4, 2, 1, 5, 6]) 36 | l = list(m.nlargest(3)) 37 | l.sort() 38 | self.assertEqual(l, [4, 5, 6]) 39 | 40 | 41 | class TestMaxHash(unittest.TestCase): 42 | def test_add(self): 43 | m = MaxHash(8192) 44 | m.add(str(1)) 45 | m.add(str(2)) 46 | m.add(str(3)) 47 | m.add(str(4)) 48 | self.assertEqual(len(m.uniq()), 4) 49 | 50 | def test_merge(self): 51 | r1 = range(10000) 52 | m1 = MaxHash(8192) 53 | r2 = range(2000, 12000) 54 | m2 = MaxHash(8192) 55 | r3 = range(15000) 56 | m3 = MaxHash(8192) 57 | for i in r1: 58 | m1.add(str(i)) 59 | for i in r2: 60 | m2.add(str(i)) 61 | for i in r3: 62 | m3.add(str(i)) 63 | m2.merge(m1) 64 | ix = MaxHash.get_jaccard_index([m2, m3]) 65 | self.assertAlmostEqual(ix, 0.80, 2) 66 | 67 | def test_union(self): 68 | r1 = range(10000) 69 | m1 = MaxHash(8192) 70 | r2 = range(2000, 12000) 71 | m2 = MaxHash(8192) 72 | r3 = range(15000) 73 | m3 = MaxHash(8192) 74 | for i in r1: 75 | m1.add(str(i)) 76 | for i in r2: 77 | m2.add(str(i)) 78 | for i in r3: 79 | m3.add(str(i)) 80 | m4 = m1.union(m2) 81 | ix = MaxHash.get_jaccard_index([m3, m4]) 82 | self.assertAlmostEqual(ix, 0.80, 2) 83 | 84 | def test_jarcard_index(self): 85 | r1 = range(10000) 86 | m1 = MaxHash(8192) 87 | r2 = range(2000, 10000) 88 | m2 = MaxHash(8192) 89 | for i in r1: 90 | m1.add(str(i)) 91 | for i in r2: 92 | m2.add(str(i)) 93 | ix = MaxHash.get_jaccard_index([m1, m2]) 94 | self.assertAlmostEqual(ix, 0.80, 2) 95 | -------------------------------------------------------------------------------- /deps/scamurmur3/murmur3.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the 3 | // public domain. The author hereby disclaims copyright to this source 4 | // code. 5 | 6 | #ifndef _MURMURHASH3_H_ 7 | #define _MURMURHASH3_H_ 8 | 9 | #include 10 | 11 | //----------------------------------------------------------------------------- 12 | 13 | void MurmurHash3_x86_32 (const void *key, int len, uint32_t seed, void *out); 14 | 15 | void MurmurHash3_x86_128(const void *key, int len, uint32_t seed, void *out); 16 | 17 | void MurmurHash3_x64_128(const void *key, int len, uint32_t seed, void *out); 18 | 19 | //----------------------------------------------------------------------------- 20 | 21 | #endif // _MURMURHASH3_H_ 22 | -------------------------------------------------------------------------------- /deps/scamurmur3/scamurmur3.c: -------------------------------------------------------------------------------- 1 | 2 | // License: MIT License 3 | // http://www.opensource.org/licenses/mit-license.php 4 | 5 | // SMHasher code is from SMHasher project, authored by Austin Appleby, et al. 6 | // http://code.google.com/p/smhasher/ 7 | 8 | // Python extension code by Patrick Hensley 9 | // Ported from C++ to C by Chango Corp. 
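// Usage sketch from Python (hedged, based on the method table registered
// below): once built via deps/setup.py, the module exposes
// murmur3_x86_32(key, seed=0) plus the 64- and 128-bit variants, e.g.:
//
//   >>> from scamurmur3 import murmur3_x86_32
//   >>> murmur3_x86_32("hustle")      # optional second argument is the seed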
10 | 11 | 12 | #include 13 | #include "murmur3.h" 14 | 15 | 16 | #if PY_VERSION_HEX < 0x02050000 17 | typedef int Py_ssize_t; 18 | #define PY_SSIZE_T_MAX INT_MAX 19 | #define PY_SSIZE_T_MIN INT_MIN 20 | #endif 21 | 22 | 23 | static PyObject * 24 | _py_murmur3_128(PyObject *self, PyObject *args, int x86, int size) 25 | { 26 | const char *key; 27 | Py_ssize_t len; 28 | uint32_t seed = 0; 29 | unsigned char out[16]; 30 | 31 | if (!PyArg_ParseTuple(args, "s#|I", &key, &len, &seed)) { 32 | return NULL; 33 | } 34 | 35 | if (x86) { 36 | MurmurHash3_x86_128((void *)key, len, seed, &out); 37 | } else { 38 | MurmurHash3_x64_128((void *)key, len, seed, &out); 39 | } 40 | 41 | return _PyLong_FromByteArray((const unsigned char *)&out, size, 0, 0); 42 | } 43 | 44 | static PyObject * 45 | py_murmur3_x86_32(PyObject *self, PyObject *args) 46 | { 47 | const char *key; 48 | Py_ssize_t len; 49 | uint32_t seed = 0; 50 | unsigned char out[4]; 51 | 52 | if (!PyArg_ParseTuple(args, "s#|I", &key, &len, &seed)) { 53 | return NULL; 54 | } 55 | 56 | MurmurHash3_x86_32((void *)key, len, seed, &out); 57 | 58 | return _PyLong_FromByteArray((const unsigned char *)&out, 4, 0, 0); 59 | } 60 | 61 | static PyObject * 62 | py_murmur3_x86_64(PyObject *self, PyObject *args) 63 | { 64 | return _py_murmur3_128(self, args, 1, 8); 65 | } 66 | 67 | 68 | static PyObject * 69 | py_murmur3_x64_64(PyObject *self, PyObject *args) 70 | { 71 | return _py_murmur3_128(self, args, 0, 8); 72 | } 73 | 74 | 75 | static PyObject * 76 | py_murmur3_x86_128(PyObject *self, PyObject *args) 77 | { 78 | return _py_murmur3_128(self, args, 1, 16); 79 | } 80 | 81 | 82 | static PyObject * 83 | py_murmur3_x64_128(PyObject *self, PyObject *args) 84 | { 85 | return _py_murmur3_128(self, args, 0, 16); 86 | } 87 | 88 | 89 | PyDoc_STRVAR(module_doc, "Python wrapper for the SMHasher routines."); 90 | 91 | static PyMethodDef scamurmur3_methods[] = { 92 | {"murmur3_x86_32", py_murmur3_x86_32, METH_VARARGS, 93 | "Make an x86 murmur3 32-bit hash value"}, 94 | 95 | {"murmur3_x86_64", py_murmur3_x86_64, METH_VARARGS, 96 | "Make an x86 murmur3 64-bit hash value"}, 97 | {"murmur3_x64_64", py_murmur3_x64_64, METH_VARARGS, 98 | "Make an x64 murmur3 64-bit hash value"}, 99 | 100 | {"murmur3_x86_128", py_murmur3_x86_128, METH_VARARGS, 101 | "Make an x86 murmur3 128-bit hash value"}, 102 | {"murmur3_x64_128", py_murmur3_x64_128, METH_VARARGS, 103 | "Make an x64 murmur3 128-bit hash value"}, 104 | 105 | {NULL, NULL, 0, NULL} 106 | }; 107 | 108 | 109 | #if PY_MAJOR_VERSION <= 2 110 | 111 | extern PyMODINIT_FUNC 112 | initscamurmur3(void) 113 | { 114 | PyObject *m; 115 | 116 | m = Py_InitModule3("scamurmur3", scamurmur3_methods, module_doc); 117 | 118 | if (m == NULL) 119 | return; 120 | PyModule_AddStringConstant(m, "__version__", MODULE_VERSION); 121 | } 122 | 123 | #else 124 | 125 | /* Python 3.x */ 126 | 127 | static PyModuleDef scamurmur3_module = { 128 | PyModuleDef_HEAD_INIT, 129 | "scamurmur3", 130 | module_doc, 131 | -1, 132 | scamurmur3_methods, 133 | NULL, 134 | NULL, 135 | NULL, 136 | NULL 137 | }; 138 | 139 | extern PyMODINIT_FUNC 140 | PyInit_smhasher(void) 141 | { 142 | PyObject *m; 143 | 144 | m = PyModule_Create(&smhasher_module); 145 | if (m == NULL) 146 | goto finally; 147 | PyModule_AddStringConstant(m, "__version__", MODULE_VERSION); 148 | 149 | finally: 150 | return m; 151 | } 152 | 153 | #endif 154 | 155 | -------------------------------------------------------------------------------- /deps/setup.py: 
-------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | from Cython.Distutils import build_ext 4 | 5 | 6 | __version__ = '"0.3.0"' 7 | 8 | 9 | scamurmur3 = Extension( 10 | 'scamurmur3', 11 | sources=[ 12 | 'scamurmur3/scamurmur3.c', 13 | 'scamurmur3/murmur3.c', 14 | ], 15 | include_dirs=['./scamurmur3'], 16 | define_macros=[('MODULE_VERSION', __version__)] 17 | ) 18 | 19 | clzf = Extension( 20 | 'clzf', 21 | sources=[ 22 | 'liblzf/clzf.c', 23 | 'liblzf/lzf_c.c', 24 | 'liblzf/lzf_d.c', 25 | ], 26 | include_dirs=['./liblzf'], 27 | define_macros=[('MODULE_VERSION', __version__)] 28 | ) 29 | 30 | rtrie = Extension( 31 | 'rtrie', 32 | sources=[ 33 | 'librtrie/pyrtrie.c', 34 | 'librtrie/rtrie.c', 35 | ], 36 | include_dirs=['./librtrie'], 37 | define_macros=[('MODULE_VERSION', __version__)] 38 | ) 39 | 40 | wtrie = Extension( 41 | "wtrie", 42 | sources=["libwtrie/wtrie.pyx"], 43 | define_macros=[('MODULE_VERSION', __version__)] 44 | ) 45 | 46 | clz4 = Extension( 47 | 'clz4', 48 | sources=[ 49 | 'liblz4/clz4.c', 50 | 'liblz4/lz4.c', 51 | 'liblz4/lz4hc.c', 52 | ], 53 | include_dirs=['./liblz4'], 54 | define_macros=[('MODULE_VERSION', __version__)] 55 | ) 56 | 57 | cardunion = Extension( 58 | "cardunion", 59 | ["cardunion/cardunion.pyx"], 60 | libraries=["m"], 61 | define_macros=[('MODULE_VERSION', __version__)] 62 | ) 63 | 64 | ebitset = Extension( 65 | "pyebset", 66 | sources=["libebset/pyebset.pyx"], 67 | include_dirs=['./libebset'], 68 | language="c++", 69 | define_macros=[('MODULE_VERSION', __version__)] 70 | ) 71 | 72 | maxhash = Extension( 73 | "maxhash", 74 | sources=["maxhash/maxhash.pyx"], 75 | define_macros=[('MODULE_VERSION', __version__)] 76 | ) 77 | 78 | lru = Extension( 79 | "pylru", 80 | sources=["liblru/pylru.pyx"], 81 | include_dirs=['./liblru'], 82 | language="c++", 83 | define_macros=[('MODULE_VERSION', __version__)] 84 | ) 85 | 86 | 87 | lmdb = Extension( 88 | "mdb", 89 | sources=["liblmdb/db.pyx", ], 90 | libraries=["lmdb"], 91 | library_dirs=["/usr/local/lib"], 92 | include_dirs=["/usr/local/include"], 93 | runtime_library_dirs=["/usr/local/lib"]) 94 | 95 | 96 | setup( 97 | name = "hustle-deps", 98 | version = __version__, 99 | cmdclass = {'build_ext': build_ext}, 100 | description=('Hustle-deps: a collection of dependent libraries.'), 101 | author = 'Chango Inc.', 102 | license = 'MIT', 103 | ext_modules = [ 104 | scamurmur3, 105 | cardunion, 106 | ebitset, 107 | maxhash, 108 | clzf, 109 | clz4, 110 | rtrie, 111 | wtrie, 112 | lru, 113 | lmdb, 114 | ] 115 | ) 116 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | -------------------------------------------------------------------------------- /doc/_static/hustle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tspurway/hustle/e62bf1269b446ea6fae23bc5698f845a2f3247c7/doc/_static/hustle.png -------------------------------------------------------------------------------- /doc/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {%- block extrahead %} 4 | {{ super() }} 5 | 10 | {% endblock %} 11 | 12 | {% block footer %} 13 | {{ super() }} 14 | 25 | {% endblock %} 26 | 27 | 
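Before the generated API reference below, a hedged smoke test for the dependency build: each extension declared in deps/setup.py above should be importable once the extensions have been compiled (e.g. via bootstrap.sh or a `build_ext` run; the exact invocation is assumed):

    # each module name comes from deps/setup.py's ext_modules list
    import scamurmur3, cardunion, pyebset, maxhash, clzf, clz4
    import rtrie, wtrie, pylru, mdb

    # murmur3_x86_32(key, seed=0) is defined in scamurmur3/scamurmur3.c
    print(scamurmur3.murmur3_x86_32("hustle"))
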
-------------------------------------------------------------------------------- /doc/api/core.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: hustle.core.marble 2 | :members: -------------------------------------------------------------------------------- /doc/api/hustle.rst: -------------------------------------------------------------------------------- 1 | .. automodule:: hustle 2 | :members: -------------------------------------------------------------------------------- /doc/howto/cli.rst: -------------------------------------------------------------------------------- 1 | .. _cliguide: 2 | 3 | Hustle Command Line Interface (CLI) 4 | =================================== 5 | 6 | After installing Hustle, you can invoke the Hustle CLI from the installation directory like this:: 7 | 8 | bin/hustle 9 | 10 | Assuming you've installed everything and have a running and correctly configured *Disco* instance, you will get a 11 | Python prompt looking something like this:: 12 | 13 | ➜ bin git:(develop) ✗ ./hustle 14 | Loading Hustle Tables from disco://localhost 15 | impressions 16 | pixels 17 | Welcome to Hustle! Type `commands()` or `tables()` for some help, `exit()` to leave. 18 | >>> 19 | 20 | We see here that the CLI has loaded the Hustle tables called *impressions* and *pixels* from the 21 | *disco://localhost* cluster. The CLI loads these into Python's global namespace, so that these 22 | :class:`Tables ` are instantiated and bound to their table names as Python variables:: 23 | 24 | >>> schema(impressions) 25 | ad_id (int32,IX) cpm_millis (uint32) date (string,IX,PT) 26 | site_id (dict(32),IX) time (uint32,IX) token (string,IX) 27 | url (dict(32)) 28 | 29 | gives the *schema* of the *impressions* table. Doing a query is just as simple:: 30 | 31 | >>> select(impressions.ad_id, h_sum(impressions.cpm_millis), where=impressions.date == '2014-01-20') 32 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 33 | ad_id sum(cpm_millis) 34 | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 35 | 30,016 1,690 36 | 30,003 925 37 | 30,019 2,023 38 | 30,024 1,511 39 | 30,009 863 40 | 30,025 3,124 41 | 30,010 2,555 42 | 30,011 2,150 43 | 30,014 4,491 44 | 45 | 46 | The CLI offers the following features over and above being a 'normal' Python REPL: 47 | - configurable command history 48 | - no *import* statements required to load Hustle functionality 49 | - auto-completion (with TAB key) of all Hustle functions, Tables, and Columns 50 | - query results (from :func:`select `) are automatically sent to *stdout* 51 | -------------------------------------------------------------------------------- /doc/howto/configure.rst: -------------------------------------------------------------------------------- 1 | ..
_configureguide: 2 | 3 | Configuring Hustle 4 | ================== 5 | 6 | Hustle has a configuration file located at:: 7 | 8 | /etc/hustle/settings.yaml 9 | 10 | which supports the following settings: 11 | 12 | ============== ============================== ============================================== 13 | Name Default Value Description 14 | ============== ============================== ============================================== 15 | server disco://localhost The Disco master node 16 | worker_class hustle.core.pipeworker.Worker The Disco Worker class 17 | dump False True will automatically print select() results 18 | nest False True will return a Table from select() 19 | partition 16 The number of partitions for restrict-select 20 | history_size 1000 The number of history entries in the CLI 21 | ============== ============================== ============================================== 22 | 23 | -------------------------------------------------------------------------------- /doc/howto/delete.rst: -------------------------------------------------------------------------------- 1 | .. _deleteguide: 2 | 3 | Deleting Data in Hustle 4 | ======================= 5 | Deleting data in Hustle is partition-oriented, which means you *can't* remove specific rows the way conventional database systems do. There are two functions for this, each with a different granularity. 6 | 7 | Delete 8 | ------ 9 | The :func:`delete() ` function *only* deletes data; it keeps the table definition. If a :class:`Table ` object is specified, all data in that table will be deleted. To delete a particular range of partitions, pass it an :class:`Expr `, for example, "impressions.date < '2014-01-01'". 10 | 11 | .. seealso:: 12 | 13 | :func:`hustle.delete` 14 | Hustle's delete statement 15 | 16 | :ref:`schemadesign` 17 | Details of the Hustle Partition 18 | 19 | Drop 20 | ---- 21 | Use the :func:`drop() ` function to delete the whole table, including data, all partitions, and the table definition. Unlike :func:`delete() `, it *only* takes a :class:`Table ` object to specify the table you want to drop. 22 | 23 | .. seealso:: 24 | 25 | :func:`hustle.drop` 26 | Hustle's drop statement 27 | -------------------------------------------------------------------------------- /doc/howto/insert.rst: -------------------------------------------------------------------------------- 1 | .. _insertguide: 2 | 3 | Inserting Data To Hustle 4 | ======================== 5 | 6 | The process of inserting data into a Hustle cluster is referred to as a *distributed* insert. It is 7 | distributed because the client machine does the heavy lifting of creating a 8 | :class:`Marble `, a self-contained, large-grained database fragment, which 9 | is then `pushed into the distributed file system DDFS `_, 10 | a relatively inexpensive HTTP operation. The write throughput to the Hustle cluster, then, is bound only 11 | by the number of machines inserting into it. 12 | 13 | Hustle currently supports `one JSON object per line `_ style input, as well as 14 | `Disco's native results format `_. 15 | 16 | Here is an example insert:: 17 | 18 | from hustle import Table, insert 19 | impressions = Table.from_tag('impressions') 20 | insert(impressions, './impressions-june-8.json', server='disco://hustle') 21 | 22 | 
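The input file here is plain newline-delimited JSON: one object per line, with keys matching the table's column names. A hypothetical two-line fragment for the *impressions* table (field names follow the schema shown in the CLI guide; the values are made up) might look like::

    {"token": "a1b2c3", "url": "http://example.com/x", "ad_id": 30010, "cpm_millis": 1250, "date": "2014-06-08", "time": 123456}
    {"token": "d4e5f6", "url": "http://example.org/y", "ad_id": 30003, "cpm_millis": 925, "date": "2014-06-08", "time": 140212}
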
Hustle provides a command-line tool for inserting data located at :code:`bin/insert`. Here is the *--help* for 23 | it:: 24 | 25 | ➜ hustle/bin > ./insert --help 26 | usage: insert [-h] [-s SERVER] [-f INFILE] [-m MAXSIZE] [-t TMPDIR] 27 | [-p PROCESSOR] [--disco-chunk] 28 | TABLE FILES [FILES ...] 29 | 30 | Hustle bulk load 31 | 32 | positional arguments: 33 | TABLE The Hustle table to insert to 34 | FILES The JSON formatted files to insert 35 | 36 | optional arguments: 37 | -h, --help show this help message and exit 38 | -s SERVER, --server SERVER 39 | DDFS server destination 40 | -f INFILE A file containing a list of all files to be inserted 41 | -m MAXSIZE, --maxsize MAXSIZE 42 | Initial size of Hustle marble 43 | -t TMPDIR, --tmpdir TMPDIR 44 | Temporary directory for Hustle marble creation 45 | -p PROCESSOR a module.function for the Hustle import preprocessor 46 | --disco-chunk Indicates whether the input files are in Disco CHUNK format 47 | 48 | .. seealso:: 49 | 50 | Page :ref:`integrationtests` 51 | Hustle's Integration Test Suite for creating and inserting to partitioned Tables. 52 | 53 | :func:`insert() function ` 54 | 55 | -------------------------------------------------------------------------------- /doc/howto/integration_tests.rst: -------------------------------------------------------------------------------- 1 | .. _integrationtests: 2 | 3 | Hustle Integration Test Suite 4 | ============================= 5 | 6 | The Hustle Integration Test suite is a good place to see non-trivial Hustle Tables created, 7 | data inserted into them, and some subsequent queries. They are located in:: 8 | 9 | hustle/integration_test 10 | 11 | To run the test suite, ensure you have installed `Nose `_ and 12 | :ref:`Hustle `. Before you run the integration tests, you will need to make sure 13 | `Disco `_ is running and that you have run the *setup.py* script once:: 14 | 15 | python hustle/integration_test/setup.py 16 | 17 | You can then execute the *nosetests* in the integration suite:: 18 | 19 | cd hustle/integration_test 20 | nosetests 21 | 22 | -------------------------------------------------------------------------------- /doc/hustle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tspurway/hustle/e62bf1269b446ea6fae23bc5698f845a2f3247c7/doc/hustle.png -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | Hustle Documentation 2 | ==================== 3 | 4 | Hustle is a distributed, column oriented, relational 5 | `OLAP Database `_. Hustle supports parallel insertions 6 | and queries over large data sets, stored on an unreliable cluster of computers. It is meant to load and query the 7 | enormous data sets typical of ad-tech, high volume web services, and other large-scale analytics applications. 8 | 9 | Hustle is a distributed database. When data is inserted into Hustle, it is replicated across a cluster to enhance 10 | availability and horizontal scalability, and to enable parallel query execution. When data is replicated on multiple nodes, 11 | your database becomes resistant to node failure because there are always multiple copies of it on the cluster. This 12 | allows you to simply add more machines to increase overall storage and to decrease query time by performing 13 | more operations in parallel. 14 | 15 | Hustle is a relational database, so, unlike other NoSQL databases, it stores its data in rows and columns with a fixed 16 | schema. This means that you must *create* Tables with a fixed number of Columns of specific data types before 17 | *inserting* data into the database. The advantage of this is that both storage and query execution can be 18 | fine-tuned to minimize both the data footprint and the query execution time.
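For example, a table resembling the *impressions* table used throughout these docs could be declared like this (a sketch; the column-declaration strings follow the syntax used in the integration test suite)::

    from hustle import Table
    impressions = Table.create('impressions',
                               columns=['wide index string token', 'trie url',
                                        'index trie site_id', 'uint cpm_millis',
                                        'index int ad_id', 'index string date',
                                        'index uint time'],
                               partition='date',
                               force=True)
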
This means that you must *create* Tables with a fixed number of Columns of specific data types, before 17 | *inserting* data into the database. The advantage of this is that both storage and query execution can be 18 | fine tuned to minimize both the data footprint and the query execution time. 19 | 20 | Hustle uses a `column oriented format `_ for storing data. This 21 | scheme is often used for very large databases, as it is more efficient for aggregation operations such as sum() and 22 | average() functions over a particular column as well as relational *joins* across tables. 23 | 24 | Although Hustle has a relational data model, it is not a SQL database. Hustle extends the Python language for 25 | its relational query facility. Let's take a look at a typical Hustle query in Python:: 26 | 27 | select(impressions.ad_id, h_sum(pixels.amount), h_count(), 28 | where=(impressions.date < '2014-01-13', pixels.date < '2014-01-13'), 29 | join=(impressions.site_id, pixels.site_id), 30 | order_by='ad_id', desc=True) 31 | 32 | which would be equivalent to the SQL query:: 33 | 34 | SELECT i.ad_id, i.site_id, sum(p.amount), count(*) 35 | FROM impressions i 36 | JOIN pixels p on p.site_id = p.site_id 37 | WHERE i.date < '2014-01-13' and p.date < '2014-01-13' 38 | ORDER BY i.ad_id DESC 39 | GROUP BY i.ad_id, i.site_id 40 | 41 | The two approaches seem equivalent, however, Python is extensible, whereas SQL is not. You can do much more 42 | with Hustle than just query data. Hustle was designed to express distributed computation over indexed data which 43 | includes, but is not limited to the classic relational *select* statement. SQL is good at queries, not as an ecosystem 44 | for general purpose data-centric distributed computation. 45 | 46 | Hustle is meant for large, distributed inserts, and has *append only* semantics. It is suited to very large *log* 47 | file style inputs, and once data is inserted, it cannot be changed. This scheme is typically suitable for 48 | distributed applications that generate large log files, with many (possibly hundreds of) thousands of events 49 | per second. Hustle has been streamlined to accept structured JSON log files as its primary input format, and to 50 | perform *distributed* inserts. A distributed insert delegates most of the database creation work to the *client*, 51 | thereby freeing up the cluster's resources and avoiding a central computational pinch point like in other *write bound* 52 | relational OLAP databases. Hustle can easily handle almost unlimited write load using this scheme. 53 | 54 | Hustle utilizes modern compression and indexing data structures and algorithms to minimize overall memory footprint 55 | and to maximize query performance. It utilizes bitmap indexes, prefix trie (dictionary) and lz4 compression, and has a 56 | very rich set of string and numeric data types of various sizes. Typically, Hustle data sets are 25% to 50% than 57 | their equivalent GZIPed JSON sources. 
45 | 46 | Hustle is meant for large, distributed inserts, and has *append only* semantics. It is suited to very large *log* 47 | file style inputs, and once data is inserted, it cannot be changed. This scheme is typically suitable for 48 | distributed applications that generate large log files, with many (possibly hundreds of) thousands of events 49 | per second. Hustle has been streamlined to accept structured JSON log files as its primary input format, and to 50 | perform *distributed* inserts. A distributed insert delegates most of the database creation work to the *client*, 51 | thereby freeing up the cluster's resources and avoiding the central computational pinch point found in other *write bound* 52 | relational OLAP databases. Hustle can easily handle almost unlimited write load using this scheme. 53 | 54 | Hustle utilizes modern compression and indexing data structures and algorithms to minimize overall memory footprint 55 | and to maximize query performance. It utilizes bitmap indexes, prefix trie (dictionary) and lz4 compression, and has a 56 | very rich set of string and numeric data types of various sizes. Typically, Hustle data sets are 25% to 50% of the size of 57 | their equivalent GZIPed JSON sources. 58 | 59 | Hustle has several auxiliary tools: 60 | 61 | * a command line interface (CLI) Python shell with auto-completion of Hustle tables and functions 62 | * a client side insert script 63 | 64 | Features 65 | -------- 66 | 67 | * column oriented - super fast queries 68 | * distributed insert - Hustle is designed for petabyte scale datasets in a distributed environment with massive write loads 69 | * compressed - bitmap indexes, lz4, and prefix trie compression 70 | * relational - join gigantic data sets 71 | * partitioned - smart shards 72 | * embarrassingly distributed (`based on Disco `_) 73 | * embarrassingly fast (`uses LMDB `_) 74 | * NoSQL - Python DSL 75 | * bulk append only semantics 76 | * highly available, horizontally scalable 77 | * REPL/CLI query interface 78 | 79 | Getting started 80 | --------------- 81 | 82 | .. toctree:: 83 | :titlesonly: 84 | 85 | start/install 86 | .. start/tutorial 87 | 88 | Hustle In Depth 89 | --------------- 90 | 91 | .. toctree:: 92 | :titlesonly: 93 | 94 | howto/integration_tests 95 | howto/configure 96 | howto/cli 97 | howto/schema 98 | howto/query 99 | howto/insert 100 | howto/delete 101 | 102 | Reference 103 | --------- 104 | 105 | .. toctree:: 106 | :titlesonly: 107 | 108 | api/hustle 109 | api/core 110 | -------------------------------------------------------------------------------- /doc/start/install.rst: -------------------------------------------------------------------------------- 1 | .. _installguide: 2 | 3 | Installing Hustle 4 | ================= 5 | 6 | Hustle is hosted on `GitHub `_ and should be cloned from that repo:: 7 | 8 | git clone git@github.com:chango/hustle.git 9 | 10 | Dependencies 11 | ------------ 12 | 13 | Hustle has the following dependencies: 14 | 15 | * you will need `Python 2.7 `_ 16 | * you will need `Cython `_ 17 | * you will need `Disco 0.5 `_ 18 | * you will need `ultrajson `_ 19 | * you will need `PyYAML `_ 20 | 21 | Installing the Hustle Client 22 | ---------------------------- 23 | 24 | In order to run Hustle, you will need to install it onto an existing *Disco v0.5* cluster. 25 | 26 | To query a Hustle/Disco cluster, you will also need to install the Hustle software on the *client* machine:: 27 | 28 | cd hustle 29 | sudo ./bootstrap.sh 30 | 31 | This will build and install Hustle on your client machine. 32 | 33 | Installing on the Cluster 34 | ------------------------- 35 | 36 | Disco is a distributed system and may have many nodes. Each of the nodes in your Disco cluster will need 37 | the Hustle dependencies installed. These can be found in the *hustle/deps* directory. The easiest way to install Hustle on 38 | your Disco slave nodes is to:: 39 | 40 | cd hustle/deps 41 | make 42 | sudo make install 43 | 44 | on **ALL** your Disco slave nodes. 45 | 46 | You may now want to go and run the :ref:`Integration Tests ` to validate your installation. 47 | -------------------------------------------------------------------------------- /doc/start/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | Hustle Tutorial 4 | =============== 5 | 6 | coming soon....
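In the meantime, a minimal end-to-end session, stitched together from the install, insert, and CLI guides (paths and table names here are illustrative), looks like::

    from hustle import Table, insert, select, h_sum
    impressions = Table.from_tag('impressions')
    insert(impressions, './impressions-june-8.json', server='disco://localhost')
    select(impressions.ad_id, h_sum(impressions.cpm_millis),
           where=impressions.date == '2014-01-20')
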
-------------------------------------------------------------------------------- /hustle/cardinality.py: -------------------------------------------------------------------------------- 1 | from hustle.core.marble import Aggregation, Column 2 | 3 | import mdb 4 | 5 | 6 | def h_cardinality(col): 7 | """Estimate the distinct-value count of a HyperLogLog (Cardunion) column.""" 8 | 9 | 10 | def _inner_default(): 11 | from cardunion import Cardunion 12 | return Cardunion(12) 13 | 14 | def _inner_hll_accumulate(a, v): 15 | a.bunion([v]) 16 | return a 17 | 18 | return Aggregation("cardinality", 19 | col, 20 | f=_inner_hll_accumulate, 21 | g=lambda a: a.count(), 22 | h=lambda a: a.dumps(), 23 | default=_inner_default, 24 | result_spec=Column('_cardinality_type', type_indicator=mdb.MDB_UINT_32)) 25 | 26 | 27 | def h_union(col): 28 | def _inner_default(): 29 | from cardunion import Cardunion 30 | return Cardunion(12) 31 | 32 | def _inner_hll_accumulate(a, v): 33 | a.bunion([v]) 34 | return a 35 | 36 | return Aggregation("union", 37 | col, 38 | f=_inner_hll_accumulate, 39 | g=lambda a, c: a.dumps(), 40 | h=lambda a: a.dumps(), 41 | default=_inner_default, 42 | result_spec=Column('_union_type', type_indicator=mdb.MDB_STR, compression_indicator=3)) 43 | 44 | 45 | def h_minhash_merge(col): 46 | def _inner_default(): 47 | from maxhash import MaxHash 48 | return MaxHash() 49 | 50 | def _inner_maxhash_accumulate(a, v): 51 | from maxhash import MaxHash 52 | a.merge(MaxHash.loads(v)) 53 | return a 54 | 55 | return Aggregation("minhash_merge", 56 | col, 57 | f=_inner_maxhash_accumulate, 58 | g=lambda a, c: a.dumps(), 59 | h=lambda a: a.dumps(), 60 | default=_inner_default, 61 | result_spec=Column('_minhash_merge_type', type_indicator=mdb.MDB_STR, compression_indicator=3)) 62 | -------------------------------------------------------------------------------- /hustle/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tspurway/hustle/e62bf1269b446ea6fae23bc5698f845a2f3247c7/hustle/core/__init__.py -------------------------------------------------------------------------------- /hustle/core/column_fn.py: -------------------------------------------------------------------------------- 1 | from hustle.core.marble import Column 2 | 3 | import mdb 4 | 5 | 6 | class ColumnFn(object): 7 | """ 8 | Decorator for column functions. 9 | 10 | Note that the new column produced by a column function inherits all properties from the 11 | original column by default. If the column function changes the data type 12 | of the original column, remember to override the corresponding indicators, 13 | i.e. type_indicator, index_indicator, rtrie_indicator, compression_indicator, 14 | and boolean. This matters when you want to store the query result back to 15 | the database.
The specific indicators are as follows: 16 | 17 | ======================== ================== 18 | type_indicator Description 19 | ======================== ================== 20 | mdb.MDB_STR String 21 | mdb.MDB_INT_8/16/32/64 Integer 22 | mdb.MDB_UINT_8/16/32/64 Unsigned Integer 23 | ======================== ================== 24 | 25 | ===================== =========== 26 | compression_indicator Description 27 | ===================== =========== 28 | 0 Trie 29 | 1 String 30 | 2 LZ4 31 | 3 Binary 32 | ===================== =========== 33 | 34 | =============== ===================== 35 | rtrie_indicator Description 36 | =============== ===================== 37 | mdb.MDB_UINT_16 16 bit Trie 38 | mdb.MDB_UINT_32 32 bit Trie (default) 39 | =============== ===================== 40 | 41 | ======= ============ 42 | boolean Description 43 | ======= ============ 44 | True Boolean Type 45 | ======= ============ 46 | 47 | =============== =============== 48 | index_indicator Description 49 | =============== =============== 50 | 1 index (default) 51 | 2 wide index 52 | =============== =============== 53 | """ 54 | def __init__(self, 55 | type_indicator=None, 56 | index_indicator=None, 57 | compression_indicator=None, 58 | rtrie_indicator=None, 59 | boolean=None): 60 | self.type_indicator = type_indicator 61 | self.index_indicator = index_indicator 62 | self.compression_indicator = compression_indicator 63 | self.rtrie_indicator = rtrie_indicator 64 | self.boolean = boolean 65 | 66 | def __call__(self, fn): 67 | def wrap(column): 68 | index_indicator = self.index_indicator if self.index_indicator is \ 69 | not None else column.index_indicator 70 | type_indicator = self.type_indicator if self.type_indicator is \ 71 | not None else column.type_indicator 72 | rtrie_indicator = self.rtrie_indicator if self.rtrie_indicator is \ 73 | not None else column.rtrie_indicator 74 | compression_indicator = self.compression_indicator if \ 75 | self.compression_indicator is not None else column.compression_indicator 76 | is_boolean = self.boolean if self.boolean is not None else column.is_boolean 77 | 78 | new_column = Column(column.name, column.table, index_indicator, 79 | column.partition, type_indicator, compression_indicator, 80 | rtrie_indicator, alias=column.alias, boolean=is_boolean, 81 | column_fn=fn) 82 | return new_column 83 | return wrap 84 | 85 | 86 | # column functions defined as follows: 87 | 88 | @ColumnFn(type_indicator=mdb.MDB_STR) 89 | def ip_ntoa(val): 90 | import socket 91 | import struct 92 | try: 93 | ip = socket.inet_ntoa(struct.pack(">L", val)) 94 | except: 95 | ip = "0.0.0.0" 96 | return ip 97 | 98 | 99 | @ColumnFn(type_indicator=mdb.MDB_UINT_32) 100 | def ip_aton(val): 101 | import socket 102 | import struct 103 | try: 104 | ip = struct.unpack(">L", socket.inet_aton(val))[0] 105 | except: 106 | ip = 0 107 | return ip 108 | 109 |
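# Usage sketch: a column function simply wraps a column inside a query. The
# example below mirrors integration_test/test_column_fn.py ('ips' is a table
# with a uint32 'ip' column):
#
#   from hustle import select, Table
#   from hustle.core.column_fn import ip_ntoa
#   ips = Table.from_tag('ips')
#   select(ips.exchange_id, ip_ntoa(ips.ip), where=ips.exchange_id == 'Adx')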
110 | @ColumnFn(type_indicator=mdb.MDB_INT_16) 111 | def year(val): 112 | """ 113 | extract YEAR from "YYYY-MM-DD". Return -1 if parsing fails. 114 | """ 115 | try: 116 | year = int(val[:4]) 117 | except: 118 | year = -1 119 | return year 120 | 121 | 122 | @ColumnFn(type_indicator=mdb.MDB_INT_8) 123 | def month(val): 124 | """ 125 | extract MONTH from "YYYY-MM-DD". Return -1 if parsing fails. 126 | """ 127 | try: 128 | month = int(val[5:7]) 129 | except: 130 | month = -1 131 | return month 132 | 133 | 134 | @ColumnFn(type_indicator=mdb.MDB_INT_8) 135 | def day(val): 136 | """ 137 | extract DAY from "YYYY-MM-DD". Return -1 if parsing fails. 138 | """ 139 | try: 140 | day = int(val[-2:]) 141 | except: 142 | day = -1 143 | return day 144 | 145 | 146 | # the old school way to write a column function 147 | # def ip_ntoa(column): 148 | # "column function for converting an integer IPv4 to a string" 149 | # import mdb 150 | # if column.type_indicator != mdb.MDB_UINT_32: 151 | # raise TypeError("Specified column should be of the uint_32 type") 152 | # new_column = Column(column.name, column.table, column.index_indicator, 153 | # column.partition, mdb.MDB_STR, compression_indicator=0, 154 | # rtrie_indicator=column.rtrie_indicator, alias=column.alias, 155 | # boolean = False, column_fn=_ip_ntoa) 156 | # return new_column 157 | -------------------------------------------------------------------------------- /hustle/core/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from disco.ddfs import DDFS 3 | from disco.core import Disco 4 | 5 | 6 | def guess_settings(): 7 | for settings_file in (os.path.expanduser('~/.hustle'), 8 | '/etc/hustle/settings.yaml'): 9 | if os.path.exists(settings_file): 10 | return settings_file 11 | return '' 12 | 13 | 14 | defaults = { 15 | 'settings_file': guess_settings(), 16 | 'server': 'disco://localhost', 17 | 'nest': False, 18 | 'dump': False, 19 | 'worker_class': 'disco.worker.classic.worker.Worker', 20 | 'partition': 16, 21 | 'history_size': 1000 22 | } 23 | 24 | overrides = {} 25 | 26 | 27 | class Settings(dict): 28 | def __init__(self, *args, **kwargs): 29 | # load the defaults 30 | super(Settings, self).update(defaults) 31 | 32 | # override with the settings file 33 | path = kwargs.get('settings_file') or self['settings_file'] 34 | if path and os.path.exists(path): 35 | try: 36 | import yaml 37 | self.update(yaml.load(open(path))) 38 | except: 39 | pass # if ya can't ya can't 40 | 41 | # final overrides 42 | super(Settings, self).update(overrides) 43 | super(Settings, self).__init__(*args, **kwargs) 44 | 45 | # set up ddfs and disco 46 | if not self['server'].startswith('disco://'): 47 | self['server'] = 'disco://' + self['server'] 48 | 49 | if 'ddfs' not in self: 50 | self['ddfs'] = DDFS(self['server']) 51 | self['server'] = Disco(self['server']) 52 | 53 | # set up worker 54 | if 'worker' not in self: 55 | worker_mod, _, worker_class = self['worker_class'].rpartition('.') 56 | mod = __import__(worker_mod, {}, {}, worker_mod) 57 | self['worker'] = getattr(mod, worker_class)() 58 | -------------------------------------------------------------------------------- /hustle/core/stat.py: -------------------------------------------------------------------------------- 1 | from disco.core import Job 2 | from disco.worker.task_io import task_input_stream 3 | from hustle.core.pipeworker import Worker, HustleStage 4 | 5 | import hustle 6 | import hustle.core 7 | import hustle.core.marble 8 | 9 | 10 | def stat_input_stream(fd, size, url, params): 11 | from disco import util 12 | from hustle.core.marble import MarbleStream 13 | 14 | try: 15 | scheme, netloc, rest = util.urlsplit(url) 16 | except Exception as e: 17 | print "Error handling stat_input_stream for %s. 
%s" % (url, e) 18 | raise e 19 | 20 | otab = None 21 | try: 22 | # print "FLurlG: %s" % url 23 | fle = util.localize(rest, disco_data=params._task.disco_data, 24 | ddfs_data=params._task.ddfs_data) 25 | # print "FLOGLE: %s" % fle 26 | otab = MarbleStream(fle) 27 | rows = otab.number_rows 28 | frows = float(rows) 29 | rval = {'_': rows, } 30 | for field, (subdb, subindexdb, _, column, _) in otab.dbs.iteritems(): 31 | if subindexdb: 32 | rval[field] = subindexdb.stat(otab.txn)['ms_entries'] / frows 33 | yield '', rval 34 | except Exception as e: 35 | print "Gibbers: %s" % e 36 | raise e 37 | finally: 38 | if otab: 39 | otab.close() 40 | 41 | 42 | class StatPipe(Job): 43 | required_modules = [ 44 | ('hustle', hustle.__file__), 45 | ('hustle.core', hustle.core.__file__), 46 | ('hustle.core.marble', hustle.core.marble.__file__)] 47 | 48 | def __init__(self, master): 49 | 50 | super(StatPipe, self).__init__(master=master, worker=Worker()) 51 | self.pipeline = [('split', 52 | HustleStage('stat', 53 | process=process_stat, 54 | input_chain=[task_input_stream, 55 | stat_input_stream]))] 56 | 57 | 58 | def process_stat(interface, state, label, inp, task): 59 | from disco import util 60 | 61 | # inp contains a set of replicas, let's force local #HACK 62 | input_processed = False 63 | for i, inp_url in inp.input.replicas: 64 | scheme, (netloc, port), rest = util.urlsplit(inp_url) 65 | if netloc == task.host: 66 | input_processed = True 67 | inp.input = inp_url 68 | break 69 | 70 | if not input_processed: 71 | raise Exception("Input %s not processed, no LOCAL resource found." 72 | % str(inp.input)) 73 | 74 | for key, value in inp: 75 | interface.output(0).add(key, value) 76 | -------------------------------------------------------------------------------- /hustle/core/util.py: -------------------------------------------------------------------------------- 1 | from disco import func 2 | from disco import util 3 | from disco.settings import DiscoSettings 4 | 5 | import collections 6 | 7 | 8 | class Peekable(object): 9 | def __init__(self, iterable): 10 | self._iterable = iter(iterable) 11 | self._cache = collections.deque() 12 | 13 | def __iter__(self): 14 | return self 15 | 16 | def _fillcache(self, n): 17 | if n is None: 18 | n = 1 19 | while len(self._cache) < n: 20 | self._cache.append(self._iterable.next()) 21 | 22 | def next(self, n=None): 23 | self._fillcache(n) 24 | if n is None: 25 | result = self._cache.popleft() 26 | else: 27 | result = [self._cache.popleft() for i in range(n)] 28 | return result 29 | 30 | def peek(self, n=None): 31 | self._fillcache(n) 32 | if n is None: 33 | result = self._cache[0] 34 | else: 35 | result = [self._cache[i] for i in range(n)] 36 | return result 37 | 38 | 39 | class SortedIterator(object): 40 | 41 | def __init__(self, inputs): 42 | ins = [Peekable(input) for input in inputs] 43 | self.collection = sorted(ins, key=self._key) 44 | 45 | def __iter__(self): 46 | return self 47 | 48 | def next(self): 49 | removes = [] 50 | reinsert = None 51 | rval = None 52 | for stream in self.collection: 53 | try: 54 | rval = stream.next() 55 | reinsert = stream 56 | break 57 | except StopIteration: 58 | removes.append(stream) 59 | 60 | if rval: 61 | for remove in removes: 62 | self.collection.remove(remove) 63 | if reinsert: 64 | self.collection.remove(reinsert) 65 | try: 66 | reinsert.peek() 67 | except: 68 | pass 69 | else: 70 | removes = [] 71 | reinsert_index = 0 72 | for stream in self.collection: 73 | try: 74 | stream.peek() 75 | if self._key(reinsert) < 
self._key(stream): 76 | break 77 | except: 78 | removes.append(stream) 79 | reinsert_index += 1 80 | self.collection.insert(reinsert_index, reinsert) 81 | for remove in removes: 82 | self.collection.remove(remove) 83 | return rval 84 | raise StopIteration 85 | 86 | def _key(self, stream): 87 | try: 88 | key, value = stream.peek() 89 | return tuple(key) 90 | except StopIteration: 91 | return tuple() 92 | 93 | 94 | def sorted_iterator(urls, 95 | reader=func.chain_reader, 96 | input_stream=(func.map_input_stream,), 97 | notifier=func.notifier, 98 | params=None, 99 | ddfs=None): 100 | 101 | from disco.worker import Input 102 | from disco.worker.classic.worker import Worker 103 | 104 | worker = Worker(map_reader=reader, map_input_stream=input_stream) 105 | settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings() 106 | 107 | inputs = [] 108 | for input in util.inputlist(urls, settings=settings): 109 | notifier(input) 110 | instream = Input(input, open=worker.opener('map', 'in', params)) 111 | if instream: 112 | inputs.append(instream) 113 | 114 | return SortedIterator(inputs) 115 | 116 | 117 | def ensure_list(val): 118 | if not isinstance(val, list): 119 | if isinstance(val, (tuple, set)): 120 | return list(val) 121 | return [val] 122 | return val 123 | -------------------------------------------------------------------------------- /integration_test/README: -------------------------------------------------------------------------------- 1 | Make sure you have Disco (0.5 or later) running locally. 2 | 3 | Make sure you've installed the ../deps correctly. 4 | 5 | Make sure you run setup.py to create all of the tables the tests will need. 6 | 7 | Run the test from within this directory using nose. 8 | 9 | Good luck, and please let us know if you are having problems or have feedback! 
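A typical session, assuming Disco is running at disco://localhost:

    cd integration_test
    python setup.py
    nosetests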
10 | 11 | tspurway AT gmail D0T com 12 | 13 | -------------------------------------------------------------------------------- /integration_test/fixtures/ip.json: -------------------------------------------------------------------------------- 1 | {"ip":3221291265,"exchange_id":"OpenX"} 2 | {"ip":2130706433,"exchange_id":"OpenX"} 3 | {"ip":3232235777,"exchange_id":"Rubycon"} 4 | {"ip":3232235777,"exchange_id":"Appnexus"} 5 | {"ip":3221291265,"exchange_id":"OpenX"} 6 | {"ip":3221291266,"exchange_id":"Rubycon"} 7 | {"ip":3232235777,"exchange_id":"OpenX"} 8 | {"ip":3221291266,"exchange_id":"Appnexus"} 9 | {"ip":3221291265,"exchange_id":"OpenX"} 10 | {"ip":3232235777,"exchange_id":"OpenX"} 11 | {"ip":3221291265,"exchange_id":"Appnexus"} 12 | {"ip":2130706433,"exchange_id":"Rubycon"} 13 | {"ip":2130706433,"exchange_id":"OpenX"} 14 | {"ip":3221291266,"exchange_id":"Rubycon"} 15 | {"ip":2130706433,"exchange_id":"Appnexus"} 16 | {"ip":3232235777,"exchange_id":"Rubycon"} 17 | {"ip":3232235777,"exchange_id":"Rubycon"} 18 | {"ip":2130706433,"exchange_id":"Adx"} 19 | {"ip":3232235777,"exchange_id":"OpenX"} 20 | {"ip":2130706433,"exchange_id":"Adx"} 21 | {"ip":2130706433,"exchange_id":"Adx"} 22 | {"ip":3232235777,"exchange_id":"Appnexus"} 23 | {"ip":3232235777,"exchange_id":"OpenX"} 24 | {"ip":2130706433,"exchange_id":"OpenX"} 25 | {"ip":2130706433,"exchange_id":"Adx"} 26 | {"ip":3232235777,"exchange_id":"Adx"} 27 | {"ip":3221291265,"exchange_id":"Appnexus"} 28 | {"ip":3232235777,"exchange_id":"OpenX"} 29 | {"ip":3232235777,"exchange_id":"Appnexus"} 30 | {"ip":3221291266,"exchange_id":"Rubycon"} 31 | {"ip":3221291265,"exchange_id":"OpenX"} 32 | {"ip":3221291266,"exchange_id":"Appnexus"} 33 | {"ip":3221291265,"exchange_id":"Rubycon"} 34 | {"ip":3221291265,"exchange_id":"Adx"} 35 | {"ip":3221291266,"exchange_id":"Adx"} 36 | {"ip":3221291265,"exchange_id":"OpenX"} 37 | {"ip":3221291265,"exchange_id":"Appnexus"} 38 | {"ip":3221291266,"exchange_id":"OpenX"} 39 | {"ip":3221291265,"exchange_id":"OpenX"} 40 | {"ip":3221291266,"exchange_id":"Adx"} 41 | {"ip":3221291266,"exchange_id":"Appnexus"} 42 | {"ip":3221291265,"exchange_id":"OpenX"} 43 | {"ip":3221291266,"exchange_id":"OpenX"} 44 | {"ip":3232235777,"exchange_id":"Appnexus"} 45 | {"ip":3232235777,"exchange_id":"Rubycon"} 46 | {"ip":2130706433,"exchange_id":"Adx"} 47 | {"ip":3232235777,"exchange_id":"Adx"} 48 | {"ip":2130706433,"exchange_id":"OpenX"} 49 | {"ip":2130706433,"exchange_id":"Appnexus"} 50 | {"ip":3232235777,"exchange_id":"Adx"} 51 | {"ip":3221291265,"exchange_id":"Appnexus"} 52 | {"ip":2130706433,"exchange_id":"Appnexus"} 53 | {"ip":3232235777,"exchange_id":"Rubycon"} 54 | {"ip":3221291265,"exchange_id":"Appnexus"} 55 | {"ip":3232235777,"exchange_id":"Adx"} 56 | {"ip":3221291266,"exchange_id":"OpenX"} 57 | {"ip":3221291265,"exchange_id":"Rubycon"} 58 | {"ip":2130706433,"exchange_id":"OpenX"} 59 | {"ip":3232235777,"exchange_id":"Adx"} 60 | {"ip":3232235777,"exchange_id":"Rubycon"} 61 | {"ip":2130706433,"exchange_id":"OpenX"} 62 | {"ip":3221291266,"exchange_id":"Rubycon"} 63 | {"ip":2130706433,"exchange_id":"Appnexus"} 64 | {"ip":3221291265,"exchange_id":"Adx"} 65 | {"ip":3232235777,"exchange_id":"Adx"} 66 | {"ip":3221291265,"exchange_id":"OpenX"} 67 | {"ip":3221291265,"exchange_id":"Appnexus"} 68 | {"ip":3221291265,"exchange_id":"Appnexus"} 69 | {"ip":3221291265,"exchange_id":"Adx"} 70 | {"ip":2130706433,"exchange_id":"Adx"} 71 | {"ip":3221291266,"exchange_id":"Appnexus"} 72 | {"ip":3221291265,"exchange_id":"Appnexus"} 73 | 
{"ip":3232235777,"exchange_id":"OpenX"} 74 | {"ip":3221291266,"exchange_id":"OpenX"} 75 | {"ip":3221291266,"exchange_id":"Rubycon"} 76 | {"ip":3221291265,"exchange_id":"OpenX"} 77 | {"ip":3221291266,"exchange_id":"Adx"} 78 | {"ip":3221291266,"exchange_id":"OpenX"} 79 | {"ip":3221291265,"exchange_id":"Adx"} 80 | {"ip":3221291265,"exchange_id":"Adx"} 81 | {"ip":2130706433,"exchange_id":"Appnexus"} 82 | {"ip":2130706433,"exchange_id":"OpenX"} 83 | {"ip":3232235777,"exchange_id":"Adx"} 84 | {"ip":3221291266,"exchange_id":"Adx"} 85 | {"ip":3221291266,"exchange_id":"Adx"} 86 | {"ip":2130706433,"exchange_id":"Rubycon"} 87 | {"ip":2130706433,"exchange_id":"Appnexus"} 88 | {"ip":3221291265,"exchange_id":"OpenX"} 89 | {"ip":3221291265,"exchange_id":"Adx"} 90 | {"ip":2130706433,"exchange_id":"Appnexus"} 91 | {"ip":3232235777,"exchange_id":"Rubycon"} 92 | {"ip":3232235777,"exchange_id":"Rubycon"} 93 | {"ip":3221291266,"exchange_id":"Appnexus"} 94 | {"ip":3232235777,"exchange_id":"Rubycon"} 95 | {"ip":3221291265,"exchange_id":"Adx"} 96 | {"ip":2130706433,"exchange_id":"OpenX"} 97 | {"ip":3221291266,"exchange_id":"Adx"} 98 | {"ip":3221291266,"exchange_id":"Adx"} 99 | {"ip":3221291266,"exchange_id":"Adx"} 100 | {"ip":3232235777,"exchange_id":"Adx"} 101 | -------------------------------------------------------------------------------- /integration_test/setup.py: -------------------------------------------------------------------------------- 1 | from hustle import Table, insert 2 | from hustle.core.settings import Settings, overrides 3 | import ujson 4 | 5 | 6 | IMPS = '__test_imps' 7 | PIXELS = '__test_pixels' 8 | PIXELS_HLL = '__test_pixels_hll' 9 | IPS = '__test_ips' 10 | 11 | 12 | def imp_process(data): 13 | from disco.util import urlsplit 14 | 15 | _, (host, _), _ = urlsplit(data['url']) 16 | if host.startswith('www.'): 17 | host = host[4:] 18 | data['site_id'] = host 19 | 20 | 21 | def insert_hll(table, file=None, streams=None, preprocess=None, 22 | maxsize=100 * 1024 * 1024, tmpdir='/tmp', decoder=ujson.decode, 23 | lru_size=10000, hll_field=None, **kwargs): 24 | from cardunion import Cardunion 25 | import os 26 | 27 | settings = Settings(**kwargs) 28 | ddfs = settings['ddfs'] 29 | 30 | def part_tag(name, partition=None): 31 | rval = "hustle:" + name 32 | if partition: 33 | rval += ':' + str(partition) 34 | return rval 35 | 36 | def hll_iter(strms): 37 | buf = {} 38 | fields = table._field_names 39 | fields.remove('hll') 40 | # fields.remove('maxhash') 41 | 42 | for stream in strms: 43 | for line in stream: 44 | try: 45 | data = decoder(line) 46 | except Exception as e: 47 | print "Exception decoding record (skipping): %s %s" % (e, line) 48 | else: 49 | if preprocess: 50 | if not preprocess(data): 51 | continue 52 | key = ujson.dumps([data[f] for f in fields]) 53 | if key not in buf: 54 | hll = Cardunion(12) 55 | buf[key] = hll 56 | else: 57 | hll = buf[key] 58 | 59 | hll.add(data[hll_field]) 60 | 61 | for key, hll in buf.iteritems(): 62 | data = dict(zip(fields, ujson.loads(key))) 63 | data['hll'] = hll.dumps() 64 | yield data 65 | 66 | if file: 67 | streams = [open(file)] 68 | lines, partition_files = table._insert([hll_iter(streams)], 69 | maxsize=maxsize, tmpdir=tmpdir, 70 | decoder=lambda x: x, lru_size=lru_size) 71 | if partition_files is not None: 72 | for part, pfile in partition_files.iteritems(): 73 | tag = part_tag(table._name, part) 74 | ddfs.push(tag, [pfile]) 75 | print 'pushed %s, %s' % (part, tag) 76 | os.unlink(pfile) 77 | return table._name, lines 78 | 79 | 80 | def 
ensure_tables(): 81 | overrides['server'] = 'disco://localhost' 82 | overrides['dump'] = False 83 | overrides['nest'] = False 84 | settings = Settings() 85 | ddfs = settings['ddfs'] 86 | 87 | imps = Table.create(IMPS, 88 | columns=['wide index string token', 'trie url', 'index trie site_id', 'uint cpm_millis', 89 | 'index int ad_id', 'index string date', 'index uint time', 'bit click', 90 | 'index bit impression', 'bit conversion'], 91 | partition='date', 92 | force=True) 93 | pixels = Table.create(PIXELS, 94 | columns=['wide index string token', 'index bit isActive', 'index trie site_id', 95 | 'uint amount', 'index int account_id', 'index trie city', 'index trie16 state', 96 | 'index int16 metro', 'string ip', 'lz4 keyword', 'index string date'], 97 | partition='date', 98 | force=True) 99 | pixel_hlls = Table.create(PIXELS_HLL, 100 | columns=['index bit isActive', 'index trie site_id', 'index int account_id', 101 | 'index trie city', 'index trie16 state', 'index string date', 102 | 'binary hll'], 103 | partition='date', 104 | force=True) 105 | ips = Table.create(IPS, 106 | columns=['index trie16 exchange_id', 'index uint32 ip'], 107 | force=True) 108 | 109 | tags = ddfs.list("hustle:%s:" % IMPS) 110 | if len(tags) == 0: 111 | # insert the files 112 | insert(imps, File='fixtures/imps.json', preprocess=imp_process) 113 | 114 | tags = ddfs.list("hustle:%s:" % PIXELS) 115 | if len(tags) == 0: 116 | # insert the files 117 | insert(pixels, File='fixtures/pixel.json') 118 | 119 | tags = ddfs.list("hustle:%s:" % IPS) 120 | if len(tags) == 0: 121 | # insert the files 122 | insert(ips, File='fixtures/ip.json') 123 | 124 | tags = ddfs.list("hustle:%s:" % PIXELS_HLL) 125 | if len(tags) == 0: 126 | # insert the files 127 | insert_hll(pixel_hlls, file='./fixtures/pixel.json', hll_field='token') 128 | 129 | 130 | if __name__ == '__main__': 131 | ensure_tables() 132 | -------------------------------------------------------------------------------- /integration_test/test_aggregation.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import select, Table, h_sum, h_count, star 3 | from setup import IMPS 4 | from hustle.core.settings import Settings, overrides 5 | 6 | 7 | class TestAggregation(unittest.TestCase): 8 | def setUp(self): 9 | overrides['server'] = 'disco://localhost' 10 | overrides['dump'] = False 11 | overrides['nest'] = False 12 | self.settings = Settings() 13 | 14 | def tearDown(self): 15 | pass 16 | 17 | def test_count(self): 18 | imps = Table.from_tag(IMPS) 19 | res = select(h_count(), where=imps) 20 | count = list(res)[0][0] 21 | res.purge() 22 | self.assertEqual(count, 200) 23 | 24 | def test_simple_aggregation(self): 25 | imps = Table.from_tag(IMPS) 26 | results = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27') 27 | 28 | sum_millis = {} 29 | for ad_id, millis in results: 30 | if ad_id not in sum_millis: 31 | sum_millis[ad_id] = [0, 0] 32 | sum_millis[ad_id][0] += millis 33 | sum_millis[ad_id][1] += 1 34 | results.purge() 35 | 36 | results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27') 37 | self.assertGreater(len(list(results)), 0) 38 | for ad_id, millis, count in results: 39 | ad_tup = sum_millis[ad_id] 40 | self.assertEqual(millis, ad_tup[0]) 41 | self.assertEqual(count, ad_tup[1]) 42 | results.purge() 43 | 44 | def test_ordered_aggregation(self): 45 | imps = Table.from_tag(IMPS) 46 | resx = select(imps.ad_id, imps.cpm_millis, where=imps.date == 
'2014-01-27') 47 | 48 | sum_millis = {} 49 | for ad_id, millis in resx: 50 | if ad_id not in sum_millis: 51 | sum_millis[ad_id] = [0, 0] 52 | sum_millis[ad_id][0] += millis 53 | sum_millis[ad_id][1] += 1 54 | 55 | results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), 56 | where=imps.date == '2014-01-27', 57 | order_by=2, 58 | limit=3, 59 | nest=True) 60 | self.assertGreater(len(list(results)), 0) 61 | lowest = 0 62 | for ad_id, millis, count in results: 63 | self.assertLessEqual(lowest, count) 64 | lowest = count 65 | ad_tup = sum_millis[ad_id] 66 | self.assertEqual(millis, ad_tup[0]) 67 | self.assertEqual(count, ad_tup[1]) 68 | self.assertEqual(len(list(results)), min(len(sum_millis), 3)) 69 | 70 | resx.purge() 71 | 72 | def test_multiple_group_bys(self): 73 | imps = Table.from_tag(IMPS) 74 | results = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22') 75 | 76 | sum_millis = {} 77 | for ad_id, dt, millis in results: 78 | key = str(ad_id) + dt 79 | if key not in sum_millis: 80 | sum_millis[key] = [0, 0] 81 | sum_millis[key][0] += millis 82 | sum_millis[key][1] += 1 83 | results.purge() 84 | 85 | results = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(), where=imps.date > '2014-01-22') 86 | self.assertGreater(len(list(results)), 0) 87 | for ad_id, dt, millis, count in results: 88 | ad_tup = sum_millis[str(ad_id) + dt] 89 | self.assertEqual(millis, ad_tup[0]) 90 | self.assertEqual(count, ad_tup[1]) 91 | results.purge() 92 | 93 | def test_nested_agg(self): 94 | imps = Table.from_tag(IMPS) 95 | results = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22') 96 | 97 | sum_millis = {} 98 | for ad_id, dt, millis in results: 99 | key = str(ad_id) + dt 100 | if key not in sum_millis: 101 | sum_millis[key] = [0, 0] 102 | sum_millis[key][0] += millis 103 | sum_millis[key][1] += 1 104 | results.purge() 105 | 106 | newtab = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(), 107 | where=imps.date > '2014-01-22', 108 | nest=True) 109 | results = select(*star(newtab), where=newtab) 110 | self.assertGreater(len(list(results)), 0) 111 | for ad_id, dt, millis, count in results: 112 | ad_tup = sum_millis[str(ad_id) + dt] 113 | self.assertEqual(millis, ad_tup[0]) 114 | self.assertEqual(count, ad_tup[1]) 115 | results.purge() 116 | 117 | def test_overflow(self): 118 | from itertools import izip 119 | 120 | imps = Table.from_tag(IMPS) 121 | fly_results = select(imps.date, h_sum(imps.impression), where=imps, order_by=imps.date) 122 | 123 | nest_tab = select(imps.date, h_sum(imps.impression), where=imps, nest=True) 124 | nest_results = select(*star(nest_tab), where=nest_tab, order_by=0) 125 | 126 | for ((fdate, fimps), (ndate, nimps)) in izip(fly_results, nest_results): 127 | self.assertEqual(fdate, ndate) 128 | self.assertEqual(fimps, nimps) 129 | nest_results.purge() 130 | 131 | -------------------------------------------------------------------------------- /integration_test/test_bool.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import select, Table, h_sum, h_count 3 | from setup import IMPS, PIXELS 4 | from hustle.core.settings import Settings, overrides 5 | 6 | 7 | class TestBool(unittest.TestCase): 8 | def setUp(self): 9 | overrides['server'] = 'disco://localhost' 10 | overrides['dump'] = False 11 | overrides['nest'] = False 12 | self.settings = Settings() 13 | 14 | def tearDown(self): 15 | pass 16 | 17 | def test_project(self): 18 | imps = 
Table.from_tag(IMPS) 19 | res = select(imps.click, imps.conversion, imps.impression, where=imps) 20 | clicks = conversions = impressions = 0 21 | for (click, conv, imp) in res: 22 | clicks += click 23 | conversions += conv 24 | impressions += imp 25 | 26 | self.assertEqual(clicks, 21) 27 | self.assertEqual(conversions, 5) 28 | self.assertEqual(impressions, 174) 29 | res.purge() 30 | 31 | def test_aggregate(self): 32 | imps = Table.from_tag(IMPS) 33 | res = select(h_sum(imps.click), h_sum(imps.conversion), h_sum(imps.impression), where=imps) 34 | 35 | (clicks, conversions, impressions) = list(res)[0] 36 | 37 | self.assertEqual(clicks, 21) 38 | self.assertEqual(conversions, 5) 39 | self.assertEqual(impressions, 174) 40 | res.purge() 41 | 42 | def test_bool_values(self): 43 | pix = Table.from_tag(PIXELS) 44 | res = select(pix.isActive, where=pix.isActive == True) 45 | actives = 0 46 | for (act, ) in res: 47 | actives += act 48 | 49 | self.assertEqual(actives, 234) 50 | res.purge() 51 | 52 | res = select(pix.isActive, where=pix.isActive == 0) 53 | actives = 0 54 | for (act, ) in res: 55 | actives += 1 56 | 57 | self.assertEqual(actives, 266) 58 | res.purge() 59 | 60 | def test_bit_values(self): 61 | pix = Table.from_tag(PIXELS) 62 | res = select(pix.isActive, where=pix.isActive == 1) 63 | actives = 0 64 | for (act, ) in res: 65 | actives += act 66 | 67 | self.assertEqual(actives, 234) 68 | res.purge() 69 | -------------------------------------------------------------------------------- /integration_test/test_cardinality.py: -------------------------------------------------------------------------------- 1 | from hustle import select, Table 2 | from setup import PIXELS_HLL 3 | from hustle.core.settings import Settings, overrides 4 | from hustle.cardinality import h_cardinality as h_hll 5 | 6 | from collections import defaultdict 7 | from operator import itemgetter 8 | 9 | import unittest 10 | import ujson 11 | 12 | 13 | HLL_ESTIMATE_ERROR = .04 14 | 15 | 16 | class TestCardinalityQuery(unittest.TestCase): 17 | def setUp(self): 18 | overrides['server'] = 'disco://localhost' 19 | overrides['dump'] = False 20 | overrides['nest'] = False 21 | self.settings = Settings() 22 | 23 | def tearDown(self): 24 | pass 25 | 26 | def checkEstimate(self, estimate, expect): 27 | self.assertAlmostEqual(estimate, expect, 28 | delta=int(HLL_ESTIMATE_ERROR * expect)) 29 | 30 | def test_cardinality_all(self): 31 | hll = Table.from_tag(PIXELS_HLL) 32 | res = select(h_hll(hll.hll), where=hll) 33 | estimate = next(iter(res))[0] 34 | tokens = set([]) 35 | with open("./fixtures/pixel.json") as f: 36 | for line in f: 37 | record = ujson.loads(line) 38 | tokens.add(record["token"]) 39 | self.checkEstimate(estimate, len(tokens)) 40 | res.purge() 41 | 42 | def test_cardinality_on_condition(self): 43 | hll = Table.from_tag(PIXELS_HLL) 44 | active_tokens = set([]) 45 | inactive_tokens = set([]) 46 | with open("./fixtures/pixel.json") as f: 47 | for line in f: 48 | record = ujson.loads(line) 49 | if record["isActive"]: 50 | active_tokens.add(record["token"]) 51 | else: 52 | inactive_tokens.add(record["token"]) 53 | res = select(h_hll(hll.hll), where=(hll.isActive == 1)) 54 | estimate = next(iter(res))[0] 55 | self.checkEstimate(estimate, len(active_tokens)) 56 | res.purge() 57 | 58 | res = select(h_hll(hll.hll), where=(hll.isActive == 0)) 59 | estimate = next(iter(res))[0] 60 | self.checkEstimate(estimate, len(inactive_tokens)) 61 | res.purge() 62 | 63 | def test_cardinality_with_order_by(self): 64 | hll = 
Table.from_tag(PIXELS_HLL) 65 | tokens_by_date = defaultdict(set) 66 | with open("./fixtures/pixel.json") as f: 67 | for line in f: 68 | record = ujson.loads(line) 69 | tokens_by_date[record["date"]].add(record["token"]) 70 | result = [(date, len(tokens)) for date, tokens in tokens_by_date.items()] 71 | 72 | # Test order by date 73 | expects = sorted(result, key=itemgetter(0), reverse=True) 74 | res = select(hll.date, h_hll(hll.hll), where=hll, order_by=0, desc=True) 75 | estimates = list(res) 76 | for i, (date, expected_cardinality) in enumerate(expects): 77 | self.assertEqual(estimates[i][0], date) 78 | self.checkEstimate(estimates[i][1], expected_cardinality) 79 | res.purge() 80 | 81 | # Test order by hll 82 | res = select(hll.date, h_hll(hll.hll), where=hll, order_by=1, desc=True) 83 | l = list(res) 84 | for i in range(len(l) - 1): 85 | self.assertTrue(l[i][1] >= l[i + 1][1]) 86 | res.purge() 87 | -------------------------------------------------------------------------------- /integration_test/test_column_fn.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import select, Table, h_max, h_min 3 | from hustle.core.column_fn import ip_ntoa 4 | from setup import IPS 5 | from hustle.core.settings import Settings, overrides 6 | 7 | 8 | class TestSimpleQuery(unittest.TestCase): 9 | def setUp(self): 10 | overrides['server'] = 'disco://localhost' 11 | overrides['dump'] = False 12 | overrides['nest'] = False 13 | self.settings = Settings() 14 | 15 | def tearDown(self): 16 | pass 17 | 18 | def test_column_fn(self): 19 | ips = Table.from_tag(IPS) 20 | res = select(ips.exchange_id, ip_ntoa(ips.ip), 21 | where=ips.exchange_id == "Adx") 22 | results = list(res) 23 | self.assertEqual(len(results), 29) 24 | res.purge() 25 | 26 | def test_column_fn_with_agg(self): 27 | ips = Table.from_tag(IPS) 28 | res = select(ips.exchange_id, h_max(ip_ntoa(ips.ip)), 29 | where=ips, order_by=(ips.exchange_id,)) 30 | results = list(res) 31 | res.purge() 32 | exchanges = [ex for ex, _ in results] 33 | ipss = [ip for _, ip in results] 34 | self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'], exchanges) 35 | self.assertListEqual(['192.168.1.1'] * 4, ipss) 36 | 37 | res = select(ips.exchange_id, h_min(ip_ntoa(ips.ip)), 38 | where=ips, order_by=(ips.exchange_id,)) 39 | results = list(res) 40 | res.purge() 41 | exchanges = [ex for ex, _ in results] 42 | ipss = [ip for _, ip in results] 43 | self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'], exchanges) 44 | self.assertListEqual(['127.0.0.1'] * 4, ipss) 45 | 46 | def test_column_fn_with_distinct(self): 47 | ips = Table.from_tag(IPS) 48 | res = select(ip_ntoa(ips.ip), 49 | where=ips.exchange_id == "Adx", order_by=(ip_ntoa(ips.ip),), 50 | distinct=True) 51 | results = list(res) 52 | res.purge() 53 | ipss = [ip[0] for ip in results] 54 | self.assertListEqual(['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'], 55 | ipss) 56 | 57 | def test_column_fn_with_nest(self): 58 | ips = Table.from_tag(IPS) 59 | res = select(ip_ntoa(ips.ip), 60 | where=ips.exchange_id == "Adx", order_by=(ip_ntoa(ips.ip),), 61 | distinct=True, nest=True) 62 | ret = select(res.ip, where=res, order_by=(res.ip,)) 63 | results = list(ret) 64 | ret.purge() 65 | ipss = [ip[0] for ip in results] 66 | self.assertListEqual(['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'], 67 | ipss) 68 | -------------------------------------------------------------------------------- /integration_test/test_drop.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import Table, insert, drop, delete, get_partitions 3 | from hustle.core.settings import Settings, overrides 4 | 5 | IMPS = '__test_drop_imps' 6 | 7 | 8 | def imp_process(data): 9 | from disco.util import urlsplit 10 | 11 | _, (host, _), _ = urlsplit(data['url']) 12 | if host.startswith('www.'): 13 | host = host[4:] 14 | data['site_id'] = host 15 | 16 | 17 | def ensure_tables(): 18 | overrides['server'] = 'disco://localhost' 19 | overrides['dump'] = False 20 | overrides['nest'] = False 21 | settings = Settings() 22 | ddfs = settings['ddfs'] 23 | 24 | imps = Table.create(IMPS, 25 | fields=['=$token', '%url', '+%site_id', '@cpm_millis', '+#ad_id', '+$date', '+@time'], 26 | partition='date', 27 | force=True) 28 | 29 | tags = ddfs.list("hustle:%s:" % IMPS) 30 | if len(tags) == 0: 31 | # insert the files 32 | insert(imps, File='fixtures/imps.json', preprocess=imp_process) 33 | return imps 34 | 35 | 36 | class TestDropTable(unittest.TestCase): 37 | def setUp(self): 38 | overrides['server'] = 'disco://localhost' 39 | overrides['dump'] = False 40 | overrides['nest'] = False 41 | self.settings = Settings() 42 | self.ddfs = self.settings['ddfs'] 43 | self.table = ensure_tables() 44 | 45 | def test_delete_all(self): 46 | delete(self.table) 47 | self.assertEqual([], get_partitions(self.table)) 48 | tags = self.ddfs.list(Table.base_tag(self.table._name)) 49 | self.assertEqual(len(tags), 1) 50 | self.assertEqual(tags[0], "hustle:__test_drop_imps") 51 | 52 | def test_delete_partial(self): 53 | delete(self.table.date >= '2014-01-13') 54 | self.assertEqual(['hustle:__test_drop_imps:2014-01-10', 55 | 'hustle:__test_drop_imps:2014-01-11', 56 | 'hustle:__test_drop_imps:2014-01-12'], 57 | get_partitions(self.table)) 58 | tags = self.ddfs.list(Table.base_tag(self.table._name)) 59 | self.assertEqual(len(tags), 4) 60 | self.assertIn("hustle:__test_drop_imps", tags) 61 | drop(self.table) 62 | with self.assertRaises(ValueError): 63 | delete(self.table.site_id == 'foobar') 64 | delete(self.table.url) 65 | 66 | def test_drop(self): 67 | drop(self.table) 68 | self.assertEqual([], get_partitions(self.table)) 69 | tags = self.ddfs.list(Table.base_tag(self.table._name)) 70 | self.assertEqual(len(tags), 0) 71 | -------------------------------------------------------------------------------- /integration_test/test_project_order.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import select, Table 3 | from setup import IMPS 4 | from hustle.core.settings import Settings, overrides 5 | 6 | 7 | class TestProjectOrder(unittest.TestCase): 8 | def setUp(self): 9 | overrides['server'] = 'disco://localhost' 10 | overrides['dump'] = False 11 | overrides['nest'] = False 12 | self.settings = Settings() 13 | 14 | def tearDown(self): 15 | pass 16 | 17 | def test_single_int_order(self): 18 | imps = Table.from_tag(IMPS) 19 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', order_by=imps.cpm_millis) 20 | lowest = 0 21 | for a, d, c in res: 22 | self.assertLessEqual(lowest, c) 23 | lowest = c 24 | res.purge() 25 | 26 | def test_combo_order(self): 27 | imps = Table.from_tag(IMPS) 28 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 29 | where=imps.date > '2014-01-21', 30 | order_by=(imps.date, imps.cpm_millis)) 31 | lowest_cpm = 0 32 | lowest_date = '2000-01-01' 33 | for a, d, c in res: 34 | if lowest_date == d: 35 | 
self.assertLessEqual(lowest_cpm, c) 36 | lowest_cpm = c 37 | else: 38 | self.assertLessEqual(lowest_date, d) 39 | lowest_date = d 40 | lowest_cpm = c 41 | res.purge() 42 | 43 | def test_combo_descending(self): 44 | imps = Table.from_tag(IMPS) 45 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 46 | where=imps.date > '2014-01-21', 47 | order_by=(imps.date, imps.cpm_millis), 48 | desc=True) 49 | highest_cpm = 1000000000 50 | highest_date = '2222-01-01' 51 | for a, d, c in res: 52 | if highest_date == d: 53 | self.assertGreaterEqual(highest_cpm, c) 54 | highest_cpm = c 55 | else: 56 | self.assertGreaterEqual(highest_date, d) 57 | highest_date = d 58 | highest_cpm = c 59 | res.purge() 60 | 61 | def test_high_limit(self): 62 | imps = Table.from_tag(IMPS) 63 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', limit=100) 64 | results = list(res) 65 | self.assertEqual(len(results), 10) 66 | res.purge() 67 | 68 | def test_low_limit(self): 69 | imps = Table.from_tag(IMPS) 70 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', limit=4) 71 | results = list(res) 72 | self.assertEqual(len(results), 4) 73 | res.purge() 74 | 75 | def test_distinct(self): 76 | imps = Table.from_tag(IMPS) 77 | res = select(imps.ad_id, imps.date, where=imps.date == '2014-01-27', distinct=True) 78 | results = list(res) 79 | self.assertEqual(len(results), 8) 80 | res.purge() 81 | 82 | def test_overall(self): 83 | imps = Table.from_tag(IMPS) 84 | res = select(imps.ad_id, imps.date, where=imps.date == '2014-01-27', distinct=True, limit=4, 85 | order_by='ad_id', desc=True) 86 | results = [a for a, d in res] 87 | self.assertEqual(len(results), 4) 88 | self.assertListEqual(results, [30019, 30018, 30017, 30015]) 89 | res.purge() 90 | -------------------------------------------------------------------------------- /integration_test/test_simple_query.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import select, Table 3 | from setup import IMPS 4 | from hustle.core.settings import Settings, overrides 5 | 6 | 7 | class TestSimpleQuery(unittest.TestCase): 8 | def setUp(self): 9 | overrides['server'] = 'disco://localhost' 10 | overrides['dump'] = False 11 | overrides['nest'] = False 12 | self.settings = Settings() 13 | 14 | def tearDown(self): 15 | pass 16 | 17 | def test_equality_on_partition(self): 18 | imps = Table.from_tag(IMPS) 19 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27') 20 | results = list(res) 21 | self.assertEqual(len(results), 10) 22 | found = next((a, d, c) for a, d, c in results if a == 30018 and d == '2014-01-27' and c == 4506) 23 | self.assertIsNotNone(found) 24 | self.assertTrue(all(d == '2014-01-27' for _, d, _ in results)) 25 | res.purge() 26 | 27 | def test_range_on_partition(self): 28 | imps = Table.from_tag(IMPS) 29 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-27') 30 | results = list(res) 31 | self.assertEqual(len(results), 20) 32 | self.assertTrue(all(d in ('2014-01-28', '2014-01-29') for _, d, _ in results)) 33 | res.purge() 34 | 35 | def test_combo_where_on_partition(self): 36 | imps = Table.from_tag(IMPS) 37 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 38 | where=((imps.date >= '2014-01-20') & (imps.ad_id == 30010))) 39 | results = list(res) 40 | self.assertEqual(len(results), 6) 41 | self.assertTrue(all(d >= '2014-01-20' and a == 30010 for a, d, _ in results)) 42 | 
res.purge() 43 | 44 | def test_combo_where_on_or_partition(self): 45 | imps = Table.from_tag(IMPS) 46 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 47 | where=((imps.date == '2014-01-21') | (imps.date == '2014-01-25') | (imps.ad_id == 30010))) 48 | results = list(res) 49 | self.assertEqual(len(results), 27) 50 | self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results)) 51 | res.purge() 52 | 53 | def test_combo_where_on_or_partition_ex(self): 54 | imps = Table.from_tag(IMPS) 55 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 56 | where=((imps.date << ['2014-01-21', '2014-01-25']) | (imps.ad_id == 30010))) 57 | results = list(res) 58 | self.assertEqual(len(results), 27) 59 | self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results)) 60 | res.purge() 61 | 62 | def test_combo_where_on_or_partition_ex1(self): 63 | imps = Table.from_tag(IMPS) 64 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 65 | where=((imps.date << ['2014-01-21', '2014-01-25']) | (imps.ad_id << [30003, 30010]))) 66 | results = list(res) 67 | self.assertEqual(len(results), 40) 68 | self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 or a == 30003 for a, d, _ in results)) 69 | res.purge() 70 | 71 | def test_combo_where_on_or_partition_ex2(self): 72 | imps = Table.from_tag(IMPS) 73 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 74 | where=((imps.date << ['2014-01-21', '2014-01-25']) & (imps.ad_id << [30003, 30010]))) 75 | results = list(res) 76 | self.assertEqual(len(results), 1) 77 | self.assertTrue(all(d == '2014-01-21' and a == 30010 for a, d, _ in results)) 78 | res.purge() 79 | 80 | def test_combo_where_on_and_partition(self): 81 | imps = Table.from_tag(IMPS) 82 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 83 | where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.ad_id == 30010))) 84 | results = list(res) 85 | self.assertEqual(len(results), 2) 86 | self.assertTrue(all(d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30010 for a, d, _ in results)) 87 | res.purge() 88 | 89 | def test_combo_where_no_partition(self): 90 | imps = Table.from_tag(IMPS) 91 | res = select(imps.ad_id, imps.date, imps.cpm_millis, where=(imps.time >= 180000)) 92 | results = list(res) 93 | print results 94 | self.assertEqual(len(results), 5) 95 | res.purge() 96 | 97 | def test_combo_where_on_mixed_partition(self): 98 | imps = Table.from_tag(IMPS) 99 | res = select(imps.ad_id, imps.date, imps.cpm_millis, 100 | where=(((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.time > 170000)))) 101 | results = list(res) 102 | self.assertEqual(len(results), 2) 103 | self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30003) for a, d, c in results)) 104 | res.purge() 105 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | nose==1.3.7 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ujson==1.35 2 | PyYAML==3.12 3 | Cython==0.27.3 4 | -------------------------------------------------------------------------------- /settings.yaml: -------------------------------------------------------------------------------- 1 | # disco server. 
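# Address of the Disco master that Hustle submits its jobs to.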
2 | server: disco://localhost 3 | 4 | # Default Hustle worker. There is usually no need to change it. 5 | worker_class: hustle.core.pipeworker.Worker 6 | 7 | # Print query results to the terminal. If this is disabled, you need to call the 8 | # cat() function yourself to get the results. 9 | dump: False 10 | 11 | # Save query results to a Hustle table. 12 | nest: False 13 | 14 | # Number of partitions for the Hustle pipeworker; defaults to 16 if unspecified. 15 | # This number is used when assigning labels to the output of each 16 | # stage. 17 | partition: 16 18 | 19 | # Command-history size for the Hustle shell. 20 | history_size: 1000 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from setuptools import setup, find_packages 3 | 4 | 5 | if sys.version_info[:2] < (2, 6): 6 | raise RuntimeError('Requires Python 2.6 or later') 7 | 8 | VERSION = '0.2.7' 9 | 10 | setup( 11 | name='hustle', 12 | version=VERSION, 13 | description='Hustle: a data warehouse system.', 14 | keywords='hustle', 15 | author='Chango Inc.', 16 | author_email='dev@chango.com', 17 | url='http://chango.com', 18 | license='MIT License', 19 | packages=find_packages(exclude=['test', 'deps', 'examples', 'inferno']), 20 | include_package_data=True, 21 | zip_safe=False, 22 | test_suite='nose.collector', 23 | requires=['disco', 'mdb']) 24 | -------------------------------------------------------------------------------- /test/fixtures/keys: -------------------------------------------------------------------------------- 1 | _eb_cat_115280 2 | 0cd10308162c5216f2476f462ba6d0c3 3 | hobbit house being demolished due to lac 4 | money in the bank streams 5 | iphone contacts deleted how to restore 6 | montreal database administrator oracle qc 7 | 878ed9857ee90262fea180c83d3e96c5 8 | ton fatberg removed from london sewer 9 | 463d67a89afdcbc4960b4e2d45ae8c24 10 | flip flop luggage tag set of 11 | iab_sports 12 | crassula vendre montreal 13 | _eb_cat_9394 14 | tvs electronics phones cordless phones 15 | _av_2162 16 | make it diy floor pillow made from thrifted blanket 17 | 879319cd0476782a235d42e4fcb80edf 18 | iphone rumor 19 | new homes condos 20 | __cat_adx_532 21 | _av_2334 22 | _av_2334 23 | heart of the swarm patch makes the game more spectacular 24 | college football predictions the hardest places to play in 25 | -------------------------------------------------------------------------------- /test/test_lru_dict.py: -------------------------------------------------------------------------------- 1 | import mdb 2 | import os 3 | import unittest 4 | from functools import partial 5 | from hustle.core.marble import mdb_evict, mdb_fetch 6 | from pylru import LRUDict 7 | from pylru import CharLRUDict, IntLRUDict 8 | from pyebset import BitSet 9 | 10 | class TestLRUDict(unittest.TestCase): 11 | def setUp(self): 12 | pass 13 | 14 | def tearDown(self): 15 | try: 16 | os.unlink('/tmp/lru_test') 17 | os.unlink('/tmp/lru_test-lock') 18 | except OSError: 19 | pass 20 | 21 | def test_lru(self): 22 | def get(db, txn, key): 23 | try: 24 | return db.get(txn, key) 25 | except Exception: 26 | return None 27 | 28 | env = mdb.Env('/tmp/lru_test', flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR) 29 | txn = env.begin_txn() 30 | ixdb = env.open_db(txn, 'ix', flags=mdb.MDB_CREATE) 31 | 32 | lru = LRUDict.getDict(5, 33 | partial(mdb_fetch, txn=txn, ixdb=ixdb), 34 | partial(mdb_evict, txn=txn, ixdb=ixdb)) 35 |
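# capacity is 5: a sixth distinct key evicts the least-recently-used entry through mdb_evict, and misses are pulled back in through mdb_fetch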
lru.set('hello', BitSet()) 37 | lru.set('goodbye', BitSet()) 38 | lru.set('amine', BitSet()) 39 | lru.set('solution', BitSet()) 40 | lru.set('lsd', BitSet()) 41 | self.assertEqual(len(lru._getContents()), 5) 42 | 43 | lru.set('proxy', BitSet()) 44 | store = lru._getContents() 45 | self.assertNotIn('hello', store) 46 | self.assertIsNotNone(get(ixdb, txn, 'hello')) 47 | self.assertEqual(len(store), 5) 48 | 49 | bitmap = lru['hello'] 50 | store = lru._getContents() 51 | self.assertIn('hello', store) 52 | self.assertEqual(len(store), 5) 53 | self.assertIsInstance(bitmap, BitSet) 54 | self.assertIsNone(lru.get('skibiddles')) 55 | 56 | # test eviction order 57 | self.assertIsNotNone(lru.get('goodbye')) # this should 'reset' goodbye so that it won't be evicted 58 | lru.set('whammy bar', BitSet()) # amine should be evicted 59 | store = lru._getContents() 60 | self.assertNotIn('amine', store) 61 | self.assertIn('goodbye', store) 62 | 63 | txn.commit() 64 | env.close() 65 | 66 | class LruTest(unittest.TestCase): 67 | def test_basic_char(self): 68 | mdict = {} 69 | 70 | def fetch(key): 71 | try: 72 | res = mdict[key] 73 | except KeyError: 74 | return None 75 | return res 76 | 77 | def evict(key, value): 78 | mdict[key] = value 79 | 80 | l = CharLRUDict(10, fetch, evict) 81 | 82 | a = 100000 83 | b = 200000 84 | 85 | for i in range(a, b): 86 | l.set(str(i * i), i * i) 87 | 88 | for i in range(b - 1, a, -1): 89 | v = l.get(str(i * i)) 90 | self.assertEqual(i * i, v) 91 | 92 | def test_basic_int(self): 93 | mdict = {} 94 | 95 | def fetch(key): 96 | try: 97 | res = mdict[key] 98 | except KeyError: 99 | return None 100 | return res 101 | 102 | def evict(key, value): 103 | mdict[key] = value 104 | 105 | l = IntLRUDict(10, fetch, evict) 106 | 107 | a = 100000 108 | b = 200000 109 | 110 | for i in range(a, b): 111 | l.set(i * i, i * i) 112 | 113 | for i in range(b - 1, a, -1): 114 | v = l.get(i * i) 115 | self.assertEqual(i * i, v) 116 | 117 | def test_no_eviction(self): 118 | def fetch(key): 119 | return None 120 | 121 | def evict(key, value): 122 | self.fail("Nothing should be evicted: " + str(key) + " " + str(value)) 123 | 124 | l = CharLRUDict(1, fetch, evict, list) 125 | s = l["10"] 126 | self.assertListEqual(s, []) 127 | -------------------------------------------------------------------------------- /test/test_merge_wrapper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle.core.pipeworker import merge_wrapper 3 | 4 | 5 | class TestMergeWrapper(unittest.TestCase): 6 | def test_up(self): 7 | a = [(('keya', 5), 22), (('lima', 9), 23), (('oebra', 21), 24), (('qeya', 5), 22), (('tima', 9), 23), (('zebra', 21), 24)] 8 | b = [(('aeya', 5), 22), (('fima', 12), 23), (('hebra', 8), 24), (('xya', 5), 22), (('yima', 12), 23), (('zzebra', 8), 24)] 9 | c = [(('beya', 5), 22), (('fliea', 9), 23), (('gray', 21), 24), (('morie', 5), 22), (('steel', 9), 23), (('yale', 21), 24)] 10 | d = [(('vera', 5), 22), (('wera', 12), 23), (('xera', 8), 24), (('yolanda', 5), 22), (('yolo', 12), 23), (('zanadu', 8), 24)] 11 | from heapq import merge 12 | res = merge(merge_wrapper(a), merge_wrapper(b), merge_wrapper(c), merge_wrapper(d)) 13 | lowest = 'aaaaaa' 14 | for k, v in res: 15 | self.assertTrue(lowest < k[0]) 16 | lowest = k[0] 17 | 18 | def test_down(self): 19 | a = [(('zebra', 21), 24), (('lima', 9), 23), (('keya', 5), 22)] 20 | b = [(('zzebra', 8), 24), (('sima', 12), 23), (('aeya', 5), 22)] 21 | from heapq import merge 22 | res = merge(merge_wrapper(a, desc=True),
merge_wrapper(b, desc=True)) 23 | highest = 'zzzzzzzzz' 24 | for k, v in res: 25 | self.assertTrue(highest > k[0]) 26 | highest = k[0] 27 | 28 | def test_nulls(self): 29 | a = [(('zebra', 21), 24), (('keya', 5), 22), ((None, 9), 23)] 30 | b = [(('zzebra', 8), 24), (('sima', 12), 23), (('aeya', 5), 22), ((None, 12), 18)] 31 | from heapq import merge 32 | res = merge(merge_wrapper(a, desc=True), merge_wrapper(b, desc=True)) 33 | highest = 'zzzzzzzzz' 34 | for k, v in res: 35 | print k, v 36 | self.assertTrue(highest >= k[0]) 37 | highest = k[0] 38 | 39 | def test_multi(self): 40 | a = [(('zebra', 12), 24), (('webra', 12), 24), (('bebra', 12), 24), (('aebra', 12), 24), (('zebra', 11), 24), (('keya', 5), 22), (('aeya', 5), 22), ] 41 | b = [(('sima', 12), 23), (('zzebra', 8), 28), (('yzebra', 8), 28), (('azebra', 8), 28), (('aeya', 5), 22)] 42 | from heapq import merge 43 | res = merge(merge_wrapper(a, sort_range=(1, 0), desc=True), merge_wrapper(b, sort_range=(1, 0), desc=True)) 44 | highest = 999999999 45 | highest_2nd = 'zzzzzzzz' 46 | same_count = 0 47 | for k, v in res: 48 | print "kev", k, v 49 | if highest == k[1]: 50 | self.assertTrue(highest_2nd >= k[0]) 51 | same_count += 1 52 | self.assertGreaterEqual(highest, k[1]) 53 | highest = k[1] 54 | highest_2nd = k[0] 55 | self.assertEqual(same_count, 8) 56 | 57 | def test_lopsided(self): 58 | a = [(('zebra', 21), 24)] 59 | b = [(('zzebra', 8), 24), (('sima', 12), 23), (('aeya', 5), 22)] 60 | from heapq import merge 61 | res = merge(merge_wrapper(a, desc=True), merge_wrapper(b, desc=True)) 62 | highest = 'zzzzzzzzz' 63 | for k, v in res: 64 | self.assertTrue(highest > k[0]) 65 | highest = k[0] 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /test/test_pipeworker.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle.core.pipeworker import sort_reader, disk_sort 3 | from StringIO import StringIO 4 | import os 5 | 6 | OUT_FILE = '/tmp/test_disk_sort' 7 | 8 | TEST_FILE = \ 9 | b"stuff\xff19\xffvalue1\x00" \ 10 | b"morestuff\xff29\xffvalue2\x00" \ 11 | b"reallylongkeyprobablylongerthanthebufferljkfdskjlkjjkjkjjjjjjjjjjjjjsfddfsfdsdfsdfsfdsfdsdfsfdsfdsdsffdsdfsdfsfdsdfsdfsdfsdfsfdsdsfdfsfdsfdsdsfdsffdsdfsdsfdsfdfsdfsfdssdfdfsdfsdfsdfsdfsfdsfdssdfdfs\xff15\xfffinalvalue\x00"\ 12 | 13 | EXPECTED = [ 14 | (["stuff", "19"], "value1"), 15 | (["morestuff", "29"], "value2"), 16 | (["reallylongkeyprobablylongerthanthebufferljkfdskjlkjjkjkjjjjjjjjjjjjjsfddfsfdsdfsdfsfdsfdsdfsfdsfdsdsffdsdfsdfsfdsdfsdfsdfsdfsfdsdsfdfsfdsfdsdsfdsffdsdfsdsfdsfdfsdfsfdssdfdfsdfsdfsdfsdfsfdsfdssdfdfs", "15"], "finalvalue"), 17 | ] 18 | 19 | RESPECTED = [ 20 | (["stuff", 1900], 'value1'), 21 | (["morestuff", 9], 'value2'), 22 | (["anymore", 290], 'value3'), 23 | (["stuff", 29], 'value4'), 24 | (["toeat", 1500], 'value5'), 25 | (["reallystuff", 15], 'finalvalue'), 26 | ] 27 | 28 | 29 | SOMENULLS = [ 30 | (["olay", 1900], 'value1'), 31 | (["morestuff", 9], 'value2'), 32 | (["anymore", 290], 'value3'), 33 | ([None, 29], 'value4'), 34 | (["toeat", 1500], 'value5'), 35 | (["reallystuff", 15], 'finalvalue'), 36 | ] 37 | 38 | 39 | class TestPipeworker(unittest.TestCase): 40 | def setUp(self): 41 | pass 42 | 43 | def _clean_ds_tmp(self): 44 | try: 45 | os.unlink(OUT_FILE) 46 | except: 47 | pass 48 | 49 | def test_sort_reader(self): 50 | for buf_size in [8, 16, 32, 64, 256, 8192]: 51 | infile = StringIO(TEST_FILE) 52 | for actual, expected in zip(sort_reader(infile, 
'test', buf_size), EXPECTED): 53 | self.assertListEqual(actual[0], expected[0]) 54 | self.assertEqual(actual[1], expected[1]) 55 | 56 | def test_simple_disk_sort(self): 57 | self._clean_ds_tmp() 58 | actual = [(key, value) for key, value in disk_sort(RESPECTED, OUT_FILE, (0, 1))] 59 | print "ACTUAL: ", actual 60 | self.assertEqual(actual[0][0][0], "anymore") 61 | self.assertEqual(actual[1][0][1], 9) 62 | self.assertEqual(actual[2][0][0], "reallystuff") 63 | self.assertEqual(actual[3][1], ()) # tests secondary sorting 64 | 65 | def test_positional_disk_sort(self): 66 | self._clean_ds_tmp() 67 | actual = [(key, value) for key, value in disk_sort(RESPECTED, OUT_FILE, [1])] 68 | print "ACTUAL: ", actual 69 | self.assertEqual(actual[0][0][0], "morestuff") 70 | self.assertEqual(actual[1][0][1], 15) 71 | self.assertEqual(actual[2][0][0], "stuff") 72 | self.assertEqual(actual[3][1], ()) 73 | self.assertEqual(actual[5][1], ()) 74 | 75 | def test_nulls(self): 76 | self._clean_ds_tmp() 77 | actual = [(key, value) for key, value in disk_sort(SOMENULLS, OUT_FILE, [0])] 78 | print "ACTUAL: ", actual 79 | self.assertEqual(actual[0][0][0], None) 80 | 81 | -------------------------------------------------------------------------------- /test/test_query_checker.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle.core.marble import Marble, check_query 3 | 4 | 5 | _FIELDS = ("+@4id", "+*name", "+$date", "+%2genre", "+@2rating", "artist", "@4quantity") 6 | _PARTITIONS = "date" 7 | _FIELDS_SELL = ("+@4id", "+@4item_id", "+$date", "@4store_id", "@4quantity", "$price") 8 | 9 | 10 | class TestChecker(unittest.TestCase): 11 | def setUp(self): 12 | self.albums = Marble(name="Albums", 13 | fields=_FIELDS, 14 | partition=_PARTITIONS) 15 | self.transaction = Marble(name="Transaction", 16 | fields=_FIELDS_SELL, 17 | partition=_PARTITIONS) 18 | self.single_where = [(self.albums.rating > 3)] 19 | self.multi_wheres = [(self.albums.rating > 3) & (self.albums.id == 1000)] 20 | self.cross_wheres = [self.albums.rating > 3, self.transaction.id == 1000] 21 | self.single_select = [self.albums.name] 22 | self.multi_select = [self.albums.name, self.albums.date, self.albums.rating] 23 | self.cross_select = [self.albums.name, self.albums.artist, 24 | self.transaction.store_id, self.transaction.price] 25 | self.order_by = [self.albums.quantity, self.albums.rating] 26 | self.join = [self.albums.id, self.transaction.item_id] 27 | self.join_invalid = [self.albums.id, self.transaction.price] 28 | self.join_invalid_1 = [self.albums.id, self.albums.id] 29 | self.join_invalid_2 = [self.albums.id, self.transaction.price] 30 | self.limit_single = 100 31 | self.limit_single_invalid = -100 32 | 33 | def test_select_clauses(self): 34 | # test empty select 35 | with self.assertRaises(ValueError): 36 | check_query([], 37 | [], 38 | self.order_by, 39 | None, 40 | self.single_where) 41 | # test duplicate select 42 | with self.assertRaises(ValueError): 43 | check_query(self.single_select + self.single_select, 44 | [], 45 | self.order_by, 46 | None, 47 | self.single_where) 48 | self.assertTrue(check_query(self.single_select, [], [], 49 | None, self.single_where)) 50 | 51 | def test_where_clauses(self): 52 | # should raise if a single table shows up in multi-wheres 53 | # should raise if where and select are from different tables 54 | with self.assertRaises(ValueError): 55 | check_query(self.single_select, 56 | [], 57 | [], 58 | self.order_by, 59 | [self.transaction.id == 1000]) 60 |
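# the positive case: a select and where drawn from the same table must validate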
self.assertTrue(check_query(self.single_select, [], [], 61 | None, self.single_where)) 62 | 63 | def test_join(self): 64 | # test join with single table 65 | with self.assertRaises(ValueError): 66 | check_query(self.single_select, 67 | self.join, 68 | [], 69 | None, 70 | self.single_where) 71 | 72 | # test invalid join 73 | with self.assertRaises(ValueError): 74 | check_query(self.single_select, 75 | self.join_invalid, 76 | [], 77 | None, 78 | self.cross_wheres) 79 | 80 | # test invalid join 81 | with self.assertRaises(ValueError): 82 | check_query(self.single_select, 83 | self.join_invalid_1, 84 | [], 85 | None, 86 | self.cross_wheres) 87 | 88 | # test invalid join 89 | with self.assertRaises(ValueError): 90 | check_query(self.single_select, 91 | self.join_invalid_2, 92 | [], 93 | None, 94 | self.cross_wheres) 95 | self.assertTrue(check_query(self.single_select, 96 | self.join, [], None, self.cross_wheres)) 97 | 98 | def test_order_by(self): 99 | # should raise if select columns don't contain the order column 100 | with self.assertRaises(ValueError): 101 | check_query(self.single_select, 102 | [], 103 | self.order_by, 104 | None, 105 | self.single_where) 106 | self.assertTrue(check_query(self.single_select, [], [self.albums.name], 107 | None, self.single_where)) 108 | 109 | def test_limit(self): 110 | with self.assertRaises(ValueError): 111 | check_query(self.single_select, 112 | [], 113 | [], 114 | self.limit_single_invalid, 115 | self.single_where) 116 | 117 | self.assertTrue( 118 | check_query(self.single_select, 119 | [], 120 | [], 121 | self.limit_single, 122 | self.single_where)) 123 | 124 | def test_full_query(self): 125 | self.assertTrue( 126 | check_query( 127 | self.cross_select, 128 | self.join, 129 | self.single_select, 130 | self.limit_single, 131 | self.cross_wheres)) 132 | -------------------------------------------------------------------------------- /test/test_rtrie.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | import rtrie 4 | import mdb 5 | from wtrie import Trie 6 | 7 | class TestRTrie(unittest.TestCase): 8 | def test_rtrie_in_memory(self): 9 | 10 | s = unicode(u'séllsink').encode('utf-8') 11 | #print "HELLSINK: %s" % s 12 | 13 | t = Trie() 14 | self.assertEqual(t.add('hello'), 1) 15 | self.assertEqual(t.add('hell'), 2) 16 | self.assertEqual(t.add('hello'), 1) 17 | self.assertEqual(t.add('hellothere'), 3) 18 | self.assertEqual(t.add('good'), 4) 19 | self.assertEqual(t.add('goodbye'), 5) 20 | self.assertEqual(t.add('hello'), 1) 21 | self.assertEqual(t.add('hellsink'), 6) 22 | self.assertEqual(t.add(s), 7) 23 | t.print_it() 24 | 25 | nodes, kids, _ = t.serialize() 26 | nodeaddr, nodelen = nodes.buffer_info() 27 | kidaddr, kidlen = kids.buffer_info() 28 | print "LENS %s %s" % (nodelen, kidlen) 29 | 30 | for i in range(8): 31 | val = rtrie.value_for_vid(nodeaddr, kidaddr, i) 32 | print "Value", i, val 33 | 34 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hello'), 1) 35 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hell'), 2) 36 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'goodbye'), 5) 37 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hellsink'), 6) 38 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'hellothere'), 3) 39 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, 'good'), 4) 40 | self.assertEqual(rtrie.vid_for_value(nodeaddr, kidaddr, s), 7) 41 | self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 
'notthere')) 42 | self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'h')) 43 | self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'he')) 44 | self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'hel')) 45 | self.assertIsNone(rtrie.vid_for_value(nodeaddr, kidaddr, 'hells')) 46 | 47 | def test_rtrie_in_mdb(self): 48 | t = Trie() 49 | self.assertEqual(t.add('hello'), 1) 50 | self.assertEqual(t.add('hell'), 2) 51 | self.assertEqual(t.add('hello'), 1) 52 | self.assertEqual(t.add('hellothere'), 3) 53 | self.assertEqual(t.add('good'), 4) 54 | self.assertEqual(t.add('goodbye'), 5) 55 | self.assertEqual(t.add('hello'), 1) 56 | self.assertEqual(t.add('hellsink'), 6) 57 | 58 | nodes, kids, _ = t.serialize() 59 | nodeaddr, nodelen = nodes.buffer_info() 60 | kidaddr, kidlen = kids.buffer_info() 61 | try: 62 | env = mdb.Env('/tmp/test_rtrie', flags=mdb.MDB_WRITEMAP | mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR) 63 | txn = env.begin_txn() 64 | db = env.open_db(txn, name='_meta_', flags=mdb.MDB_CREATE) 65 | db.put_raw(txn, 'nodes', nodeaddr, nodelen) 66 | db.put_raw(txn, 'kids', kidaddr, kidlen) 67 | 68 | n, ns = db.get_raw(txn, 'nodes') 69 | k, ks = db.get_raw(txn, 'kids') 70 | txn.commit() 71 | env.close() 72 | 73 | env = mdb.Env('/tmp/test_rtrie', flags=mdb.MDB_NOSYNC | mdb.MDB_NOSUBDIR) 74 | txn = env.begin_txn() 75 | db = env.open_db(txn, name='_meta_') 76 | 77 | n, ns = db.get_raw(txn, 'nodes') 78 | k, ks = db.get_raw(txn, 'kids') 79 | self.assertEqual(rtrie.vid_for_value(n, k, 'hello'), 1) 80 | self.assertEqual(rtrie.vid_for_value(n, k, 'hell'), 2) 81 | self.assertEqual(rtrie.vid_for_value(n, k, 'goodbye'), 5) 82 | self.assertEqual(rtrie.vid_for_value(n, k, 'hellsink'), 6) 83 | self.assertEqual(rtrie.vid_for_value(n, k, 'hellothere'), 3) 84 | self.assertEqual(rtrie.vid_for_value(n, k, 'good'), 4) 85 | self.assertIsNone(rtrie.vid_for_value(n, k, 'notthere')) 86 | 87 | txn.commit() 88 | env.close() 89 | finally: 90 | import os 91 | os.unlink('/tmp/test_rtrie') 92 | os.unlink('/tmp/test_rtrie-lock') 93 | 94 | -------------------------------------------------------------------------------- /test/test_stress_wtrie.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from wtrie import Trie 4 | from rtrie import value_for_vid, vid_for_value 5 | 6 | 7 | pwd = os.getcwd() 8 | if os.path.basename(pwd) != 'test': 9 | fixture = os.path.join(pwd, 'test/fixtures/keys') 10 | else: 11 | fixture = os.path.join(pwd, 'fixtures/keys') 12 | 13 | 14 | class TestStressWTrie(unittest.TestCase): 15 | def test_stress_wtrie(self): 16 | ktrie = Trie() 17 | strie = Trie() 18 | etrie = Trie() 19 | 20 | keywords = {} 21 | search_terms = {} 22 | exchange_ids = {} 23 | 24 | with open(fixture) as f: 25 | for data in f: 26 | for word in data.split(' '): 27 | vid = ktrie.add(word) 28 | actual_vid = keywords.get(word) 29 | if actual_vid is not None: 30 | self.assertEqual(vid, actual_vid) 31 | else: 32 | keywords[word] = vid 33 | 34 | vid = strie.add(data) 35 | actual_vid = search_terms.get(data) 36 | if actual_vid is not None: 37 | self.assertEqual(vid, actual_vid) 38 | else: 39 | search_terms[data] = vid 40 | 41 | nodes, kids, nodelen = etrie.serialize() 42 | naddr, nlen = nodes.buffer_info() 43 | kaddr, klen = kids.buffer_info() 44 | #summarize(naddr, kaddr, nodelen) 45 | #print_it(naddr, kaddr) 46 | 47 | for dc, vid in exchange_ids.iteritems(): 48 | rvid = etrie.add(dc) 49 | self.assertEqual(vid, rvid) 50 | 51 | print dc, vid 52 | value = 
value_for_vid(naddr, kaddr, vid) 53 | self.assertEqual(dc, value) 54 | if dc != value: 55 | print " dc=%s adc=%s" % (dc, value) 56 | 57 | avid = vid_for_value(naddr, kaddr, dc) 58 | #print "vid=%s avid=%s" % (vid, avid) 59 | self.assertEqual(vid, avid) 60 | -------------------------------------------------------------------------------- /test/test_table.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle import Table 3 | 4 | class TestTable(unittest.TestCase): 5 | def test_create_syntax(self): 6 | full_columns = ['wide index uint32 x', 'index string y', 'int16 z', 'lz4 a', 'trie32 b', 'binary c'] 7 | full_fields = ['=@4x', '+$y', '#2z', '*a', '%4b', '&c'] 8 | fields = Table.parse_column_specs(full_columns) 9 | self.assertListEqual(fields, full_fields) 10 | 11 | default_columns = ['wide index x', 'index int y', 'uint z', 'trie b', 'c'] 12 | default_fields = ['=x', '+#y', '@z', '%b', 'c'] 13 | fields = Table.parse_column_specs(default_columns) 14 | self.assertListEqual(fields, default_fields) 15 | 16 | def test_create_errors(self): 17 | self.assertRaises(ValueError, Table.parse_column_specs, ['wide wide index x']) 18 | self.assertRaises(ValueError, Table.parse_column_specs, ['index wide x']) 19 | self.assertRaises(ValueError, Table.parse_column_specs, ['index blah16 x']) 20 | self.assertRaises(ValueError, Table.parse_column_specs, ['uint24 x']) 21 | -------------------------------------------------------------------------------- /test/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from hustle.core.util import SortedIterator 3 | 4 | class TestSortedIterator(unittest.TestCase): 5 | 6 | def test_merges_sorted_inputs(self): 7 | data = [ 8 | [ 9 | ((1, 1), 'some_value'), 10 | ((1, 2), 'some_value'), 11 | ((1, 3), 'some_value') 12 | ], 13 | [ 14 | ((1, 100), 'some_value'), 15 | ((1, 200), 'some_value'), 16 | ((1, 300), 'some_value') 17 | ], 18 | [ 19 | ((1, 10), 'some_value'), 20 | ((1, 20), 'some_value'), 21 | ((1, 30), 'some_value') 22 | ], 23 | [ 24 | ((1, 4), 'some_value'), 25 | ((1, 40), 'some_value'), 26 | ((1, 400), 'some_value') 27 | ] 28 | ] 29 | sorted_iterator = SortedIterator(data) 30 | expected = [ 31 | ((1, 1), 'some_value'), 32 | ((1, 2), 'some_value'), 33 | ((1, 3), 'some_value'), 34 | ((1, 4), 'some_value'), 35 | ((1, 10), 'some_value'), 36 | ((1, 20), 'some_value'), 37 | ((1, 30), 'some_value'), 38 | ((1, 40), 'some_value'), 39 | ((1, 100), 'some_value'), 40 | ((1, 200), 'some_value'), 41 | ((1, 300), 'some_value'), 42 | ((1, 400), 'some_value')] 43 | self.assertListEqual(list(sorted_iterator), expected) 44 | 45 | def test_assumes_individual_inputs_are_already_sorted(self): 46 | data = [ 47 | [ 48 | ((2, 1), 'some_value'), 49 | ((1, 1), 'some_value'), 50 | ], 51 | [ 52 | ((4, 1), 'some_value'), 53 | ((3, 1), 'some_value'), 54 | ] 55 | ] 56 | sorted_iterator = SortedIterator(data) 57 | expected = [ 58 | ((2, 1), 'some_value'), 59 | ((1, 1), 'some_value'), 60 | ((4, 1), 'some_value'), 61 | ((3, 1), 'some_value')] 62 | self.assertListEqual(list(sorted_iterator), expected) 63 | 64 | def test_handles_duplicates(self): 65 | data = [ 66 | [ 67 | ((1, 1), 'some_value'), 68 | ((1, 2), 'some_value'), 69 | ], 70 | [ 71 | ((1, 1), 'some_value'), 72 | ((1, 2), 'some_value'), 73 | ((1, 3), 'some_value'), 74 | ], 75 | [ 76 | ((1, 3), 'some_value'), 77 | ] 78 | ] 79 | sorted_iterator = SortedIterator(data) 80 | expected = [ 81 | ((1, 1), 'some_value'), 82 | 
((1, 1), 'some_value'), 83 | ((1, 2), 'some_value'), 84 | ((1, 2), 'some_value'), 85 | ((1, 3), 'some_value'), 86 | ((1, 3), 'some_value')] 87 | self.assertListEqual(list(sorted_iterator), expected) 88 | 89 | def test_handles_empty_input(self): 90 | data = [ 91 | [((1, 1), 'some_value')], 92 | [], # <----- empty input 93 | [((2, 1), 'some_value')], 94 | ] 95 | sorted_iterator = SortedIterator(data) 96 | expected = [ 97 | ((1, 1), 'some_value'), 98 | ((2, 1), 'some_value')] 99 | self.assertListEqual(list(sorted_iterator), expected) 100 | -------------------------------------------------------------------------------- /test/test_wtrie.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import unittest 3 | from wtrie import Trie 4 | 5 | 6 | class TestWTrie(unittest.TestCase): 7 | def test_wtrie(self): 8 | t = Trie() 9 | self.assertEqual(t.add('hello'), 1) 10 | self.assertEqual(t.add('hell'), 2) 11 | self.assertEqual(t.add('hello'), 1) 12 | self.assertEqual(t.add('hellothere'), 3) 13 | self.assertEqual(t.add('good'), 4) 14 | self.assertEqual(t.add('goodbye'), 5) 15 | self.assertEqual(t.add('hello'), 1) 16 | self.assertEqual(t.add('hellsink'), 6) 17 | self.assertEqual(t.add(''), 0) 18 | 19 | # nodes = t.nodes 20 | # t.print_it() 21 | 22 | key, sz, pt = t.node_at_path() 23 | self.assertEqual(sz, 2) 24 | 25 | key, sz, pt = t.node_at_path(104) 26 | self.assertEqual(key, 'hell') 27 | self.assertEqual(pt, 0) 28 | self.assertEqual(sz, 2, 'actual %s' % sz) 29 | 30 | key2, sz, pt = t.node_at_path(104, 111) 31 | self.assertEqual(key2, 'o', 'actual %s' % key) 32 | self.assertEqual(pt, 2) 33 | self.assertEqual(sz, 1) 34 | 35 | key, sz, pt = t.node_at_path(104, 111, 116) 36 | self.assertEqual(key, 'there') 37 | self.assertEqual(pt, 1) 38 | self.assertEqual(sz, 0) 39 | 40 | n, k, _ = t.serialize() 41 | self.assertEqual(len(n), 7 * 4, "actual %d" % len(n)) 42 | self.assertEqual(len(k), 100, "actual %d" % len(k)) 43 | # print "sqork: %s" % t.kid_space 44 | 45 | print 'nodes', n 46 | print 'kids', k 47 | 48 | unpacked = struct.unpack_from("7I", n, 0) 49 | expected = (0x02000000, 0x01000010, 0x0200000b, 0x00000013, 0x01000004, 0x00000008, 0x00000016) 50 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 51 | 52 | unpacked = struct.unpack_from("IH2I", k, 0) 53 | expected = (0, 0, 0x67000004, 0x68000002) 54 | self.assertEqual(unpacked, expected, unpacked) 55 | 56 | unpacked = struct.unpack_from("IH4cI", k, 16) 57 | expected = (0x0000, 0x0004, 'g', 'o', 'o', 'd', 0x62000005) 58 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 59 | 60 | unpacked = struct.unpack_from("IH3c", k, 32) 61 | expected = (0x0004, 0x0003, 'b', 'y', 'e') 62 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 63 | 64 | unpacked = struct.unpack_from("IH4c2I", k, 44) 65 | expected = (0x0000, 0x0004, 'h', 'e', 'l', 'l', 0x6f000001, 0x73000006) 66 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 67 | 68 | unpacked = struct.unpack_from("IHcI", k, 64) 69 | expected = (0x0002, 1, 'o', 0x74000003) 70 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 71 | 72 | unpacked = struct.unpack_from("IH5c", k, 76) 73 | expected = (0x0001, 0x0005, 't', 'h', 'e', 'r', 'e') 74 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 75 | 76 | unpacked = struct.unpack_from("IH4c", k, 88) 77 | expected = (0x0002, 0x0004, 's', 'i', 'n', 'k') 78 | self.assertEqual(unpacked, expected, 'actual %s' % str(unpacked)) 79 
| --------------------------------------------------------------------------------
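The tests above collectively exercise Hustle's public API: Table.create/from_tag, insert, select with where/order_by/limit/distinct, delete/drop, and result purging. A minimal end-to-end sketch stitched together from exactly those calls; the table name example_imps is invented for illustration, while the server address, field spec, and fixture path are copied from the tests above:

from hustle import Table, insert, select, drop
from hustle.core.settings import Settings, overrides

# point Hustle at a local Disco cluster, as every test's setUp does
overrides['server'] = 'disco://localhost'
overrides['dump'] = False
overrides['nest'] = False
settings = Settings()

# create a date-partitioned table with the same field spec as the tests
imps = Table.create('example_imps',
                    fields=['=$token', '%url', '+%site_id', '@cpm_millis',
                            '+#ad_id', '+$date', '+@time'],
                    partition='date',
                    force=True)
insert(imps, File='fixtures/imps.json')

# query it: a partition filter plus ordering, then clean up the result
res = select(imps.ad_id, imps.cpm_millis,
             where=imps.date == '2014-01-27',
             order_by=imps.cpm_millis)
for ad_id, cpm in res:
    print ad_id, cpm
res.purge()
drop(imps)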