├── .gitignore
├── .travis.yml
├── CHANGELOG.rst
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.rst
├── dev-requirements.txt
├── qcache
│   ├── __init__.py
│   ├── app.py
│   ├── compression.py
│   ├── dataset_cache.py
│   ├── qframe
│   │   ├── __init__.py
│   │   ├── common.py
│   │   ├── constants.py
│   │   ├── context.py
│   │   ├── pandas_filter.py
│   │   ├── query.py
│   │   └── update.py
│   └── statistics.py
├── setup.cfg
├── setup.py
├── tasks.py
├── test
│   ├── performance_run.py
│   ├── test_api.py
│   ├── test_qframe.py
│   └── test_statistics.py
├── tls
│   ├── ca-conf.json
│   ├── ca-key.pem
│   ├── ca.csr
│   ├── ca.pem
│   ├── csr.json
│   ├── generate_test_certs.sh
│   ├── host-key.pem
│   ├── host.csr
│   └── host.pem
└── util
    ├── __init__.py
    └── memory_benchmark.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
3 | # C extensions
4 | *.so
5 |
6 | # Packages
7 | *.egg
8 | *.egg-info
9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | .idea
21 |
22 | # Installer logs
23 | pip-log.txt
24 |
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | nosetests.xml
29 |
30 | # Translations
31 | *.mo
32 |
33 | # Mr Developer
34 | .mr.developer.cfg
35 | .project
36 | .pydevproject
37 |
38 | # Complexity
39 | output/*.html
40 | output/*/index.html
41 |
42 | # Sphinx
43 | docs/_build
44 | README.html
45 |
46 | .cache
47 | ps_mem
48 | htmlcov
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # Config file for automatic testing at travis-ci.org
2 |
3 | language: python
4 |
5 | python:
6 |   - "2.7"
7 |
8 | # command to install dependencies
9 | install:
10 |   - pip install -U .
11 |   - pip install -r dev-requirements.txt
12 |
13 | # command to run tests, e.g. python setup.py test
14 | script:
15 |   - invoke coverage
16 |
17 | before_install:
18 |   - pip install codecov
19 |
20 | after_success:
21 |   - codecov
--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
1 | Changelog
2 | =========
3 | 0.9.3 (2019-01-05)
4 | ------------------
5 | * Update dependencies on lz4 and tornado
6 | * Allow float type hinting
7 |
8 | 0.9.2 (2018-05-23)
9 | ------------------
10 | * Fix #15, fix cache item size at creation
11 |
12 | 0.9.1 (2017-11-15)
13 | ------------------
14 | * Downgrade to Pandas 0.20.3 in an attempt to fix #14.
15 |
16 | 0.9.0 (2017-11-14)
17 | ------------------
18 | * Numexpr filter engine is not available anymore, only Pandas. Numexpr is no longer a requirement of qcache.
19 | NB!
Major backwards incompatibility 20 | * Fix #12, like now ignores NaN 21 | * Fix #13, only empty string is considered as NaN when reading CSV 22 | * Fix #8, integer standins remain integers 23 | * Upgrade Pandas to 0.21.0 and Numpy to 0.13.1 24 | 25 | 0.8.1 (2017-04-06) 26 | ------------------ 27 | * Bump Pandas to 0.19.2 28 | 29 | 0.8.0 (2017-01-08) 30 | ------------------ 31 | * Support client cert verification 32 | 33 | 0.7.2 (2016-12-18) 34 | ------------------ 35 | * Fix #10 & #11, minor statistics improvements 36 | 37 | 0.7.1 (2016-11-30) 38 | ------------------ 39 | * Fix #9, df overwritten by mistake 40 | 41 | 0.7.0 (2016-11-09) 42 | ------------------ 43 | * Compression using LZ4 or GZIP in requests and responses (#3) 44 | * Sub queries in "in" filter (#7) 45 | * Enum type based on Pandas category type (#6) 46 | * Support for stand in columns in queries (#5) 47 | * Additional metrics/statistics for complete request duration for stores and queries 48 | * Update size estimates to do deep inspection of objects contained in dataframe. This should 49 | be more accurate than the previous shallow inspection. 50 | * Update Pandas to 0.19.1 51 | * Update Tornado to 4.4.2 52 | 53 | 0.6.1 (2016-09-18) 54 | ------------------ 55 | * Fix packaging, the new qcache.qframe package was missing from the submitted package. 56 | 57 | 0.6.0 (2016-09-18) 58 | ------------------ 59 | * New filter engine based on Pandas rather than Numexpr. This enables new types of filters in the where 60 | clause (see below). By default the old engine is still used but the new one can be enabled either 61 | by default on server startup or on a per-query basis by setting the new 'X-QCache-filter-engine' header 62 | to 'pandas'. 63 | * New bitwise filters in the 'pandas' filter engine, 'all_bits' and 'any_bits'. 64 | * New string filters, 'like' and 'ilike' which corresponds roughly to LIKE in SQL with the addition 65 | of regex support. 'like' is case sensitive while 'ilike' is case insensitive. 66 | 67 | 0.5.0 (2016-06-19) 68 | ------------------ 69 | * New header when uploading data, 'X-QCache-stand-in-columns', that let you specify default values 70 | for columns that may not be present in the uploaded data. 71 | 72 | 0.4.2 (2016-06-04) 73 | ------------------ 74 | * Additional statistics to measure for how long data remains in the cache before it's evicted. 75 | * Bump dependency versions of Pandas, Numexpr and Tornado. 76 | 77 | 0.4.1 (2016-01-31) 78 | ------------------ 79 | * Provide the duration for which statistics were collected and statistics buffer size 80 | 81 | 0.4.0 (2016-01-24) 82 | ------------------ 83 | * Sub query support with new 'from' clause 84 | * Column aliasing + support for calculated columns 85 | * Error message improvements 86 | 87 | 0.3.0 (2015-12-23) 88 | ------------------ 89 | * Accepts conjunctions and disjunctions with only one clause 90 | * Accept POST queries, good for large queries 91 | * Improved performance for "in" queries, up to 30x faster for large lists 92 | 93 | 0.2.1 (2015-12-15) 94 | ------------------ 95 | * More efficient cache size tracking 96 | * Check against unknown query clauses 97 | 98 | 0.2.0 (2015-12-06) 99 | ------------------ 100 | * Report the unsliced result length as part of the result, nice for pagination for example 101 | * Use connection pooling 102 | * SSL and basic auth support 103 | 104 | 0.1.0 (2015-10-25) 105 | ------------------ 106 | * First release that actually does something sensible. 
107 |
108 | 0.0.1 (2015-10-15)
109 | ------------------
110 | * First release on PyPI.
111 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:2.7.15-slim-jessie
2 |
3 | RUN pip install qcache==0.9.3
4 |
5 | EXPOSE 9401 9402 9403 9404 9405 9406 9407 9408
6 | ENV QCACHE_PORT 9401
7 |
8 | # Start container like this:
9 | # - docker run -p 9401:9401 qcache
10 | # - docker run --env QCACHE_PORT=9402 -p 9402:9402 qcache
11 | CMD [ "sh", "-c", "qcache -p $QCACHE_PORT"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2019 Tobias Gustafsson
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.rst LICENSE
2 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ======
2 | QCache
3 | ======
4 |
5 | .. image:: https://travis-ci.org/tobgu/qcache.png?branch=master
6 |     :target: https://travis-ci.org/tobgu/qcache
7 |
8 | .. image:: https://badge.fury.io/py/qcache.svg
9 |     :target: https://badge.fury.io/py/qcache
10 |
11 | .. image:: http://codecov.io/github/tobgu/qcache/coverage.svg?branch=master
12 |     :target: http://codecov.io/github/tobgu/qcache?branch=master
13 |
14 | .. _Memcached: http://memcached.org/
15 |
16 | QCache is a key-table cache, an in-memory cache server with analytical query capabilities.
17 |
18 | While the more commonly known key-value caches (such as Memcached_) let you fetch a value
19 | based on a key, QCache lets you run queries against a table based on a key.
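
A minimal interaction sketch (the endpoints are described in detail under
"API examples using curl" below; my-key is an arbitrary dataset key):

.. code::

   # Store a CSV table under a key
   curl -X POST --data-binary @my_csv.csv http://localhost:8888/qcache/dataset/my-key

   # Run a query against the table stored under that key
   curl -G http://localhost:8888/qcache/dataset/my-key --data-urlencode 'q={"where": ["<", "foo", 1]}'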
20 |
21 | **********
22 | Motivation
23 | **********
24 | You are working with table data that you want to run flexible queries against but do not want to
25 | load it into an SQL database or similar because of any of the following:
26 |
27 | - The operational cost and complexity of bringing in an SQL server
28 | - The tables do not have a homogeneous format
29 | - The data is short lived
30 | - Not all data available is ever used, you only want to use resources on demand
31 | - You want to treat queries as data and build them dynamically using data structures
32 |   that you are used to (dictionaries and lists or objects and arrays depending on your
33 |   language background)
34 | - Expensive JOINs are required to create the table.
35 | - ...
36 |
37 | Or, you are building server software and want to add the possibility for your clients to run
38 | queries directly against the data without the need for dreadful translations into a REST
39 | interface with some home-grown filter language.
40 |
41 |
42 | .. _QCache-client: https://github.com/tobgu/qcache-client
43 | .. _Go-QCache-client: https://github.com/tobgu/go-qcache-client
44 |
45 | ********
46 | Features
47 | ********
48 | - Simple, single-threaded, single-process server.
49 | - Expressive JSON-based query language with format and features similar to SQL SELECT. Queries
50 |   are data that can easily be transformed or enriched.
51 | - Support for JSON or CSV input and output format
52 | - Performant queries on tables as large as 10 x 1000000 cells out of the box
53 | - No need for table definitions, tables are created dynamically based on the data inserted
54 | - Statistics about hit and miss count, query and insert performance and more available
55 |   through HTTP API
56 | - Scales linearly in query capacity with the number of servers. A Python client library that
57 |   uses consistent hashing for key distribution among servers is available
58 |   here: QCache-client_. There's also a basic Go client here: Go-QCache-client_.
59 |   More clients are welcome!
60 |
61 |
62 | ************
63 | Requirements
64 | ************
65 | Python 2.7 (2.7.9+ if using TLS) for now
66 |
67 |
68 | ************
69 | Installation
70 | ************
71 | .. code::
72 |
73 |    pip install qcache
74 |
75 | *******
76 | Running
77 | *******
78 | .. code::
79 |
80 |    qcache
81 |
82 | This will start qcache on the default port using the default cache size. To get help on available parameters:
83 |
84 | .. code::
85 |
86 |    qcache --help
87 |
88 |
89 | ******
90 | Docker
91 | ******
92 | You can also get the latest version as a Docker image. This is probably the easiest way to try it out if you
93 | are running Linux or if you have Docker Machine installed.
94 |
95 | .. code::
96 |
97 |    docker run -p 9401:9401 tobgu/qcache
98 |
99 |
100 | *******
101 | License
102 | *******
103 | MIT licensed. See the bundled LICENSE file for more details.
104 |
105 | **************
106 | Query examples
107 | **************
108 | Below are examples of the major features of the query language. A JSON object is used to
109 | describe the query. The query should be URL encoded and passed in using the 'q' GET-parameter.
110 |
111 | The query language uses LISP-style prefix notation for simplicity. This makes it easy
112 | to parse and build queries dynamically since no rules for operator precedence
113 | ever need to be applied.
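
Because queries are plain data structures, they can also be assembled and
encoded programmatically. A minimal sketch (Python 2 to match the codebase;
the dataset key my-key is illustrative):

.. code:: python

   import json
   import urllib

   query = {"select": ["foo", "bar"],
            "where": ["<", "foo", 1],
            "limit": 10}
   url = ("http://localhost:8888/qcache/dataset/my-key?q=" +
          urllib.quote(json.dumps(query)))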
114 |
115 | Like so:
116 | `http://localhost:8888/qcache/dataset/<dataset-key>?q=<URL-encoded query>`
117 |
118 | You can also POST queries as JSON against:
119 | `http://localhost:8888/qcache/dataset/<dataset-key>/q`
120 |
121 | This is a good alternative to GET if your queries are too large to fit in the query string.
122 |
123 | Select all
124 | ==========
125 | An empty object will return all rows in the table:
126 |
127 | .. code:: python
128 |
129 |    {}
130 |
131 | Projection
132 | ==========
133 | .. code:: python
134 |
135 |    {"select": ["foo", "bar"]}
136 |
137 | Not specifying select is equivalent to SELECT * in SQL.
138 |
139 | Column aliasing
140 | ---------------
141 | .. code:: python
142 |
143 |    {"select": [["=", "foo", "bar"]]}
144 |
145 | This will rename column bar to foo in the result.
146 |
147 | You can also make more elaborate calculations in the aliasing expression.
148 |
149 | .. code:: python
150 |
151 |    {"select": [["=", "baz", ["+", ["*", "bar", 2], "foo"]]]}
152 |
153 | As well as simple constant assignments.
154 |
155 | .. code:: python
156 |
157 |    {"select": [["=", "baz", 55]]}
158 |
159 |
160 | Filtering
161 | =========
162 |
163 | Comparison
164 | ----------
165 | .. code:: python
166 |
167 |    {"where": ["<", "foo", 1]}
168 |
169 | The following operators are supported:
170 |
171 | .. code::
172 |
173 |    ==, !=, <=, <, >, >=
174 |
175 | In
176 | --
177 | .. code:: python
178 |
179 |    {"where": ["in", "foo", [1, 2]]}
180 |
181 |
182 | Like/ilike
183 | ----------
184 | Like and ilike are used for string matching and work similarly to LIKE in SQL. Like is case sensitive
185 | while ilike is case insensitive. In addition to string matching using % as a wildcard, like/ilike also
186 | support regexps.
187 |
188 | .. code:: python
189 |
190 |    {"where": ["like", "foo", "'%bar%'"]}
191 |
192 |
193 | Bitwise operators
194 | -----------------
195 | There are two operators for bitwise filtering on integers: `all_bits` and `any_bits`.
196 |
197 | * all_bits - evaluates to true if all bits in the supplied argument are set in the value tested against.
198 | * any_bits - evaluates to true if any of the bits in the supplied argument are set in the value tested against.
199 |
200 | .. code:: python
201 |
202 |    {"where": ["any_bits", "foo", 31]}
203 |
204 |
205 | Clauses
206 | -------
207 | .. code:: python
208 |
209 |    {"where": ["&", [">", "foo", 1],
210 |               ["==", "bar", 2]]}
211 |
212 | The following operators are supported:
213 |
214 | .. code::
215 |
216 |    &, |
217 |
218 |
219 | Negation
220 | --------
221 | .. code:: python
222 |
223 |    {"where": ["!", ["==", "foo", 1]]}
224 |
225 |
226 | Ordering
227 | ========
228 |
229 | Ascending
230 |
231 | .. code:: python
232 |
233 |    {"order_by": ["foo"]}
234 |
235 |
236 | Descending
237 |
238 | .. code:: python
239 |
240 |    {"order_by": ["-foo"]}
241 |
242 |
243 | Offset
244 | ======
245 | Great for pagination of long results!
246 |
247 | .. code:: python
248 |
249 |    {"offset": 5}
250 |
251 |
252 | Limit
253 | =====
254 | Great for pagination of long results!
255 |
256 | .. code:: python
257 |
258 |    {"limit": 10}
259 |
260 |
261 | Group by
262 | ========
263 | .. code:: python
264 |
265 |    {"group_by": ["foo"]}
266 |
267 |
268 | Aggregation
269 | ===========
270 | Aggregation is done as part of the select, just like in SQL.
271 |
272 | .. code:: python
273 |
274 |    {"select": ["foo", ["sum", "bar"]],
275 |     "group_by": ["foo"]}
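
Aggregation functions are applied via the underlying Pandas dataframe, so common
functions such as sum, max, min, mean and count are available. Without a group_by
clause the aggregation is applied to the whole table, for example (the column
name is illustrative):

.. code:: python

   {"select": [["max", "bar"]]}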
276 |
277 |
278 | Distinct
279 | ========
280 | Unlike in SQL, distinct has its own query clause.
281 |
282 | .. code:: python
283 |
284 |    {"select": ["foo", "bar"],
285 |     "distinct": ["foo"]}
286 |
287 |
288 | Sub queries using from
289 | ======================
290 | Filter, transform and select your data in multiple steps.
291 |
292 | .. code:: python
293 |
294 |    {"select": [["=", "foo_pct", ["*", 100, ["/", "foo", "bar"]]]],
295 |     "from": {"select": ["foo", ["sum", "bar"]],
296 |              "group_by": ["foo"]}}
297 |
298 |
299 | Sub queries using in
300 | ====================
301 | Filter your data using the result of a query as filter input.
302 |
303 | .. code:: python
304 |
305 |    {"where": ["in", "foo", {"where": ["==", "bar", 10]}]}
306 |
307 |
308 | All together now!
309 | =================
310 | A slightly more elaborate example: get the top 10 foo values with the most bar.
311 |
312 | .. code:: python
313 |
314 |    {"select": ["foo", ["sum", "bar"]],
315 |     "where": [">", "bar", 0],
316 |     "order_by": ["-bar"],
317 |     "group_by": ["foo"],
318 |     "limit": 10}
319 |
320 |
321 | ***********************
322 | API examples using curl
323 | ***********************
324 | Upload table data to the cache (a 404 will be returned if you query a key that does not exist).
325 |
326 | .. code::
327 |
328 |    curl -X POST --data-binary @my_csv.csv http://localhost:8888/qcache/dataset/my-key
329 |
330 |
331 | Query table
332 |
333 | .. code::
334 |
335 |    curl -G localhost:8888/qcache/dataset/my-key --data-urlencode "q={\"select\": [[\"count\"]], \"where\": [\"<\", \"baz\", 99999999999915], \"offset\": 100, \"limit\": 50}"
336 |    curl -G localhost:8888/qcache/dataset/my-key --data-urlencode "q={\"select\": [[\"count\"]], \"where\": [\"in\", \"baz\", [779889,8958854,8281368,6836605,3080972,4072649,7173075,4769116,4766900,4947128,7314959,683531,6395813,7834211,12051932,3735224,12368089,9858334,4424629,4155280]], \"offset\": 0, \"limit\": 50}"
337 |    curl -G localhost:8888/qcache/dataset/my-key --data-urlencode "q={\"where\": [\"==\", \"foo\", \"\\\"95d9f671\\\"\"], \"offset\": 0, \"limit\": 50}"
338 |    curl -G localhost:8888/qcache/dataset/my-key --data-urlencode "q={\"select\": [[\"max\", \"baz\"]], \"offset\": 0, \"limit\": 500000000000}"
339 |
340 |
341 | ***************************
342 | Custom request HTTP headers
343 | ***************************
344 |
345 | There are a couple of custom HTTP headers that can be used to control the behaviour of QCache.
346 |
347 | Posting tables
348 | ==============
349 |
350 | X-QCache-types
351 | --------------
352 | QCache will usually recognize the data types of submitted data automatically. There may be times when
353 | strings are mistaken for numbers because all of the data submitted for a column in a dataset happens
354 | to be numeric.
355 |
356 | This header makes it possible to explicitly type a column as string. In the example below, columns
357 | foo and bar are both typed as string. The recognized type names are string, enum and float.
358 |
359 | .. code::
360 |
361 |    X-QCache-types: foo=string;bar=string
362 |
363 | Explicitly setting the type to string is only relevant when submitting data in CSV. With JSON the data
364 | has an unambiguous (well...) data type that is used by QCache.
365 |
366 | Enums
367 | -----
368 | The `X-QCache-types` header can also be used to specify columns with enum types.
369 |
370 | .. code::
371 |
372 |    X-QCache-types: foo=enum;bar=enum
373 |
374 | Enums are a good way to store low cardinality string columns space efficiently. They can be compared
375 | for equality and inequality but currently do not have a well-defined order, so filtering by
376 | greater than or less than, for example, is not possible.
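
Putting it together, a store request with type hints might look like this (the
dataset key and column names are illustrative):

.. code::

   curl -X POST -H "X-QCache-types: foo=string;bar=enum" --data-binary @my_csv.csv http://localhost:8888/qcache/dataset/my-key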
377 |
378 |
379 | X-QCache-stand-in-columns
380 | -------------------------
381 | It may be that your submitted data varies a little from dataset to dataset with respect to the columns
382 | available in the dataset. You still want to be able to query the datasets in the same way and make
383 | some assumptions about which columns are available. This header lets you do that.
384 |
385 | In the example below, column foo will be set to 10 if it does not exist in the submitted data, and bar will
386 | be set to the value of the baz column if it is not submitted.
387 |
388 | This header can be used in requests both for storing and for querying data.
389 |
390 | .. code::
391 |
392 |    X-QCache-stand-in-columns: foo=10;bar=baz
393 |
394 |
395 | Query responses
396 | ===============
397 |
398 | X-QCache-unsliced-length
399 | ------------------------
400 | This header is added to responses and states how many rows the filtered result contained before applying
401 | any limit or offset for pagination.
402 |
403 | .. code::
404 |
405 |    X-QCache-unsliced-length: 8324
406 |
407 |
408 | *************
409 | More examples
410 | *************
411 | Please look at the tests in the project or QCache-client_ for some further examples of queries.
412 | The unit tests in this project are also a good source of examples.
413 |
414 | If you still have questions don't hesitate to contact the author or open an issue!
415 |
416 | **********
417 | Statistics
418 | **********
419 |
420 | .. code::
421 |
422 |    http://localhost:8888/qcache/statistics
423 |
424 | A GET against the above endpoint will return a JSON object containing cache statistics:
425 | hit & miss counts, query & upload durations, and more. Statistics are reset when queried.
426 |
427 | *************
428 | Data encoding
429 | *************
430 | Just use UTF-8 when uploading data and in queries and you'll be fine. All responses are UTF-8.
431 | No other codecs are supported.
432 |
433 | ****************
434 | Data compression
435 | ****************
436 | QCache supports request and response body compression with LZ4 or GZIP using standard HTTP headers.
437 |
438 | In a query request, set the following header to receive a compressed response:
439 |
440 | .. code::
441 |
442 |    Accept-Encoding: lz4,gzip
443 |
444 |
445 | The response will contain the following header indicating the encoding used:
446 |
447 | .. code::
448 |
449 |    Content-Encoding: lz4
450 |
451 | LZ4 will always be preferred if present.
452 |
453 | The Content-Encoding header should be set in the same way to indicate the compression algorithm if you are
454 | submitting compressed data.
455 |
456 |
457 | **************************
458 | Performance & dimensioning
459 | **************************
460 | Since QCache is single-threaded and single-process, the way to scale capacity is by adding more servers.
461 | If you have 8 Gb of RAM available on a 4-core machine, don't start one server using all 8 Gb. Instead
462 | start 4 servers with 2 Gb memory each, or even 8 servers with 1 Gb each or 16 servers with 512 Mb each,
463 | depending on your use case. Assign them to different ports and use a client library to do the key
464 | balancing between them. That way you will have 4 - 16 times the query capacity.
465 |
466 | QCache is ideal for container deployment. Start one container running one QCache instance.
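
For example, on a 4-core machine with 8 Gb to spare, something along these lines
(flags as listed by `qcache --help`; the port numbers are arbitrary):

.. code::

   qcache --port 9401 --size 2000000000 &
   qcache --port 9402 --size 2000000000 &
   qcache --port 9403 --size 2000000000 &
   qcache --port 9404 --size 2000000000 &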
467 |
468 | Expect a memory overhead of about 20% - 30% of the configured cache size for querying and table loading.
469 | To be on the safe side you should probably assume a 50% overhead. E.g. if you have 3 Gb available, set the
470 | cache size to 2 Gb.
471 |
472 | When choosing between CSV and JSON as upload format, prefer CSV as the amount of data can be large and it's
473 | more compact and faster to insert than JSON.
474 |
475 | For query responses, prefer JSON as the amount of data is often small and it's easier to work with than CSV.
476 |
477 | .. _Pandas: http://pandas.pydata.org/
478 | .. _NumPy: http://www.numpy.org/
479 | .. _Tornado: http://www.tornadoweb.org/en/stable/
480 |
481 | ***********************************
482 | Standing on the shoulders of giants
483 | ***********************************
484 | QCache makes heavy use of the fantastic Python libraries Pandas_, NumPy_ and Tornado_.
485 |
486 |
487 | *********************
488 | Ideas for coming work
489 | *********************
490 | These may or may not be realized; it's far from certain that all of the ideas are good.
491 |
492 | * Improve documentation
493 | * Stream data into the dataframe rather than waiting for complete input, chunked HTTP upload or similar.
494 | * Streaming proxy to allow clients to only know about one endpoint.
495 | * Configurable URL prefix to allow being mounted at an arbitrary position behind a proxy.
496 | * Make it possible to execute multiple queries and return multiple responses in one request (qs=,/qs/).
497 | * Allow post with data and query in one request, this will guarantee progress
498 |   as long as the dataset fits in memory. {"query": ..., "dataset": ...}
499 | * Possibility to specify indexes when uploading data (how do the indexes affect size? write performance? read performance?)
500 | * Possibility to upload files as a way to prime the cache without taking up memory.
501 | * Namespaces for more diverse statistics based on namespace?
502 | * Publish performance numbers
503 | * Other table formats in addition to CSV and JSON?
504 | * Break out all things dataframe into a package of its own and provide the possibility to update
505 |   and insert into a dataframe based on a predicate, just like querying is done now.
506 | * Investigate type hints for Pandas categoricals on enum-like values to improve storage
507 |   layout and filter speed. Check new import options from CSV when Pandas 0.19 is available.
508 | * Support math functions as part of the where clause (see pandas expr.py/ops.py)
509 | * Some kind of lightweight joining? Could create dataset groups that all are allocated to
510 |   the same cache. Sub queries could then be used to query datasets based on data selected
511 |   from other datasets in the same dataset group.
512 |
513 | ************
514 | Contributing
515 | ************
516 | Want to contribute? That's great!
517 |
518 | If you experience problems, please log them on GitHub. If you want to contribute code,
519 | please fork the code and submit a pull request.
520 |
521 | If you intend to implement major features or make major changes please raise an issue
522 | so that we can discuss it first.
523 |
524 | Running tests
525 | =============
526 | .. code::
527 |
528 |    pip install -r dev-requirements.txt
529 |    invoke test
530 |
531 | TLS
532 | ===
533 | Some tests rely on a couple of certs found under `tls/`. If these have expired
534 | they have to be regenerated. This is done by executing `generate_test_certs.sh`
535 | from the `tls` directory.
536 |
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | invoke<=0.12.0
3 | freezegun
4 | pytest-cov
5 | flake8
--------------------------------------------------------------------------------
/qcache/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """QCache
4 |
5 | Usage:
6 |   qcache [-hd] [--port=PORT] [--size=MAX_SIZE] [--age=MAX_AGE] [--statistics-buffer-size=BUFFER_SIZE]
7 |          [--cert-file=PATH_TO_CERT] [--ca-file=PATH_TO_CA] [--basic-auth=<user>:<password>]
8 |
9 | Options:
10 |   -h --help                                 Show this screen
11 |   -p PORT --port=PORT                       Port [default: 8888]
12 |   -s MAX_SIZE --size=MAX_SIZE               Max cache size, bytes [default: 1000000000]
13 |   -a MAX_AGE --age=MAX_AGE                  Max age of cached item, seconds. 0 = never expire. [default: 0]
14 |   -b BUFFER_SIZE --statistics-buffer-size=BUFFER_SIZE  Number of entries to store in statistics
15 |                                             ring buffer. [default: 1000]
16 |   -c PATH_TO_CERT --cert-file=PATH_TO_CERT  Path to PEM file containing private key and certificate for SSL
17 |   -ca PATH_TO_CA --ca-file=PATH_TO_CA       Path to CA file, if provided client certificates will be checked against this ca
18 |   -d --debug                                Run in debug mode
19 |   -ba <user>:<password> --basic-auth=<user>:<password>  Enable basic auth, requires that SSL is enabled.
20 | """
21 |
22 | from docopt import docopt
23 | from qcache.app import run
24 |
25 | __version__ = "0.9.3"
26 | __author__ = "Tobias Gustafsson"
27 | __license__ = "MIT"
28 |
29 |
30 | def main():
31 |     """
32 |     Main entry point for the qcache server.
33 |     """
34 |     args = docopt(__doc__, version=__version__)
35 |
36 |     # Should be possible to solve this without casting to int...
37 |     if '--version' in args:
38 |         print __version__
39 |     else:
40 |         run(port=int(args['--port']),
41 |             max_cache_size=int(args['--size']),
42 |             max_age=int(args['--age']),
43 |             statistics_buffer_size=int(args['--statistics-buffer-size']),
44 |             debug=args['--debug'],
45 |             certfile=args['--cert-file'],
46 |             cafile=args['--ca-file'],
47 |             basic_auth=args['--basic-auth'])
48 |
49 | if __name__ == '__main__':
50 |     main()
--------------------------------------------------------------------------------
/qcache/app.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import json
3 | import re
4 | import ssl
5 | import time
6 | import gc
7 |
8 | from tornado.ioloop import IOLoop
9 | from tornado.web import RequestHandler, Application, url, HTTPError
10 |
11 | from qcache.dataset_cache import DatasetCache
12 | from qcache.compression import CompressedContentEncoding, decoded_body
13 | from qcache.qframe import MalformedQueryException, QFrame
14 | from qcache.statistics import Statistics
15 |
16 |
17 | class ResponseCode(object):
18 |     OK = 200
19 |     CREATED = 201
20 |
21 |     BAD_REQUEST = 400
22 |     UNAUTHORIZED = 401
23 |     NOT_FOUND = 404
24 |     NOT_ACCEPTABLE = 406
25 |     UNSUPPORTED_MEDIA_TYPE = 415
26 |
27 |
28 | CONTENT_TYPE_JSON = 'application/json'
29 | CONTENT_TYPE_CSV = 'text/csv'
30 | ACCEPTED_TYPES = {CONTENT_TYPE_JSON, CONTENT_TYPE_CSV}  # text/*, */*?
31 | CHARSET_REGEX = re.compile('charset=([A-Za-z0-9_-]+)')
32 |
33 | auth_user = None
34 | auth_password = None
35 |
36 |
37 | def auth_enabled():
38 |     return auth_user is not None and auth_password is not None
39 |
40 |
41 | def credentials_correct(provided_user, provided_password):
42 |     return provided_user == auth_user and provided_password == auth_password
43 |
44 |
45 | def http_auth(handler_class):
46 |     """
47 |     Basic auth decorator. Based on the decorator found here:
48 |     https://simplapi.wordpress.com/2014/03/26/python-tornado-and-decorator/
49 |     """
50 |
51 |     def set_401(handler):
52 |         handler.set_status(ResponseCode.UNAUTHORIZED)
53 |         handler.set_header('WWW-Authenticate', 'Basic realm=Restricted')
54 |         handler._transforms = []
55 |         handler.finish()
56 |
57 |     def wrap_execute(handler_execute):
58 |         def is_authenticated(handler):
59 |             if not auth_enabled():
60 |                 return True
61 |
62 |             auth_header = handler.request.headers.get('Authorization')
63 |             if auth_header is None or not auth_header.startswith('Basic '):
64 |                 set_401(handler)
65 |                 return False
66 |
67 |             auth_decoded = base64.decodestring(auth_header[6:])
68 |             user, password = auth_decoded.split(':', 1)  # maxsplit=1: passwords may contain ':'
69 |
70 |             if not credentials_correct(user, password):
71 |                 set_401(handler)
72 |                 return False
73 |
74 |             return True
75 |
76 |         def _execute(self, transforms, *args, **kwargs):
77 |             if not is_authenticated(self):
78 |                 return False
79 |
80 |             return handler_execute(self, transforms, *args, **kwargs)
81 |
82 |         return _execute
83 |
84 |     handler_class._execute = wrap_execute(handler_class._execute)
85 |     return handler_class
86 |
87 |
88 | class UTF8JSONDecoder(json.JSONDecoder):
89 |     def decode(self, json_string):
90 |         obj = super(UTF8JSONDecoder, self).decode(json_string)
91 |         assert isinstance(obj, list), "Must pass a list of objects"
92 |
93 |         for r in obj:
94 |             yield {k: v.encode(encoding='utf-8') if isinstance(v, unicode) else v for k, v in r.items()}
95 |
96 |
97 | class AppState(object):
98 |     def __init__(self):
99 |         self.query_count = 0
100 |
101 |
102 | @http_auth
103 | class DatasetHandler(RequestHandler):
104 |     def initialize(self, dataset_cache, state, stats):
105 |         self.dataset_cache = dataset_cache
106 |         self.state = state
107 |         self.stats = stats
108 |
109 |     def prepare(self):
110 |         self.request_start = time.time()
111 |
112 |     def on_finish(self):
113 |         if hasattr(self, 'operation'):
114 |             self.stats.append('{}_request_durations'.format(self.operation), time.time() - self.request_start)
115 |
116 |     def accept_type(self):
117 |         accept_types = [t.strip() for t in self.request.headers.get('Accept', CONTENT_TYPE_JSON).split(',')]
118 |         for t in accept_types:
119 |             if t in ACCEPTED_TYPES:
120 |                 return t
121 |
122 |         raise HTTPError(ResponseCode.NOT_ACCEPTABLE)
123 |
124 |     def content_type(self):
125 |         header = self.request.headers.get("Content-Type", CONTENT_TYPE_CSV).split(';')
126 |         content_type = header[0]
127 |         if content_type not in ACCEPTED_TYPES:
128 |             raise HTTPError(ResponseCode.UNSUPPORTED_MEDIA_TYPE,
129 |                             "Content-Type '{content_type}' not supported".format(content_type=content_type))
130 |
131 |         if len(header) > 1:
132 |             m = CHARSET_REGEX.match(header[1].strip())
133 |             if m and m.group(1) != 'utf-8':
134 |                 raise HTTPError(ResponseCode.UNSUPPORTED_MEDIA_TYPE,
135 |                                 "charset={charset} not supported, only utf-8".format(charset=m.group(1)))
136 |
137 |         return content_type
138 |
139 |     def header_to_key_values(self, header_name):
140 |         header_value = self.request.headers.get(header_name, None)
141 |         if not header_value:
142 |             return None
143 |
144 |         key_values = []
145 |         for key_value in header_value.split(';'):
146 |             key_values.append(tuple(s.strip() for s in key_value.split('=')))
147 |
148 |         return key_values
149 |
150 |     def dtypes(self):
151 |         types = self.header_to_key_values('X-QCache-types')
152 |         if not types:
153 |             return None
154 |
155 |         dtypes = {}
156 |         for column_name, type_name in types:
157 |             if type_name == 'string':
158 |                 dtypes[column_name] = 'object'
159 |             elif type_name == 'enum':
160 |                 dtypes[column_name] = 'category'
161 |             elif type_name == 'float':
162 |                 dtypes[column_name] = 'float64'
163 |             else:
164 |                 raise HTTPError(ResponseCode.BAD_REQUEST,
165 |                                 'Unrecognized type name "{type_name}" for column "{column_name}"'.format(
166 |                                     type_name=type_name, column_name=column_name))
167 |
168 |         return dtypes
169 |
170 |     def stand_in_columns(self):
171 |         return self.header_to_key_values('X-QCache-stand-in-columns')
172 |
173 |     def query(self, dataset_key, q):
174 |         t0 = time.time()
175 |         self.operation = 'query'
176 |         accept_type = self.accept_type()
177 |         if dataset_key not in self.dataset_cache:
178 |             self.stats.inc('miss_count')
179 |             raise HTTPError(ResponseCode.NOT_FOUND)
180 |
181 |         if self.dataset_cache.evict_if_too_old(dataset_key):
182 |             self.stats.inc('miss_count')
183 |             self.stats.inc('age_evict_count')
184 |             raise HTTPError(ResponseCode.NOT_FOUND)
185 |
186 |         qf = self.dataset_cache[dataset_key]
187 |         try:
188 |             result_frame = qf.query(q, stand_in_columns=self.stand_in_columns())
189 |         except MalformedQueryException as e:
190 |             self.write(json.dumps({'error': str(e)}))
191 |             self.set_status(ResponseCode.BAD_REQUEST)
192 |             return
193 |
194 |         self.set_header("Content-Type", "{content_type}; charset=utf-8".format(content_type=accept_type))
195 |         self.set_header("X-QCache-unsliced-length", result_frame.unsliced_df_len)
196 |         if accept_type == CONTENT_TYPE_CSV:
197 |             self.write(result_frame.to_csv())
198 |         else:
199 |             self.write(result_frame.to_json())
200 |
201 |         self.post_query_processing()
202 |         self.stats.inc('hit_count')
203 |         self.stats.append('query_durations', time.time() - t0)
204 |
205 |     def q_json_to_dict(self, q_json):
206 |         try:
207 |             return json.loads(q_json)
208 |         except ValueError:
209 |             self.write(json.dumps({'error': 'Could not load JSON: {json}'.format(json=q_json)}))
210 |             self.set_status(ResponseCode.BAD_REQUEST)
211 |
212 |         return None
213 |
214 |     def get(self, dataset_key, optional_q):
215 |         if optional_q:
216 |             # There should not be a q URL for the GET method, it's supposed to take
217 |             # q as a query parameter
218 |             raise HTTPError(ResponseCode.NOT_FOUND)
219 |
220 |         q_dict = self.q_json_to_dict(self.get_argument('q', default=''))
221 |         if q_dict is not None:
222 |             self.query(dataset_key, q_dict)
223 |
224 |     def post_query_processing(self):
225 |         if self.state.query_count % 10 == 0:
226 |             # Run a collect every now and then. It reduces the process memory consumption
227 |             # considerably but always doing it will impact query performance negatively.
228 |             gc.collect()
229 |
230 |         self.state.query_count += 1
231 |
232 |     def post(self, dataset_key, optional_q):
233 |         if optional_q:
234 |             q_dict = self.q_json_to_dict(decoded_body(self.request))
235 |             if q_dict is not None:
236 |                 self.query(dataset_key, q_dict)
237 |             return
238 |
239 |         t0 = time.time()
240 |         self.operation = 'store'
241 |         if dataset_key in self.dataset_cache:
242 |             self.stats.inc('replace_count')
243 |             del self.dataset_cache[dataset_key]
244 |
245 |         content_type = self.content_type()
246 |         input_data = decoded_body(self.request)
247 |         if content_type == CONTENT_TYPE_CSV:
248 |             durations_until_eviction = self.dataset_cache.ensure_free(len(input_data))
249 |             qf = QFrame.from_csv(input_data, column_types=self.dtypes(),
250 |                                  stand_in_columns=self.stand_in_columns())
251 |         else:
252 |             # This is a waste of CPU cycles, first the JSON decoder decodes all strings
253 |             # from UTF-8 then we immediately encode them back into UTF-8. Couldn't
254 |             # find an easy solution to this though.
255 |             durations_until_eviction = self.dataset_cache.ensure_free(len(input_data) / 2)
256 |             data = json.loads(input_data, cls=UTF8JSONDecoder)
257 |             qf = QFrame.from_dicts(data, stand_in_columns=self.stand_in_columns())
258 |
259 |         self.dataset_cache[dataset_key] = qf
260 |         self.set_status(ResponseCode.CREATED)
261 |         self.stats.inc('size_evict_count', count=len(durations_until_eviction))
262 |         self.stats.inc('store_count')
263 |         self.stats.append('store_row_counts', len(qf))
264 |         self.stats.append('store_durations', time.time() - t0)
265 |         self.stats.extend('durations_until_eviction', durations_until_eviction)
266 |         self.write("")
267 |
268 |     def delete(self, dataset_key, optional_q):
269 |         if optional_q:
270 |             # There should not be a q parameter for the delete method
271 |             raise HTTPError(ResponseCode.NOT_FOUND)
272 |
273 |         if dataset_key in self.dataset_cache:
274 |             del self.dataset_cache[dataset_key]
275 |
276 |         self.write("")
277 |
278 |
279 | @http_auth
280 | class StatusHandler(RequestHandler):
281 |     def get(self):
282 |         self.write("OK")
283 |
284 |
285 | @http_auth
286 | class StatisticsHandler(RequestHandler):
287 |     def initialize(self, dataset_cache, stats):
288 |         self.dataset_cache = dataset_cache
289 |         self.stats = stats
290 |
291 |     def get(self):
292 |         self.set_header("Content-Type", "application/json; charset=utf-8")
293 |         stats = self.stats.snapshot()
294 |         stats['dataset_count'] = len(self.dataset_cache)
295 |         stats['cache_size'] = self.dataset_cache.size
296 |         self.write(json.dumps(stats))
297 |
298 |
299 | def make_app(url_prefix='/qcache', debug=False, max_cache_size=1000000000, max_age=0,
300 |              statistics_buffer_size=1000, basic_auth=None):
301 |     if basic_auth:
302 |         global auth_user, auth_password
303 |         auth_user, auth_password = basic_auth.split(':', 1)  # maxsplit=1: passwords may contain ':'
304 |
305 |     stats = Statistics(buffer_size=statistics_buffer_size)
306 |     cache = DatasetCache(max_size=max_cache_size, max_age=max_age)
307 |     return Application([
308 |         url(r"{url_prefix}/dataset/([A-Za-z0-9\-_]+)/?(q)?".format(url_prefix=url_prefix),
309 |             DatasetHandler,
310 |             dict(dataset_cache=cache, state=AppState(), stats=stats),
311 |             name="dataset"),
312 |         url(r"{url_prefix}/status".format(url_prefix=url_prefix),
313 |             StatusHandler,
314 |             dict(),
315 |             name="status"),
316 |         url(r"{url_prefix}/statistics".format(url_prefix=url_prefix),
317 |             StatisticsHandler,
318 |             dict(dataset_cache=cache, stats=stats),
319 |             name="statistics")
320 |     ], debug=debug, transforms=[CompressedContentEncoding])
321 |
322 |
323 | def
ssl_options(certfile, cafile=None): 324 | if certfile: 325 | print "Enabling TLS" 326 | ssl_context = ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH, cafile=cafile) 327 | ssl_context.load_cert_chain(certfile) 328 | 329 | if cafile: 330 | print "Enabling client certificate verification" 331 | ssl_context.verify_mode = ssl.CERT_REQUIRED 332 | return dict(ssl_options=ssl_context) 333 | 334 | return {} 335 | 336 | 337 | def run(port=8888, max_cache_size=1000000000, max_age=0, statistics_buffer_size=1000, 338 | debug=False, certfile=None, cafile=None, basic_auth=None): 339 | if basic_auth and not certfile: 340 | print "TLS must be enabled to use basic auth!" 341 | return 342 | 343 | print("Starting on port {port}, max cache size {max_cache_size} bytes, max age {max_age} seconds," 344 | " statistics_buffer_size {statistics_buffer_size}, debug={debug},".format( 345 | port=port, max_cache_size=max_cache_size, max_age=max_age, 346 | statistics_buffer_size=statistics_buffer_size, debug=debug)) 347 | 348 | app = make_app( 349 | debug=debug, max_cache_size=max_cache_size, max_age=max_age, 350 | statistics_buffer_size=statistics_buffer_size, basic_auth=basic_auth) 351 | 352 | args = {} 353 | args.update(ssl_options(certfile=certfile, cafile=cafile)) 354 | app.listen(port, max_buffer_size=max_cache_size, **args) 355 | IOLoop.current().start() 356 | 357 | 358 | if __name__ == "__main__": 359 | run() 360 | -------------------------------------------------------------------------------- /qcache/compression.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | from io import BytesIO 3 | 4 | import lz4.block 5 | from tornado.web import OutputTransform, HTTPError 6 | 7 | 8 | GZIP_LEVEL = 6 9 | 10 | def gzip_dumps(string): 11 | buffer = BytesIO() 12 | file = gzip.GzipFile(mode='w', fileobj=buffer, compresslevel=GZIP_LEVEL) 13 | file.write(string) 14 | file.close() 15 | return buffer.getvalue() 16 | 17 | 18 | def gzip_loads(string): 19 | buffer = BytesIO(string) 20 | file = gzip.GzipFile(mode='r', fileobj=buffer) 21 | return file.read() 22 | 23 | 24 | ENCODINGS = { 25 | 'lz4': (lz4.block.decompress, lz4.block.compress), 26 | 'gzip': (gzip_loads, gzip_dumps), 27 | None: (lambda c: c, lambda c: c) 28 | } 29 | 30 | 31 | def decoded_body(request): 32 | encoding = request.headers.get('Content-Encoding') 33 | if encoding not in ENCODINGS: 34 | raise HTTPError(400, 35 | 'Unrecognized encoding "{encoding}"'.format(encoding=encoding)) 36 | 37 | return ENCODINGS[encoding][0](request.body) 38 | 39 | 40 | class CompressedContentEncoding(OutputTransform): 41 | """Applies compression to response. Prefers lz4 if accepted else uses gzip. 
42 | """ 43 | def __init__(self, request): 44 | accept_coding = request.headers.get("Accept-Encoding", "") 45 | if 'lz4' in accept_coding: 46 | self.encoding = 'lz4' 47 | elif 'gzip' in accept_coding: 48 | self.encoding = 'gzip' 49 | else: 50 | self.encoding = None 51 | 52 | super(CompressedContentEncoding, self).__init__(request) 53 | 54 | def transform_first_chunk(self, status_code, headers, chunk, finishing): 55 | if status_code != 200: 56 | # Only compress responses containing query data 57 | self.encoding = None 58 | 59 | if self.encoding: 60 | if not finishing: 61 | raise Exception("Multi chunk not accepted by QCache when applying compression") 62 | 63 | chunk = ENCODINGS[self.encoding][1](chunk) 64 | headers['Content-Encoding'] = self.encoding 65 | headers['Content-Length'] = str(len(chunk)) 66 | 67 | return status_code, headers, chunk 68 | 69 | def transform_chunk(self, chunk, finishing): 70 | return chunk 71 | -------------------------------------------------------------------------------- /qcache/dataset_cache.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | 4 | class CacheItem(object): 5 | def __init__(self, qframe): 6 | self.creation_time = time() 7 | self.last_access_time = self.creation_time 8 | self._qframe = qframe 9 | self.access_count = 0 10 | 11 | # 100 bytes is just a very rough estimate of the object overhead of this instance 12 | self.size = 100 + qframe.byte_size() 13 | 14 | @property 15 | def dataset(self): 16 | self.last_access_time = time() 17 | self.access_count += 1 18 | return self._qframe 19 | 20 | 21 | class DatasetCache(object): 22 | def __init__(self, max_size, max_age): 23 | self.max_size = max_size 24 | self.max_age = max_age 25 | self._cache_dict = {} 26 | self.size = 0.0 27 | 28 | def has_expired(self, item): 29 | return self.max_age and time() > item.creation_time + self.max_age 30 | 31 | def evict_if_too_old(self, key): 32 | if self.has_expired(self._cache_dict[key]): 33 | del self[key] 34 | return True 35 | 36 | return False 37 | 38 | def __contains__(self, key): 39 | return key in self._cache_dict 40 | 41 | def __getitem__(self, item): 42 | return self._cache_dict[item].dataset 43 | 44 | def __setitem__(self, key, qframe): 45 | current_size = 0.0 46 | if key in self._cache_dict: 47 | current_size = self._cache_dict[key].size 48 | 49 | new_item = CacheItem(qframe) 50 | self.size += new_item.size - current_size 51 | self._cache_dict[key] = new_item 52 | 53 | def __delitem__(self, key): 54 | self.size -= self._cache_dict[key].size 55 | del self._cache_dict[key] 56 | 57 | def __len__(self): 58 | return len(self._cache_dict) 59 | 60 | def ensure_free(self, byte_count): 61 | """ 62 | :return: A list of durations in seconds that the dataset spent in the cache before 63 | being evicted. 
64 | """ 65 | if byte_count > self.max_size: 66 | raise Exception('Impossible to allocate') 67 | 68 | if self.max_size - self.size >= byte_count: 69 | return [] 70 | 71 | # This is not very efficient but good enough for now 72 | lru_datasets = sorted(self._cache_dict.items(), key=lambda item: item[1].last_access_time) 73 | now = time() 74 | durations_until_eviction = [] 75 | for key, _ in lru_datasets: 76 | durations_until_eviction.append(now - self._cache_dict[key].creation_time) 77 | del self[key] 78 | if self.max_size - self.size >= byte_count: 79 | break 80 | 81 | return durations_until_eviction 82 | -------------------------------------------------------------------------------- /qcache/qframe/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from StringIO import StringIO 4 | 5 | import numpy 6 | from pandas import DataFrame, pandas 7 | 8 | from qcache.qframe.common import unquote, MalformedQueryException 9 | from qcache.qframe.context import set_current_qframe 10 | from qcache.qframe.query import query 11 | from qcache.qframe.update import update_frame 12 | 13 | 14 | def _get_dtype(obj): 15 | try: 16 | try: 17 | int(obj) 18 | return numpy.int64 19 | except ValueError: 20 | float(obj) 21 | return numpy.float64 22 | except ValueError: 23 | return numpy.object 24 | 25 | 26 | def _add_stand_in_columns(df, stand_in_columns): 27 | if not stand_in_columns: 28 | return df 29 | 30 | for column_name, stand_in_value in stand_in_columns: 31 | if column_name not in df: 32 | if stand_in_value in df: 33 | df.loc[:, column_name] = df[stand_in_value] 34 | else: 35 | dtype = _get_dtype(stand_in_value) 36 | stand_in_value = unquote(stand_in_value) 37 | arr = numpy.full(len(df), stand_in_value, dtype=dtype) 38 | df.loc[:, column_name] = pandas.Series(arr, index=df.index) 39 | 40 | 41 | class QFrame(object): 42 | """ 43 | Thin wrapper around a Pandas dataframe. 44 | """ 45 | __slots__ = ('df', 'unsliced_df_len') 46 | 47 | def __init__(self, pandas_df, unsliced_df_len=None): 48 | self.unsliced_df_len = len(pandas_df) if unsliced_df_len is None else unsliced_df_len 49 | self.df = pandas_df 50 | 51 | @staticmethod 52 | def from_csv(csv_string, column_types=None, stand_in_columns=None): 53 | df = pandas.read_csv(StringIO(csv_string), dtype=column_types, na_values=[''], keep_default_na=False) 54 | _add_stand_in_columns(df, stand_in_columns) 55 | return QFrame(df) 56 | 57 | @staticmethod 58 | def from_dicts(d, column_types=None, stand_in_columns=None): 59 | df = DataFrame.from_records(d) 60 | 61 | # Setting columns to categorials is slightly awkward from dicts 62 | # than from CSV... 63 | if column_types: 64 | for name, type in column_types.items(): 65 | if type == 'category': 66 | df[name] = df[name].astype("category") 67 | 68 | _add_stand_in_columns(df, stand_in_columns=stand_in_columns) 69 | return QFrame(df) 70 | 71 | def query(self, q, stand_in_columns=None): 72 | _add_stand_in_columns(self.df, stand_in_columns) 73 | set_current_qframe(self) 74 | if 'update' in q: 75 | # In place operation, should it be? 
76 | update_frame(self.df, q) 77 | return None 78 | 79 | new_df, unsliced_df_len = query(self.df, q) 80 | return QFrame(new_df, unsliced_df_len=unsliced_df_len) 81 | 82 | def to_csv(self): 83 | return self.df.to_csv(index=False) 84 | 85 | def to_json(self): 86 | return self.df.to_json(orient='records') 87 | 88 | def to_dicts(self): 89 | return self.df.to_dict(orient='records') 90 | 91 | @property 92 | def columns(self): 93 | return self.df.columns 94 | 95 | def __len__(self): 96 | return len(self.df) 97 | 98 | def byte_size(self): 99 | # Estimate of the number of bytes consumed by this QFrame 100 | return self.df.memory_usage(index=True, deep=True).sum() 101 | -------------------------------------------------------------------------------- /qcache/qframe/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | 4 | class MalformedQueryException(Exception): 5 | pass 6 | 7 | 8 | def raise_malformed(message, q): 9 | raise MalformedQueryException(message + ': {q}'.format(q=q)) 10 | 11 | 12 | def assert_integer(name, i): 13 | if not isinstance(i, (int, long)): 14 | raise_malformed('Invalid type for {name}'.format(name=name), i) 15 | 16 | 17 | def assert_list(name, l): 18 | if not isinstance(l, list): 19 | raise_malformed('Invalid format for {name}'.format(name=name), l) 20 | 21 | 22 | def assert_len(q, expected, error_message="Invalid number of arguments"): 23 | if len(q) != expected: 24 | raise_malformed(error_message, q) 25 | 26 | 27 | def is_quoted(string): 28 | l = len(string) 29 | return (l >= 2) and \ 30 | ((string[0] == "'" and string[-1] == "'") or 31 | (string[0] == '"' and string[-1] == '"')) 32 | 33 | 34 | def unquote(s): 35 | if s.startswith("'") or s.startswith('"'): 36 | s = s[1:] 37 | 38 | if s.endswith("'") or s.endswith('"'): 39 | s = s[:-1] 40 | 41 | return s 42 | -------------------------------------------------------------------------------- /qcache/qframe/constants.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | COMPARISON_OPERATORS = {'==': operator.eq, 4 | '!=': operator.ne, 5 | '<': operator.lt, 6 | '<=': operator.le, 7 | '>': operator.gt, 8 | '>=': operator.ge} -------------------------------------------------------------------------------- /qcache/qframe/context.py: -------------------------------------------------------------------------------- 1 | """ 2 | Context to keep track of the qframe that is currently being operated on. 3 | 4 | NB! Not thread safe and not safe for interleaved operations on multiple frames. 
5 | """ 6 | 7 | _current_qframe = None 8 | 9 | 10 | def set_current_qframe(qframe): 11 | global _current_qframe 12 | _current_qframe = qframe 13 | 14 | 15 | def get_current_qframe(): 16 | return _current_qframe 17 | -------------------------------------------------------------------------------- /qcache/qframe/pandas_filter.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import operator 4 | 5 | import numpy 6 | 7 | from qcache.qframe.common import assert_list, raise_malformed, is_quoted, unquote, assert_len 8 | from qcache.qframe.constants import COMPARISON_OPERATORS 9 | from qcache.qframe.context import get_current_qframe 10 | 11 | JOINING_OPERATORS = {'&': operator.and_, 12 | '|': operator.or_} 13 | 14 | 15 | def _leaf_node(df, q): 16 | if isinstance(q, basestring): 17 | if is_quoted(q): 18 | return q[1:-1].encode('utf-8') 19 | 20 | try: 21 | return df[q] 22 | except KeyError: 23 | raise_malformed("Unknown column", q) 24 | 25 | return q 26 | 27 | 28 | def _bitwise_filter(df, q): 29 | assert_len(q, 3) 30 | op, column, arg = q 31 | if not isinstance(arg, (int, long)): 32 | raise_malformed('Invalid argument type, must be an integer:'.format(t=type(arg)), q) 33 | 34 | try: 35 | series = df[column] & arg 36 | if op == "any_bits": 37 | return series > 0 38 | return series == arg 39 | except TypeError: 40 | raise_malformed("Invalid column type, must be an integer", q) 41 | 42 | 43 | def _not_filter(df, q): 44 | assert_len(q, 2, "! is a single arity operator, invalid number of arguments") 45 | return ~_do_pandas_filter(df, q[1]) 46 | 47 | 48 | def _isnull_filter(df, q): 49 | assert_len(q, 2, "isnull is a single arity operator, invalid number of arguments") 50 | 51 | # Slightly hacky but the only way I've come up with so far. 52 | return df[q[1]] != df[q[1]] 53 | 54 | 55 | def _comparison_filter(df, q): 56 | assert_len(q, 3) 57 | op, col_name, arg = q 58 | return COMPARISON_OPERATORS[op](df[col_name], _do_pandas_filter(df, arg)) 59 | 60 | 61 | def _join_filter(df, q): 62 | result = None 63 | if len(q) < 2: 64 | raise_malformed("Invalid number of arguments", q) 65 | elif len(q) == 2: 66 | # Conjunctions and disjunctions with only one clause are OK 67 | result = _do_pandas_filter(df, q[1]) 68 | else: 69 | result = reduce(lambda l, r: JOINING_OPERATORS[q[0]](l, _do_pandas_filter(df, r)), 70 | q[2:], _do_pandas_filter(df, q[1])) 71 | 72 | return result 73 | 74 | 75 | def prepare_in_clause(q): 76 | """ 77 | The arguments to an in expression may be either a list of values or 78 | a sub query which is then executed to produce a list of values. 
79 | """ 80 | assert_len(q, 3) 81 | _, col_name, args = q 82 | 83 | if isinstance(args, dict): 84 | # Sub query, circular dependency on query by nature so need to keep the import local 85 | from qcache.qframe import query 86 | current_qframe = get_current_qframe() 87 | sub_df, _ = query(current_qframe.df, args) 88 | try: 89 | args = sub_df[col_name].values 90 | except KeyError: 91 | raise_malformed('Unknown column "{}"'.format(col_name), q) 92 | 93 | if not isinstance(args, (list, numpy.ndarray)): 94 | raise_malformed("Second argument must be a list", q) 95 | 96 | return col_name, args 97 | 98 | 99 | def _in_filter(df, q): 100 | col_name, args = prepare_in_clause(q) 101 | return df[col_name].isin(args) 102 | 103 | 104 | def _like_filter(df, q): 105 | assert_len(q, 3) 106 | op, column, raw_expr = q 107 | 108 | if not is_quoted(raw_expr): 109 | raise_malformed("like expects a quoted string as second argument", q) 110 | 111 | regexp = unquote(raw_expr) 112 | 113 | if not regexp.startswith('%'): 114 | regexp = '^' + regexp 115 | else: 116 | regexp = regexp[1:] 117 | 118 | if not regexp.endswith('%'): 119 | regexp += '$' 120 | else: 121 | regexp = regexp[:-1] 122 | 123 | # 'like' is case sensitive, 'ilike' is case insensitive 124 | case = op == 'like' 125 | 126 | try: 127 | return df[column].str.contains(regexp, case=case, na=False) 128 | except AttributeError: 129 | raise_malformed("Invalid column type for (i)like", q) 130 | 131 | 132 | def _do_pandas_filter(df, q): 133 | if not isinstance(q, list): 134 | return _leaf_node(df, q) 135 | 136 | if not q: 137 | raise_malformed("Empty expression not allowed", q) 138 | 139 | result = None 140 | op = q[0] 141 | try: 142 | if op in ('any_bits', 'all_bits'): 143 | result = _bitwise_filter(df, q) 144 | elif op == "!": 145 | result = _not_filter(df, q) 146 | elif op == "isnull": 147 | result = _isnull_filter(df, q) 148 | elif op in COMPARISON_OPERATORS: 149 | result = _comparison_filter(df, q) 150 | elif op in JOINING_OPERATORS: 151 | result = _join_filter(df, q) 152 | elif op == 'in': 153 | result = _in_filter(df, q) 154 | elif op in ('like', 'ilike'): 155 | result = _like_filter(df, q) 156 | else: 157 | raise_malformed("Unknown operator", q) 158 | except KeyError: 159 | raise_malformed("Column is not defined", q) 160 | except TypeError: 161 | raise_malformed("Invalid type in argument", q) 162 | 163 | return result 164 | 165 | 166 | def pandas_filter(df, filter_q): 167 | if filter_q: 168 | assert_list('where', filter_q) 169 | return df[_do_pandas_filter(df, filter_q)] 170 | 171 | return df 172 | -------------------------------------------------------------------------------- /qcache/qframe/query.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | import re 3 | 4 | from pandas import DataFrame 5 | from pandas.core.computation.ops import UndefinedVariableError 6 | from pandas.core.groupby import DataFrameGroupBy 7 | from qcache.qframe.pandas_filter import pandas_filter 8 | from qcache.qframe.common import assert_list, assert_integer, raise_malformed, MalformedQueryException 9 | 10 | 11 | CLAUSE_WHERE = 'where' 12 | CLAUSE_GROUP_BY = 'group_by' 13 | CLAUSE_DISTINCT = 'distinct' 14 | CLAUSE_SELECT = 'select' 15 | CLAUSE_ORDER_BY = 'order_by' 16 | CLAUSE_OFFSET = 'offset' 17 | CLAUSE_LIMIT = 'limit' 18 | CLAUSE_FROM = 'from' 19 | QUERY_CLAUSES = {CLAUSE_WHERE, CLAUSE_GROUP_BY, CLAUSE_DISTINCT, CLAUSE_SELECT, 20 | CLAUSE_ORDER_BY, CLAUSE_OFFSET, CLAUSE_LIMIT, CLAUSE_FROM} 21 | 22 
| 23 | def _group_by(dataframe, group_by_q): 24 | if not group_by_q: 25 | return dataframe 26 | 27 | assert_list('group_by', group_by_q) 28 | 29 | try: 30 | return dataframe.groupby(group_by_q, as_index=False) 31 | except KeyError: 32 | raise_malformed('Group by column not in table', group_by_q) 33 | 34 | 35 | def is_aggregate_function(expr): 36 | return type(expr) is list and len(expr) == 2 37 | 38 | 39 | def is_alias_assignment(expr): 40 | """ 41 | Examples: 42 | ['=', 'column_name', 1] Constant assignment 43 | ['=', 'column_name', 'other_column'] Basic aliasing 44 | ['=', 'column_name', ['sin', 'column_name']] 45 | ['=', 'column_name', ['+', 'column_name', 'other_column']] Complex calculations 46 | """ 47 | return type(expr) is list and len(expr) == 3 and expr[0] == '=' 48 | 49 | 50 | def _aggregate(dataframe_group_by, project_q, aggregate_fns): 51 | if not aggregate_fns: 52 | raise_malformed("Aggregate function required when group_by is specified", project_q) 53 | 54 | try: 55 | return dataframe_group_by.agg(aggregate_fns) 56 | except AttributeError as e: 57 | functions = [fn_name for fn_name in aggregate_fns.values() if fn_name in str(e)] 58 | raise_malformed("Unknown aggregation function '{fn}'".format(fn=functions[0]), project_q) 59 | 60 | 61 | def _aggregate_without_group_by(dataframe, project_q, aggregate_fns): 62 | if len(aggregate_fns) != len(project_q): 63 | raise_malformed('Cannot mix aggregation functions and columns without group_by clause', project_q) 64 | 65 | results = {} 66 | for column_name, fn_name in aggregate_fns.items(): 67 | # Intricate, apply the selected function to the selected column 68 | temp_dataframe = dataframe[[column_name]] 69 | fn = getattr(temp_dataframe, fn_name, None) 70 | if not fn or not callable(fn): 71 | raise_malformed('Unknown aggregation function', project_q) 72 | 73 | results[column_name] = [fn(axis=0)[0]] 74 | 75 | # The result must be a data frame 76 | return DataFrame.from_dict(results) 77 | 78 | ALIAS_STRING = "^([A-Za-z0-9_-]+)$" 79 | ALIAS_RE = re.compile(ALIAS_STRING) 80 | 81 | 82 | def _build_eval_expression(expr): 83 | if type(expr) is list: 84 | if len(expr) == 3: 85 | arg1 = _build_eval_expression(expr[1]) 86 | arg2 = _build_eval_expression(expr[2]) 87 | op = expr[0] 88 | return "({arg1} {op} {arg2})".format(arg1=arg1, op=op, arg2=arg2) 89 | 90 | if len(expr) == 2: 91 | arg1 = _build_eval_expression(expr[1]) 92 | op = expr[0] 93 | return "{op}({arg1})".format(op=op, arg1=arg1) 94 | 95 | raise_malformed('Invalid number of arguments', expr) 96 | 97 | return expr 98 | 99 | 100 | def _alias(dataframe, expressions): 101 | result_frame = dataframe 102 | for expression in expressions: 103 | destination, source = expression[1], expression[2] 104 | if not isinstance(destination, basestring): 105 | raise_malformed('Invalid alias, must be a string', expression) 106 | 107 | if not re.match(ALIAS_RE, destination): 108 | raise_malformed('Invalid alias, must match {alias}'.format(alias=ALIAS_STRING), expression) 109 | 110 | eval_expr = _build_eval_expression(source) 111 | try: 112 | result_frame = result_frame.eval('{destination} = {expr}'.format(destination=destination, expr=eval_expr), inplace=False) 113 | except (SyntaxError, ValueError): 114 | raise_malformed('Unknown function in alias', source) 115 | 116 | return result_frame 117 | 118 | 119 | def classify_expressions(project_q): 120 | aggregate_functions = {} 121 | alias_expressions = [] 122 | for expression in project_q: 123 | if is_aggregate_function(expression): 124 | 
aggregate_functions[expression[1]] = expression[0] 125 | elif is_alias_assignment(expression): 126 | alias_expressions.append(expression) 127 | elif type(expression) is list: 128 | raise_malformed('Invalid expression in select', expression) 129 | 130 | return aggregate_functions, alias_expressions 131 | 132 | 133 | def _project(dataframe, project_q): 134 | if not project_q: 135 | return dataframe 136 | 137 | assert_list('project', project_q) 138 | 139 | if project_q == [['count']]: 140 | # Special case for count only, ~equal to SQL count(*) 141 | return DataFrame.from_dict({'count': [len(dataframe)]}) 142 | 143 | aggregate_fns, alias_expressions = classify_expressions(project_q) 144 | 145 | if aggregate_fns and alias_expressions: 146 | raise_malformed("Cannot mix aliasing and aggregation functions", project_q) 147 | 148 | if isinstance(dataframe, DataFrameGroupBy): 149 | dataframe = _aggregate(dataframe, project_q, aggregate_fns) 150 | elif aggregate_fns: 151 | return _aggregate_without_group_by(dataframe, project_q, aggregate_fns) 152 | elif alias_expressions: 153 | dataframe = _alias(dataframe, alias_expressions) 154 | else: 155 | # Nothing to do here 156 | pass 157 | 158 | columns = [e if type(e) is not list else e[1] for e in project_q] 159 | 160 | try: 161 | return dataframe[columns] 162 | except KeyError: 163 | missing_columns = set(columns) - set(dataframe.columns.values) 164 | raise_malformed("Selected columns not in table", list(missing_columns)) 165 | 166 | 167 | def _order_by(dataframe, order_q): 168 | if not order_q: 169 | return dataframe 170 | 171 | assert_list('order_by', order_q) 172 | if not all(isinstance(c, basestring) for c in order_q): 173 | raise_malformed("Invalid order by format", order_q) 174 | 175 | columns = [e[1:] if e.startswith('-') else e for e in order_q] 176 | ascending = [not e.startswith('-') for e in order_q] 177 | 178 | try: 179 | return dataframe.sort_values(by=columns, ascending=ascending) 180 | except KeyError: 181 | raise_malformed("Order by column not in table", columns) 182 | 183 | 184 | def _do_slice(dataframe, offset, limit): 185 | if offset: 186 | assert_integer('offset', offset) 187 | dataframe = dataframe[offset:] 188 | 189 | if limit: 190 | assert_integer('limit', limit) 191 | dataframe = dataframe[:limit] 192 | 193 | return dataframe 194 | 195 | 196 | def _distinct(dataframe, columns): 197 | if columns is None: 198 | return dataframe 199 | 200 | args = {} 201 | if columns: 202 | args['subset'] = columns 203 | 204 | return dataframe.drop_duplicates(**args) 205 | 206 | 207 | def query(dataframe, q): 208 | if not isinstance(q, dict): 209 | raise MalformedQueryException('Query must be a dictionary, not "{q}"'.format(q=q)) 210 | 211 | key_set = set(q.keys()) 212 | if not key_set.issubset(QUERY_CLAUSES): 213 | raise MalformedQueryException('Unknown query clauses: {keys}'.format( 214 | keys=', '.join(key_set.difference(QUERY_CLAUSES)))) 215 | 216 | try: 217 | if CLAUSE_FROM in q: 218 | dataframe, _ = query(dataframe, q[CLAUSE_FROM]) 219 | 220 | filtered_df = pandas_filter(dataframe, q.get('where')) 221 | grouped_df = _group_by(filtered_df, q.get('group_by')) 222 | distinct_df = _distinct(grouped_df, q.get('distinct')) 223 | projected_df = _project(distinct_df, q.get('select')) 224 | ordered_df = _order_by(projected_df, q.get('order_by')) 225 | sliced_df = _do_slice(ordered_df, q.get('offset'), q.get('limit')) 226 | return sliced_df, len(ordered_df) 227 | except UndefinedVariableError as e: 228 | raise MalformedQueryException(str(e)) 229 | 
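A minimal usage sketch of query() above; the frame contents and column names
are invented and only pandas plus this module are assumed:

    import pandas as pd
    from qcache.qframe.query import query

    df = pd.DataFrame([{'color': 'red', 'size': 10},
                       {'color': 'red', 'size': 20},
                       {'color': 'blue', 'size': 5}])

    # where -> group_by -> select (aggregation) -> order_by -> slice
    result, unsliced_len = query(df, {'where': ['>', 'size', 1],
                                      'group_by': ['color'],
                                      'select': ['color', ['sum', 'size']],
                                      'order_by': ['-size'],
                                      'limit': 1})
    # result holds the single 'red' row (summed size 30); unsliced_len is 2,
    # the row count before the limit was applied, useful for pagination.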
-------------------------------------------------------------------------------- /qcache/qframe/update.py: --------------------------------------------------------------------------------
1 | from qcache.qframe.common import assert_len, raise_malformed, is_quoted, unquote
2 | from qcache.qframe.constants import COMPARISON_OPERATORS
3 | 
4 | 
5 | def _prepare_arg(df, arg):
6 |     if isinstance(arg, basestring):
7 |         if is_quoted(arg):
8 |             return unquote(arg)
9 | 
10 |         return getattr(df, arg)
11 | 
12 |     return arg
13 | 
14 | 
15 | def _build_update_filter(df, update_q):
16 |     if type(update_q) is not list:
17 |         raise_malformed("Expressions must be lists", update_q)
18 | 
19 |     if not update_q:
20 |         raise_malformed("Empty expression not allowed", update_q)
21 | 
22 |     operator = update_q[0]
23 |     if operator == "isnull":
24 |         assert_len(update_q, 2, 'Invalid length of isnull query')
25 |         try:
26 |             return getattr(_prepare_arg(df, update_q[1]), 'isnull')()
27 |         except AttributeError:
28 |             raise_malformed("Unknown column for 'isnull'", update_q)
29 | 
30 |     if operator == "in":
31 |         if len(update_q) != 3:
32 |             raise_malformed("Invalid length of 'in' query", update_q)
33 | 
34 |         _, column, values = update_q
35 |         if column not in df:
36 |             raise_malformed("First argument to 'in' must be a column present in frame", update_q)
37 | 
38 |         if not isinstance(values, (list, tuple)):
39 |             raise_malformed("Second argument to 'in' must be a list", update_q)
40 | 
41 |         return getattr(df, column).isin([_prepare_arg(df, val) for val in values])
42 | 
43 |     if operator in COMPARISON_OPERATORS:
44 |         arg1 = _prepare_arg(df, update_q[1])
45 |         arg2 = _prepare_arg(df, update_q[2])
46 |         return COMPARISON_OPERATORS[operator](arg1, arg2)
47 | 
48 |     raise_malformed("Unknown operator '{operator}'".format(operator=operator), update_q)
49 | 
50 | 
51 | def _build_update_values(df, updates):
52 |     columns, values = zip(*updates)
53 |     return columns, [_prepare_arg(df, val) for val in values]
54 | 
55 | 
56 | def classify_updates(q):
57 |     # Updates can be either simple assignments or self-referring updates (e.g. column += 1).
58 |     # The former can be applied all at once, while pandas only supports updates of one column
59 |     # at a time for the latter. All updates are performed in the order they are declared
60 |     # in the query.
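    # As an illustration, a hypothetical update list (not from the code base)
    #     [['x', 1], ['y', 2], ['+', 'z', 3], ['w', 4]]
    # yields ('simple', [['x', 1], ['y', 2]]), then ('self-referring',
    # ['+', 'z', 3]) and finally ('simple', [['w', 4]]).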
61 | simple_run = [] 62 | for update in q['update']: 63 | if not isinstance(update, (list, tuple)): 64 | raise_malformed("Invalid update clause", update) 65 | 66 | if len(update) == 2: 67 | simple_run.append(update) 68 | else: 69 | if simple_run: 70 | yield ('simple', simple_run) 71 | simple_run = [] 72 | yield ('self-referring', update) 73 | 74 | if simple_run: 75 | yield ('simple', simple_run) 76 | 77 | 78 | def apply_operation(df, update_filter, column, op, value): 79 | # This is repetitive and ugly but the only way I've found to do in place updates 80 | if op == '+': 81 | df.ix[update_filter, column] += value 82 | elif op == '-': 83 | df.ix[update_filter, column] -= value 84 | elif op == '*': 85 | df.ix[update_filter, column] *= value 86 | elif op == '/': 87 | df.ix[update_filter, column] /= value 88 | elif op == '<<': 89 | df.ix[update_filter, column] <<= value 90 | elif op == '>>': 91 | df.ix[update_filter, column] >>= value 92 | elif op == '&': 93 | df.ix[update_filter, column] &= value 94 | elif op == '|': 95 | df.ix[update_filter, column] |= value 96 | elif op == '^': 97 | df.ix[update_filter, column] ^= value 98 | elif op == '%': 99 | df.ix[update_filter, column] %= value 100 | elif op == '**': 101 | df.ix[update_filter, column] **= value 102 | else: 103 | raise_malformed('Invalid update operator', (op, value, column)) 104 | 105 | 106 | def update_frame(df, q): 107 | update_filter = _build_update_filter(df, q['where']) 108 | for update_type, updates in classify_updates(q): 109 | if update_type == 'simple': 110 | columns, values = _build_update_values(df, updates) 111 | df.ix[update_filter, columns] = values 112 | else: 113 | op, column, value = updates 114 | apply_operation(df, update_filter, column, op, value) 115 | -------------------------------------------------------------------------------- /qcache/statistics.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import json 3 | import time 4 | 5 | 6 | class Statistics(object): 7 | def __init__(self, buffer_size): 8 | self.buffer_size = buffer_size 9 | self.reset() 10 | 11 | def inc(self, stat_name, count=1): 12 | if stat_name not in self.stats: 13 | self.stats[stat_name] = 0 14 | 15 | self.stats[stat_name] += count 16 | 17 | def append(self, stat_name, value): 18 | if stat_name not in self.stats: 19 | self.stats[stat_name] = deque(maxlen=self.buffer_size) 20 | 21 | self.stats[stat_name].append(value) 22 | 23 | def extend(self, stat_name, values): 24 | if stat_name not in self.stats: 25 | self.stats[stat_name] = deque(maxlen=self.buffer_size) 26 | 27 | self.stats[stat_name].extend(values) 28 | 29 | def reset(self, timestamp=None): 30 | if timestamp is None: 31 | timestamp = time.time() 32 | self.stats = {'since': timestamp, 33 | 'statistics_buffer_size': self.buffer_size} 34 | 35 | def snapshot(self): 36 | """ 37 | Create a statistics snapshot. This will reset the statistics. 
38 | """ 39 | snapshot = self.stats.copy() 40 | for k, v in snapshot.items(): 41 | if isinstance(v, deque): 42 | snapshot[k] = list(v) 43 | 44 | timestamp = time.time() 45 | snapshot['statistics_duration'] = timestamp - snapshot['since'] 46 | del snapshot['since'] 47 | self.reset() 48 | return snapshot 49 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E122, E126, E127, E266, E241 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from setuptools import setup 4 | 5 | 6 | REQUIRES = [ 7 | 'docopt==0.6.2', 8 | 'numpy==1.13.3', 9 | 'pandas==0.20.3', 10 | 'tornado==5.1.1', 11 | 'lz4==2.1.6' 12 | ] 13 | 14 | 15 | def find_version(fname): 16 | '''Attempts to find the version number in the file names fname. 17 | Raises RuntimeError if not found. 18 | ''' 19 | version = '' 20 | with open(fname, 'r') as fp: 21 | reg = re.compile(r'__version__ = [\'"]([^\'"]*)[\'"]') 22 | for line in fp: 23 | m = reg.match(line) 24 | if m: 25 | version = m.group(1) 26 | break 27 | if not version: 28 | raise RuntimeError('Cannot find version information') 29 | return version 30 | 31 | __version__ = find_version("qcache/__init__.py") 32 | 33 | 34 | def read(fname): 35 | with open(fname) as fp: 36 | content = fp.read() 37 | return content 38 | 39 | setup( 40 | name='qcache', 41 | version=__version__, 42 | description='In memory cache server with analytical query capabilities', 43 | long_description=read("README.rst"), 44 | author='Tobias Gustafsson', 45 | author_email='tobias.l.gustafsson@gmail.com', 46 | url='https://github.com/tobgu/qcache', 47 | install_requires=REQUIRES, 48 | license="MIT", 49 | zip_safe=False, 50 | keywords='qcache', 51 | classifiers=[ 52 | 'Development Status :: 4 - Beta', 53 | 'Intended Audience :: Developers', 54 | 'License :: OSI Approved :: MIT License', 55 | 'Natural Language :: English', 56 | "Programming Language :: Python :: 2", 57 | 'Programming Language :: Python :: 2.7', 58 | 'Programming Language :: Python :: Implementation :: CPython', 59 | ], 60 | packages=["qcache", "qcache.qframe"], 61 | entry_points={ 62 | 'console_scripts': [ 63 | "qcache = qcache:main" 64 | ] 65 | } 66 | ) 67 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from invoke import task, run 5 | from qcache import __version__ as qcache_version 6 | 7 | docs_dir = 'docs' 8 | build_dir = os.path.join(docs_dir, '_build') 9 | 10 | 11 | @task 12 | def test(): 13 | run('python -m pytest -s -v -m "not benchmark"', pty=True) 14 | 15 | 16 | @task 17 | def test_limited(limit_by): 18 | run('python -m pytest -s -v -m "not benchmark" -k{}'.format(limit_by), pty=True) 19 | 20 | 21 | @task 22 | def benchmark(): 23 | run('python -m pytest -s -v -m "benchmark"', pty=True) 24 | 25 | 26 | @task 27 | def coverage(): 28 | run('python -m pytest --cov=qcache', pty=True) 29 | run('coverage report -m', pty=True) 30 | run('coverage html', pty=True) 31 | 32 | 33 | @task 34 | def flake8(): 35 | run("flake8 qcache test") 36 | 37 | 38 | @task 39 | def clean(): 40 | run("rm -rf build") 41 | run("rm -rf dist") 42 | run("rm -rf 
qcache.egg-info") 43 | clean_docs() 44 | print("Cleaned up.") 45 | 46 | 47 | @task 48 | def clean_docs(): 49 | run("rm -rf %s" % build_dir) 50 | 51 | 52 | @task 53 | def browse_docs(): 54 | run("open %s" % os.path.join(build_dir, 'index.html')) 55 | 56 | 57 | @task 58 | def build_docs(clean=False, browse=False): 59 | if clean: 60 | clean_docs() 61 | run("sphinx-build %s %s" % (docs_dir, build_dir), pty=True) 62 | if browse: 63 | browse_docs() 64 | 65 | 66 | @task 67 | def readme(browse=False): 68 | run('rst2html.py README.rst > README.html') 69 | 70 | 71 | @task 72 | def publish(test=False): 73 | """Publish to the cheeseshop.""" 74 | if test: 75 | run('python setup.py register -r pypitest sdist upload -r pypitest') 76 | else: 77 | run("python setup.py register sdist upload") 78 | 79 | 80 | @task 81 | def install(): 82 | run('python setup.py sdist install') 83 | 84 | 85 | @task 86 | def build_image(): 87 | run("sudo docker build -t tobgu/qcache:{version} .".format(version=qcache_version)) 88 | run("sudo docker tag tobgu/qcache:{version} tobgu/qcache:latest".format(version=qcache_version)) 89 | 90 | 91 | @task 92 | def push_image(): 93 | run("sudo docker push tobgu/qcache:{version}".format(version=qcache_version)) 94 | run("sudo docker push tobgu/qcache:latest") 95 | 96 | 97 | @task 98 | def tag(): 99 | run("git tag -fa v{version} -m 'v{version}'".format(version=qcache_version)) 100 | -------------------------------------------------------------------------------- /test/performance_run.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from time import time 3 | 4 | if __name__ == '__main__': 5 | i = 0 6 | results = [] 7 | t0 = time() 8 | while i <= 1000: 9 | requests.get('http://localhost:8088/status') 10 | t1 = time() 11 | results.append(t1 - t0) 12 | t0 = t1 13 | i += 1 14 | 15 | results.sort() 16 | print "Median: %s, 90perc: %s, 99perc: %s" % (results[500], results[900], results[990]) 17 | -------------------------------------------------------------------------------- /test/test_api.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import json 3 | import os 4 | 5 | import lz4 as lz4 6 | import ssl 7 | 8 | from tornado.httputil import url_concat 9 | from tornado.testing import AsyncHTTPTestCase 10 | from freezegun import freeze_time 11 | 12 | import qcache 13 | import qcache.app as app 14 | import csv 15 | from StringIO import StringIO 16 | 17 | 18 | def to_json(data): 19 | return json.dumps(data) 20 | 21 | 22 | def to_csv(data): 23 | if not data: 24 | return "" 25 | 26 | out = StringIO() 27 | writer = csv.DictWriter(out, data[0].keys()) 28 | writer.writeheader() 29 | 30 | for entry in data: 31 | writer.writerow(entry) 32 | 33 | return out.getvalue() 34 | 35 | 36 | def from_csv(text): 37 | input_data = StringIO(text) 38 | return list(csv.DictReader(input_data)) 39 | 40 | 41 | class SharedTest(AsyncHTTPTestCase): 42 | def get_app(self): 43 | return app.make_app(url_prefix='', debug=True) 44 | 45 | def post_json(self, url, data, extra_headers=None): 46 | if not isinstance(data, basestring): 47 | body = to_json(data) 48 | else: 49 | # Data already prepared by calling function 50 | body = data 51 | 52 | headers = {'Content-Type': 'application/json'} 53 | 54 | if extra_headers: 55 | headers.update(extra_headers) 56 | 57 | return self.fetch(url, method='POST', body=body, headers=headers, use_gzip=False) 58 | 59 | def query_json(self, url, query, extra_headers=None): 60 | url = 
url_concat(url, {'q': json.dumps(query)}) 61 | headers = {'Accept': 'application/json, text/csv'} 62 | if extra_headers: 63 | headers.update(extra_headers) 64 | return self.fetch(url, headers=headers, use_gzip=False) 65 | 66 | def post_csv(self, url, data, types=None, extra_headers=None): 67 | headers = {'Content-Type': 'text/csv'} 68 | if types: 69 | headers['X-QCache-types'] = '; '.join('{column_name}={type_name}'.format(column_name=c, type_name=t) 70 | for c, t in types.items()) 71 | if extra_headers: 72 | headers.update(extra_headers) 73 | 74 | body = to_csv(data) 75 | return self.fetch(url, method='POST', body=body, headers=headers, use_gzip=False) 76 | 77 | def query_csv(self, url, query): 78 | url = url_concat(url, {'q': json.dumps(query)}) 79 | return self.fetch(url, headers={'Accept': 'text/csv, application/json'}, use_gzip=False) 80 | 81 | def get_statistics(self): 82 | response = self.fetch('/statistics', use_gzip=False) 83 | assert response.code == 200 84 | return json.loads(response.body) 85 | 86 | 87 | class TestBaseCases(SharedTest): 88 | def test_404_when_item_is_missing(self): 89 | url = url_concat('/dataset/abc', {'q': json.dumps('{}')}) 90 | response = self.fetch(url) 91 | assert response.code == 404 92 | 93 | def test_upload_json_query_json(self): 94 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 95 | assert response.code == 201 96 | 97 | response = self.query_json('/dataset/abc', {'where': ['==', 'foo', 1]}) 98 | assert response.code == 200 99 | assert json.loads(response.body) == [{'foo': 1, 'bar': 10}] 100 | 101 | def test_upload_csv_query_csv(self): 102 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}, {'baz': 2, 'bar': 20}]) 103 | assert response.code == 201 104 | 105 | response = self.query_csv('/dataset/cba', {'where': ['==', 'baz', 1]}) 106 | assert response.code == 200 107 | assert from_csv(response.body) == [{'baz': '1', 'bar': '10'}] # NB: Strings for numbers here 108 | 109 | def test_division_by_zero(self): 110 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 0}]) 111 | assert response.code == 201 112 | 113 | # Result of division by 0 will be transmitted as null/None 114 | response = self.query_json('/dataset/abc', {'select': [['=', 'baz', ['/', 'foo', 'bar']]]}) 115 | assert response.code == 200 116 | assert json.loads(response.body) == [{'baz': None}] 117 | 118 | 119 | class TestQueryWithPost(SharedTest): 120 | def post_query_json(self, url, query): 121 | return self.fetch(url, headers={'Accept': 'application/json, text/csv', 'Content-Type': 'application/json'}, 122 | method="POST", body=to_json(query)) 123 | 124 | def test_upload_json_post_query_json(self): 125 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 126 | assert response.code == 201 127 | 128 | response = self.post_query_json('/dataset/abc/q', {'where': ['==', 'foo', 1]}) 129 | assert response.code == 200 130 | assert json.loads(response.body) == [{'foo': 1, 'bar': 10}] 131 | 132 | def test_upload_json_post_query_json_malformed_query(self): 133 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 134 | assert response.code == 201 135 | 136 | response = self.post_query_json('/dataset/abc/q', {'blabb': ['==', 'foo', 1]}) 137 | assert response.code == 400 138 | 139 | def test_delete_against_q_endpoint_is_404(self): 140 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 141 | assert response.code == 201 
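        # /dataset/abc/q only accepts POSTed queries; DELETE has to target
        # the dataset resource itself, as the assertions below verify.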
142 | 143 | response = self.fetch('/dataset/abc/q', method='DELETE') 144 | assert response.code == 404 145 | 146 | response = self.fetch('/dataset/abc', method='DELETE') 147 | assert response.code == 200 148 | 149 | def test_get_against_q_endpoint_is_404(self): 150 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 151 | assert response.code == 201 152 | 153 | response = self.query_json('/dataset/abc/q', query={}) 154 | assert response.code == 404 155 | 156 | response = self.query_json('/dataset/abc', query={}) 157 | assert response.code == 200 158 | 159 | 160 | class TestSlicing(SharedTest): 161 | def test_unsliced_size_header_indicates_the_dataset_size_before_slicing_it(self): 162 | # This helps out in pagination of data 163 | self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}, {'baz': 2, 'bar': 20}]) 164 | 165 | # Fetch all data, the header value should be the same as the length of the response 166 | response = self.query_json('/dataset/cba', {}) 167 | assert response.code == 200 168 | assert len(json.loads(response.body)) == 2 169 | assert response.headers['X-QCache-unsliced-length'] == '2' 170 | 171 | response = self.query_json('/dataset/cba', {"offset": 1}) 172 | assert response.code == 200 173 | assert len(json.loads(response.body)) == 1 174 | assert response.headers['X-QCache-unsliced-length'] == '2' 175 | 176 | 177 | class TestCharacterEncoding(SharedTest): 178 | def test_upload_json_query_json_unicode_characters(self): 179 | response = self.post_json('/dataset/abc', [{'foo': u'Iñtërnâtiônàližætiøn'}, {'foo': 'qux'}]) 180 | assert response.code == 201 181 | 182 | response = self.query_json('/dataset/abc', {'where': ['==', 'foo', u'"Iñtërnâtiônàližætiøn"']}) 183 | 184 | assert response.code == 200 185 | response_data = json.loads(response.body) 186 | assert response_data == [{'foo': u'Iñtërnâtiônàližætiøn'}] 187 | assert type(response_data[0]['foo']) is unicode 188 | 189 | def test_upload_csv_query_csv_unicode_characters_encoded_as_utf8(self): 190 | response = self.post_csv('/dataset/abc', [{'foo': u'Iñtërnâtiônàližætiønåäö'.encode('utf-8')}, {'foo': 'qux'}]) 191 | assert response.code == 201 192 | 193 | response = self.query_csv('/dataset/abc', {'where': ['==', 'foo', u'"Iñtërnâtiônàližætiønåäö"']}) 194 | assert response.code == 200 195 | assert from_csv(response.body) == [{'foo': u'Iñtërnâtiônàližætiønåäö'.encode('utf-8')}] 196 | 197 | def test_upload_csv_query_json_unicode_characters_encoded_as_utf8(self): 198 | response = self.post_csv('/dataset/abc', [{'foo': u'Iñtërnâtiônàližætiønåäö'.encode('utf-8')}, {'foo': 'qux'}]) 199 | assert response.code == 201 200 | 201 | response = self.query_json('/dataset/abc', {'where': ['==', 'foo', u'"Iñtërnâtiônàližætiønåäö"']}) 202 | 203 | assert response.code == 200 204 | response_data = json.loads(response.body) 205 | assert json.loads(response.body) == [{'foo': u'Iñtërnâtiônàližætiønåäö'}] 206 | assert type(response_data[0]['foo']) is unicode 207 | 208 | def test_upload_invalid_content_type(self): 209 | response = self.fetch('/dataset/abc', method='POST', body='', headers={'Content-Type': 'text/html'}) 210 | assert response.code == 415 211 | 212 | def test_upload_invalid_charset(self): 213 | response = self.fetch('/dataset/abc', method='POST', body='', 214 | headers={'Content-Type': 'text/csv; charset=iso-123'}) 215 | assert response.code == 415 216 | 217 | 218 | class TestInvalidQueries(SharedTest): 219 | def setUp(self): 220 | super(TestInvalidQueries, self).setUp() 221 | response = 
self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 222 | assert response.code == 201 223 | 224 | def do_invalid(self, q): 225 | response = self.query_json('/dataset/abc', q) 226 | assert response.code == 400 227 | return response 228 | 229 | def test_list_instead_of_dict(self): 230 | self.do_invalid(['where', ['==', 'foo', 1]]) 231 | 232 | def test_json_not_possible_to_parse(self): 233 | response = self.fetch(url_concat('/dataset/abc', {'q': 'foo'})) 234 | assert response.code == 400 235 | 236 | def test_invalid_filter_format(self): 237 | response = self.do_invalid({'where': ['==', 'foo', 1, 2]}) 238 | assert 'Invalid number of arguments' in json.loads(response.body)['error'] 239 | 240 | def test_unknown_filter_operator(self): 241 | response = self.query_json('/dataset/abc', {'where': ['<>', 'foo', 1]}) 242 | assert 'Unknown operator' in json.loads(response.body)['error'] 243 | 244 | def test_unknown_select_operator(self): 245 | response = self.query_json('/dataset/abc', {'select': [['baz', 'foo']]}) 246 | assert 'Unknown aggregation function' in json.loads(response.body)['error'] 247 | 248 | def test_missing_column_in_select(self): 249 | response = self.query_json('/dataset/abc', {'select': ['baz', 'foo']}) 250 | assert 'Selected columns not in table' in json.loads(response.body)['error'] 251 | 252 | def test_missing_column_in_filter(self): 253 | response = self.query_json('/dataset/abc', {'where': ['>', 'baz', 1]}) 254 | assert 'is not defined' in json.loads(response.body)['error'] 255 | 256 | def test_missing_column_in_group_by(self): 257 | response = self.query_json('/dataset/abc', {'group_by': ['baz']}) 258 | assert 'Group by column not in table' in json.loads(response.body)['error'] 259 | 260 | def test_missing_column_in_order_by(self): 261 | response = self.query_json('/dataset/abc', {'order_by': ['baz']}) 262 | assert 'Order by column not in table' in json.loads(response.body)['error'] 263 | 264 | def test_malformed_order_by(self): 265 | response = self.query_json('/dataset/abc', {'order_by': [['baz']]}) 266 | assert 'Invalid order by format' in json.loads(response.body)['error'] 267 | 268 | def test_wrong_type_for_offset(self): 269 | response = self.query_json('/dataset/abc', {'offset': 4.3}) 270 | assert 'Invalid type' in json.loads(response.body)['error'] 271 | 272 | def test_group_by_not_list(self): 273 | response = self.query_json('/dataset/abc', {'group_by': {'foo': 4.3}}) 274 | assert 'Invalid format' in json.loads(response.body)['error'] 275 | 276 | 277 | # Error cases: 278 | # - Malformed query 279 | # * Still some edge cases left in projection and filter. 280 | # - Malformed input data 281 | # * No data sent in => error 282 | # * Wrong format specified 283 | # * Accepted format specified but cannot be encoded 284 | # * Non-uniform JSON and CSV 285 | # - Non fitting data 286 | # * The data is too large to be fitted into memory of the current instance. 
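# For orientation before the bitwise tests below: 'all_bits' keeps rows in
# which every bit of the mask is set in the column value, 'any_bits' keeps
# rows in which at least one mask bit is set. A hedged sketch (dataset name
# and values invented):
#
#     self.post_json('/dataset/bits', [{'foo': v} for v in range(1, 6)])
#     response = self.query_json('/dataset/bits', {'where': ['any_bits', 'foo', 3]})
#     # survivors: foo in (1, 2, 3, 5); 4 == 0b100 shares no bit with 0b011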
287 | 
288 | 
289 | class TestBitwiseQueries(SharedTest):
290 |     def test_bitwise_query_succeeds(self):
291 |         response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}])
292 |         assert response.code == 201
293 | 
294 |         response = self.query_json('/dataset/abc', {'where': ['all_bits', 'foo', 1]})
295 |         assert response.code == 200
296 | 
297 | 
298 | class TestCacheEvictionOnSize(SharedTest):
299 |     def get_app(self):
300 |         # A max cache size of 315 is trimmed to just fit the smaller datasets in the test cases below
301 |         just_enough_to_fit_smaller_values = 315
302 |         return app.make_app(url_prefix='', max_cache_size=just_enough_to_fit_smaller_values, debug=True)
303 | 
304 |     def test_evicts_entry_when_too_much_space_occupied(self):
305 |         data = [{'some_longish_key': 'some_fairly_longish_value_that_needs_to_be_stuffed_in'},
306 |                 {'some_longish_key': 'another_fairly_longish_value_that_also_should_be_fitted'}]
307 | 
308 |         # Post data and ensure it is available
309 |         response = self.post_json('/dataset/abc', data)
310 |         assert response.code == 201
311 |         assert self.query_json('/dataset/abc', {}).code == 200
312 | 
313 |         response = self.post_json('/dataset/cba', data)
314 |         assert response.code == 201
315 | 
316 |         # The old dataset has been evicted, the new one has taken its place
317 |         assert self.query_json('/dataset/abc', {}).code == 404
318 |         assert self.query_json('/dataset/cba', {}).code == 200
319 | 
320 |         # Check statistics
321 |         stats = self.get_statistics()
322 | 
323 |         assert stats['dataset_count'] == 1
324 |         assert stats['cache_size'] == 370
325 |         assert stats['hit_count'] == 2
326 |         assert stats['miss_count'] == 1
327 |         assert stats['size_evict_count'] == 1
328 |         assert stats['store_count'] == 2
329 |         assert stats['statistics_duration'] > 0.0
330 |         assert stats['statistics_buffer_size'] == 1000
331 |         assert len(stats['store_durations']) == 2
332 |         assert len(stats['store_row_counts']) == 2
333 |         assert sum(stats['store_row_counts']) == 4
334 |         assert len(stats['query_durations']) == 2
335 |         assert len(stats['durations_until_eviction']) == 1
336 |         assert stats['durations_until_eviction'][0] > 0.0
337 | 
338 |         # Check stats again, this time they should have been cleared
339 |         assert set(self.get_statistics().keys()) == \
340 |             {'dataset_count', 'cache_size', 'statistics_duration', 'statistics_buffer_size'}
341 | 
342 |     def test_can_insert_more_entries_with_smaller_values(self):
343 |         data = [{'some_longish_key': 'short'},
344 |                 {'some_longish_key': 'another_short'}]
345 | 
346 |         self.post_json('/dataset/abc', data)
347 |         self.post_json('/dataset/cba', data)
348 | 
349 |         # Both datasets co-exist in the cache
350 |         assert self.query_json('/dataset/abc', {}).code == 200
351 |         assert self.query_json('/dataset/cba', {}).code == 200
352 | 
353 |     def test_query_stand_in_columns_do_not_interfere_with_cache_eviction(self):
354 |         # Executing a query with stand-in columns can increase the dataset
355 |         # size after insert. This should not lead to faulty bookkeeping of
356 |         # the current cache size, where the measured cache size gets smaller
57 |         # and smaller and causes the actual cache size to grow. See #15.
358 |         data = [{'some_longish_key': 'some_fairly_longish_value_that_needs_to_be_stuffed_in'},
359 |                 {'some_longish_key': 'another_fairly_longish_value_that_also_should_be_fitted'}]
360 | 
361 |         repetitions = 10
362 |         for i in range(repetitions):
363 |             response = self.post_json('/dataset/{i}'.format(i=i), data)
364 |             assert response.code == 201
365 |             assert self.query_json('/dataset/{i}'.format(i=i),
366 |                                    {},
367 |                                    extra_headers={'X-QCache-stand-in-columns': 'foo="bar_baz_qux"'}).code == 200
368 | 
369 |         stats = self.get_statistics()
370 |         assert stats['dataset_count'] == 1
371 |         assert stats['size_evict_count'] == repetitions - 1
372 |         assert stats['cache_size'] == 370
373 | 
374 | 
375 | class TestCacheEvictionOnAge(SharedTest):
376 |     def get_app(self):
377 |         # A max age of 5 seconds is used for the test cases below
378 |         return app.make_app(url_prefix='', max_age=5, debug=True)
379 | 
380 |     def test_evicts_dataset_when_data_too_old(self):
381 |         with freeze_time('2015-10-22 00:00:00'):
382 |             data = [{'some_longish_key': 'short'}]
383 |             self.post_json('/dataset/abc', data)
384 | 
385 |         with freeze_time('2015-10-22 00:00:04'):
386 |             assert self.query_json('/dataset/abc', {}).code == 200
387 | 
388 |         with freeze_time('2015-10-22 00:00:06'):
389 |             assert self.query_json('/dataset/abc', {}).code == 404
390 | 
391 | 
392 | class TestStatusEndpoint(SharedTest):
393 |     def test_status_endpoint_returns_200_ok(self):
394 |         response = self.fetch('/status')
395 | 
396 |         assert response.code == 200
397 |         assert response.body == "OK"
398 | 
399 | 
400 | class TestDatasetDelete(SharedTest):
401 |     def test_post_data_then_delete(self):
402 |         data = [{'some_key': '123456'}]
403 |         self.post_json('/dataset/abc', data)
404 | 
405 |         assert self.query_json('/dataset/abc', {}).code == 200
406 |         assert self.fetch('/dataset/abc', method='DELETE').code == 200
407 |         assert self.query_json('/dataset/abc', {}).code == 404
408 | 
409 | 
410 | class TestColumnTyping(SharedTest):
411 |     def get(self, q, response_code=200):
412 |         response = self.query_json('/dataset/abc', q)
413 |         assert response.code == response_code
414 |         return json.loads(response.body)
415 | 
416 |     def test_type_hint_string_on_column_with_only_integers(self):
417 |         data = [
418 |             {'some_key': '123456', 'another_key': 1111},
419 |             {'some_key': 'abcdef', 'another_key': 2222}]
420 | 
421 |         self.post_csv('/dataset/abc', data, types={'another_key': 'string'})
422 | 
423 |         assert self.get({'where': ['==', 'another_key', '"2222"']}) == \
424 |             [{'some_key': 'abcdef', 'another_key': '2222'}]
425 | 
426 |         # No matching item when querying by integer
427 |         assert not self.get({'where': ['==', 'another_key', 2222]})
428 | 
429 |     def test_type_hinting_with_invalid_type_results_in_bad_request(self):
430 |         # It's currently only possible to type hint strings, enums and floats.
431 |         # Is there ever a need for other type hints?
432 | 433 | data = [{'some_key': '123456', 'another_key': 1111}] 434 | response = self.post_csv('/dataset/abc', data, types={'another_key': 'int'}) 435 | assert response.code == 400 436 | 437 | def test_type_hinting_with_enum(self): 438 | data = [{'some_key': 'aaa'}] 439 | response = self.post_csv('/dataset/abc', data, types={'some_key': 'enum'}) 440 | assert response.code == 201 441 | 442 | assert self.get({'where': ['==', 'some_key', '"aaa"']}) == [ 443 | {'some_key': 'aaa'} 444 | ] 445 | 446 | def test_type_int_to_string(self): 447 | def get(q, response_code=200): 448 | response = self.query_json('/dataset/abc', q) 449 | assert response.code == response_code 450 | return json.loads(response.body) 451 | 452 | data = [ 453 | {'some_key': '123456', 'another_key': 1111}, 454 | {'some_key': 'abcdef', 'another_key': 2222}] 455 | 456 | self.post_csv('/dataset/abc', data) 457 | 458 | # Querying on integer field 459 | assert get({'where': ['==', 'another_key', 2222]}) == \ 460 | [{'some_key': 'abcdef', 'another_key': 2222}] 461 | 462 | get({'where': ['==', 'another_key', '2222']}, response_code=400) 463 | get({'where': ['==', 'another_key', '"2222"']}, response_code=400) 464 | 465 | # Querying on string field 466 | assert not get({'where': ['==', 'some_key', 123456]}) 467 | get({'where': ['==', 'some_key', '123456']}, response_code=400) 468 | 469 | # Matching string 470 | assert get({'where': ['==', 'some_key', '"123456"']}) == \ 471 | [{'some_key': '123456', 'another_key': 1111}] 472 | 473 | # Here abcdef is interpreted as another column. Since column abcdef 474 | # doesn't exist a 400, Bad request will be returned. 475 | get({'where': ['==', 'some_key', 'abcdef']}, response_code=400) 476 | 477 | def test_type_hinting_with_float(self): 478 | data = [{'some_key': 12}] 479 | response = self.post_csv('/dataset/abc', data, types={'some_key': 'float'}) 480 | assert response.code == 201 481 | 482 | result = self.get({'where': ['==', 'some_key', 12.0]}) 483 | assert result == [{'some_key': 12.0}] 484 | assert type(result[0]['some_key']) == float 485 | 486 | 487 | class TestStandInColumns(SharedTest): 488 | def test_stand_in_column_with_numeric_value(self): 489 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}], 490 | extra_headers={'X-QCache-stand-in-columns': 'foo=13'}) 491 | assert response.code == 201 492 | 493 | response = self.query_json('/dataset/cba', {'where': ['==', 'foo', 13]}) 494 | assert response.code == 200 495 | result = json.loads(response.body) 496 | assert result == [{'baz': 1, 'bar': 10, 'foo': 13}] 497 | assert type(result[0]['foo']) == int 498 | 499 | response = self.query_json('/dataset/cba', {'where': ['==', 'foo', 14]}) 500 | assert response.code == 200 501 | assert json.loads(response.body) == [] 502 | 503 | def test_stand_in_column_with_string_value(self): 504 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}], 505 | extra_headers={'X-QCache-stand-in-columns': 'foo="13"'}) 506 | assert response.code == 201 507 | 508 | response = self.query_json('/dataset/cba', {'where': ['==', 'foo', '"13"']}) 509 | assert response.code == 200 510 | assert json.loads(response.body) == [{'baz': 1, 'bar': 10, 'foo': "13"}] 511 | 512 | def test_stand_in_column_with_other_column(self): 513 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}, {'baz': 2, 'bar': 20}], 514 | extra_headers={'X-QCache-stand-in-columns': 'foo=bar'}) 515 | assert response.code == 201 516 | 517 | response = self.query_json('/dataset/cba', {'where': ['==', 'foo', 20]}) 518 | 
assert response.code == 200 519 | assert json.loads(response.body) == [{'baz': 2, 'bar': 20, 'foo': 20}] 520 | 521 | def test_multiple_stand_in_columns(self): 522 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}], 523 | extra_headers={'X-QCache-stand-in-columns': 'foo=bar; qux=13'}) 524 | assert response.code == 201 525 | 526 | response = self.query_json('/dataset/cba', {}) 527 | assert response.code == 200 528 | assert json.loads(response.body) == [{'baz': 1, 'bar': 10, 'foo': 10, 'qux': 13}] 529 | 530 | def test_chained_stand_in_columns(self): 531 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}], 532 | extra_headers={'X-QCache-stand-in-columns': 'foo=13; qux=foo'}) 533 | assert response.code == 201 534 | 535 | response = self.query_json('/dataset/cba', {}) 536 | assert response.code == 200 537 | assert json.loads(response.body) == [{'baz': 1, 'bar': 10, 'foo': 13, 'qux': 13}] 538 | 539 | def test_json_stand_in_columns(self): 540 | response = self.post_json('/dataset/cba', [{'baz': 1, 'bar': 10}], 541 | extra_headers={'X-QCache-stand-in-columns': 'foo=13'}) 542 | assert response.code == 201 543 | 544 | response = self.query_json('/dataset/cba', {}) 545 | assert json.loads(response.body) == [{'baz': 1, 'bar': 10, 'foo': 13}] 546 | 547 | def test_stand_in_column_not_applied_when_column_exists_in_submitted_data(self): 548 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}], 549 | extra_headers={'X-QCache-stand-in-columns': 'bar=13'}) 550 | assert response.code == 201 551 | 552 | response = self.query_json('/dataset/cba', {}) 553 | assert json.loads(response.body) == [{'baz': 1, 'bar': 10}] 554 | 555 | def test_stand_in_columns_in_query(self): 556 | response = self.post_csv('/dataset/cba', [{'foo': 1}]) 557 | assert response.code == 201 558 | 559 | response = self.query_json('/dataset/cba', {}, extra_headers={'X-QCache-stand-in-columns': 'bar=13;baz=foo'}) 560 | assert json.loads(response.body) == [{'foo': 1, 'bar': 13, 'baz': 1}] 561 | 562 | 563 | class TestCompression(SharedTest): 564 | def call_api_with_compression(self, accept_encoding, content_encoding, decoding_fn, encoding_fn, expected_encoding): 565 | input_data = 10000 * [{'foo': 1, 'bar': 10}] 566 | data = encoding_fn(to_json(input_data)) 567 | 568 | response = self.post_json('/dataset/abc', data, extra_headers={'Content-Encoding': content_encoding}) 569 | assert response.code == 201 570 | 571 | response = self.query_json('/dataset/abc', query={}, extra_headers={'Accept-Encoding': accept_encoding}) 572 | 573 | assert response.code == 200 574 | assert response.headers.get('Content-Encoding') == expected_encoding 575 | assert json.loads(decoding_fn(response.body)) == input_data 576 | 577 | def test_upload_gzip_accept_gzip(self): 578 | self.call_api_with_compression(accept_encoding='gzip', 579 | content_encoding='gzip', 580 | decoding_fn=qcache.compression.gzip_loads, 581 | encoding_fn=qcache.compression.gzip_dumps, 582 | expected_encoding='gzip') 583 | 584 | def test_upload_lz4_accept_lz4(self): 585 | self.call_api_with_compression(accept_encoding='lz4', 586 | content_encoding='lz4', 587 | decoding_fn=lz4.block.decompress, 588 | encoding_fn=lz4.block.compress, 589 | expected_encoding='lz4') 590 | 591 | def test_upload_lz4_accept_gzip(self): 592 | self.call_api_with_compression(accept_encoding='lz4', 593 | content_encoding='gzip', 594 | decoding_fn=lz4.block.decompress, 595 | encoding_fn=qcache.compression.gzip_dumps, 596 | expected_encoding='lz4') 597 | 598 | def 
test_prefer_lz4_if_multiple_supported_encodings_exists(self): 599 | self.call_api_with_compression(accept_encoding='compress,gzip,lz4', 600 | content_encoding='gzip', 601 | decoding_fn=lz4.block.decompress, 602 | encoding_fn=qcache.compression.gzip_dumps, 603 | expected_encoding='lz4') 604 | 605 | def test_unknown_accept_encoding_results_in_no_response_compression(self): 606 | self.call_api_with_compression(accept_encoding='foo,bar', 607 | content_encoding='lz4', 608 | decoding_fn=lambda x: x, 609 | encoding_fn=lz4.block.compress, 610 | expected_encoding=None) 611 | 612 | def test_upload_with_unknown_encoding_results_in_400(self): 613 | data = to_json([{'foo': 'bar'}]) 614 | response = self.post_json('/dataset/abc', data, extra_headers={'Content-Encoding': 'baz'}) 615 | assert response.code == 400 616 | assert 'Unrecognized encoding' in response.body 617 | 618 | def test_only_200_responses_are_compressed(self): 619 | data = to_json([{'foo': 'bar'}]) 620 | response = self.post_json('/dataset/abc', data) 621 | assert response.code == 201 622 | 623 | response = self.query_json('/dataset/non_present_dataset', query={}, extra_headers={'Accept-Encoding': 'lz4'}) 624 | assert response.code == 404 625 | assert response.headers.get('Content-Encoding') is None 626 | 627 | 628 | class TestStatistics(SharedTest): 629 | def test_store_and_query_durations(self): 630 | assert self.post_json('/dataset/abc', [{'foo': 123}]).code == 201 631 | assert self.query_json('/dataset/abc', query={}).code == 200 632 | 633 | stats = self.get_statistics() 634 | 635 | assert len(stats['query_durations']) == 1 636 | assert len(stats['store_durations']) == 1 637 | assert len(stats['query_request_durations']) == 1 638 | assert len(stats['store_request_durations']) == 1 639 | 640 | assert stats['query_durations'][0] < stats['query_request_durations'][0] 641 | assert stats['store_durations'][0] < stats['store_request_durations'][0] 642 | 643 | 644 | class SSLTestBase(AsyncHTTPTestCase): 645 | TLS_DIR = os.path.join(os.path.dirname(__file__), '../tls/') 646 | 647 | def get_app(self): 648 | return app.make_app(url_prefix='', debug=True) 649 | 650 | def get_protocol(self): 651 | return 'https' 652 | 653 | def get_url(self, path): 654 | """Returns an absolute url for the given path on the test server.""" 655 | return '%s://localhost:%s%s' % (self.get_protocol(), 656 | self.get_http_port(), path) 657 | 658 | def get_ssl_version(self): 659 | raise NotImplementedError() 660 | 661 | def get_httpserver_options(self): 662 | # By default don't require client certificate. Override in subclasses where client 663 | # certs are tested. 
664 | return app.ssl_options(certfile=self.TLS_DIR + 'host.pem') 665 | 666 | def fetch(self, path, **kwargs): 667 | if 'validate_cert' not in kwargs: 668 | ssl_context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, 669 | cafile=os.path.join(self.TLS_DIR, 'ca.pem')) 670 | 671 | if 'client_cert' in kwargs: 672 | ssl_context.load_cert_chain(kwargs['client_cert']) 673 | 674 | kwargs['ssl_options'] = ssl_context 675 | 676 | return super(SSLTestBase, self).fetch(path=path, **kwargs) 677 | 678 | 679 | class TestSSLServerWithSSL(SSLTestBase): 680 | def test_fetch_status(self): 681 | response = self.fetch('/status') 682 | assert response.code == 200 683 | 684 | def test_fetch_status_no_cert_validation(self): 685 | response = self.fetch('/status', validate_cert=False) 686 | assert response.code == 200 687 | 688 | 689 | class TestSSLServerWithSSLClientCertVerification(SSLTestBase): 690 | def get_httpserver_options(self): 691 | return app.ssl_options(certfile=self.TLS_DIR + 'host.pem', 692 | cafile=self.TLS_DIR + 'ca.pem') 693 | 694 | def test_fetch_status(self): 695 | response = self.fetch('/status', client_cert=self.TLS_DIR + 'host.pem') 696 | assert response.code == 200 697 | 698 | def test_fetch_status_no_client_cert_supplied(self): 699 | response = self.fetch('/status') 700 | assert response.code == 599 701 | 702 | 703 | class TestSSLServerWithoutSSL(SSLTestBase): 704 | def get_protocol(self): 705 | return 'http' 706 | 707 | def test_fetch_status(self): 708 | response = self.fetch('/status') 709 | assert response.code == 599 710 | 711 | 712 | class TestSSLServerWithSSLAndBasicAuth(SSLTestBase): 713 | def get_app(self): 714 | return app.make_app(url_prefix='', debug=True, basic_auth='foo:bar') 715 | 716 | def test_fetch_status_correct_credentials(self): 717 | response = self.fetch('/status', auth_username='foo', auth_password='bar') 718 | assert response.code == 200 719 | 720 | def test_fetch_status_incorrect_password(self): 721 | response = self.fetch('/status', auth_username='foo', auth_password='ba') 722 | assert response.code == 401 723 | 724 | def test_fetch_status_unknown_user(self): 725 | response = self.fetch('/status', auth_username='fo', auth_password='bar') 726 | assert response.code == 401 727 | 728 | def test_fetch_status_missing_credentials(self): 729 | response = self.fetch('/status') 730 | assert response.code == 401 731 | 732 | def test_fetch_data_missing_credentials(self): 733 | response = self.fetch('/dataset/XYZ') 734 | assert response.code == 401 735 | 736 | def test_fetch_data_correct_credentials(self): 737 | url = url_concat('/dataset/XYZ', {'q': json.dumps('{}')}) 738 | response = self.fetch(url, auth_username='foo', auth_password='bar') 739 | assert response.code == 404 740 | 741 | def test_fetch_statistics_missing_credentials(self): 742 | response = self.fetch('/statistics') 743 | assert response.code == 401 744 | 745 | def test_fetch_statistics_correct_credentials(self): 746 | response = self.fetch('/statistics', auth_username='foo', auth_password='bar') 747 | assert response.code == 200 748 | 749 | # Delete against a Q endpoint is a 404 750 | # Get against a Q endpoint is a 404 751 | -------------------------------------------------------------------------------- /test/test_qframe.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import json 3 | from contextlib import contextmanager 4 | import pytest 5 | import time 6 | 7 | from qcache.qframe import MalformedQueryException, QFrame 8 | 9 | 10 | def 
query(df, q): 11 | return QFrame(df).query(q).df 12 | 13 | ######################### Filtering ########################## 14 | 15 | 16 | @pytest.fixture 17 | def basic_frame(): 18 | data = """ 19 | foo,bar,baz,qux 20 | bbb,1.25,5,qqq 21 | aaa,3.25,7,qqq 22 | ccc,,9,www""" 23 | 24 | return QFrame.from_csv(data) 25 | 26 | 27 | def assert_rows(qframe, rows, column='foo'): 28 | frame = qframe.df 29 | assert len(frame) == len(rows) 30 | 31 | for ix, row in enumerate(rows): 32 | assert frame.iloc[ix][column] == row 33 | 34 | 35 | @pytest.mark.parametrize("operation, column, value, expected", [ 36 | ("<", 'bar', 2, 'bbb'), 37 | (">", 'bar', 2, 'aaa'), 38 | (">", 'foo', "'bbb'", 'ccc'), 39 | ("<=", 'baz', 6, 'bbb'), 40 | ("<=", 'baz', 5, 'bbb'), 41 | (">=", 'foo', "'bbc'", 'ccc'), 42 | (">=", 'foo', "'ccc'", 'ccc'), 43 | ("==", 'foo', "'ccc'", 'ccc'), 44 | ("!=", 'qux', "'qqq'", 'ccc'), 45 | ]) 46 | def test_filter_operations(basic_frame, operation, column, value, expected): 47 | frame = basic_frame.query({'where': [operation, column, value]}) 48 | assert_rows(frame, [expected]) 49 | 50 | 51 | def test_negation(basic_frame): 52 | frame = basic_frame.query({'where': ["!", ["==", "qux", "'qqq'"]]}) 53 | assert_rows(frame, ['ccc']) 54 | 55 | 56 | def test_and(basic_frame): 57 | frame = basic_frame.query({'where': ["&", ["==", "qux", "'qqq'"], [">", "baz", 6]]}) 58 | assert_rows(frame, ['aaa']) 59 | 60 | 61 | def test_and_with_only_one_clause(basic_frame): 62 | frame = basic_frame.query({'where': ["&", ["==", "foo", "'aaa'"]]}) 63 | assert_rows(frame, ['aaa']) 64 | 65 | frame = basic_frame.query({'where': ["&", ["==", "foo", "'abc'"]]}) 66 | assert_rows(frame, []) 67 | 68 | 69 | def test_or(basic_frame): 70 | frame = basic_frame.query({'where': ["|", ["==", "baz", 5], ["==", "baz", 7]]}) 71 | assert_rows(frame, ['bbb', 'aaa']) 72 | 73 | 74 | def test_or_with_only_one_clause(basic_frame): 75 | frame = basic_frame.query({'where': ["|", ["==", "foo", "'aaa'"]]}) 76 | assert_rows(frame, ['aaa']) 77 | 78 | frame = basic_frame.query({'where': ["|", ["==", "foo", "'abc'"]]}) 79 | assert_rows(frame, []) 80 | 81 | 82 | def test_col_in_list(basic_frame): 83 | frame = basic_frame.query({'where': ["in", "baz", [5, 8, -2]]}) 84 | assert_rows(frame, ['bbb']) 85 | 86 | 87 | def test_null_value(basic_frame): 88 | frame = basic_frame.query({'where': ["isnull", "bar"]}) 89 | assert_rows(frame, ['ccc']) 90 | 91 | 92 | @pytest.mark.skipif(True, reason='This should work I think, but it does not...') 93 | def test_string_in_col(basic_frame): 94 | frame = basic_frame.query({'where': ["contains", "foo", "'bb'"]}) 95 | assert_rows(frame, ['bbb']) 96 | 97 | 98 | def test_unknown_column_name(basic_frame): 99 | with pytest.raises(MalformedQueryException): 100 | basic_frame.query({'where': ["==", "unknown", 3]}) 101 | 102 | 103 | def test_invalid_column_name(basic_frame): 104 | with pytest.raises(MalformedQueryException): 105 | basic_frame.query({'where': ["==", "", 3]}) 106 | 107 | 108 | def test_empty_filter_returns_same_frame(basic_frame): 109 | assert basic_frame.query({'where': []}).df.equals(basic_frame.df) 110 | 111 | 112 | def test_empty_filter_clause_not_allowed(basic_frame): 113 | with pytest.raises(MalformedQueryException): 114 | basic_frame.query({'where': ["|", []]}) 115 | 116 | 117 | @pytest.mark.parametrize("operation", ["!", "isnull"]) 118 | def test_single_argument_operators_require_single_argument(basic_frame, operation): 119 | with pytest.raises(MalformedQueryException): 120 | basic_frame.query({'where': 
[operation, 'foo', 'bar']})
121 | 
122 | 
123 | @pytest.mark.parametrize("operation", ["<", ">", "<=", ">=", "==", "!=", "in"])
124 | def test_double_argument_operators_require_two_arguments(basic_frame, operation):
125 |     with pytest.raises(MalformedQueryException):
126 |         basic_frame.query({'where': [operation, 'foo']})
127 | 
128 |     with pytest.raises(MalformedQueryException):
129 |         basic_frame.query({'where': [operation, 'foo', 'bar', 'baz']})
130 | 
131 | 
132 | @pytest.mark.parametrize("operation", ["&", "|"])
133 | def test_and_or_requires_at_least_one_argument(basic_frame, operation):
134 |     with pytest.raises(MalformedQueryException):
135 |         basic_frame.query({'where': [operation]})
136 | 
137 | 
138 | @pytest.fixture
139 | def bitwise_frame():
140 |     data = """foo,bar,baz
141 | 1,1.5,abc
142 | 2,1.5,def
143 | 3,1.5,ghi
144 | 4,1.5,ijk
145 | 5,1.5,lmn"""
146 | 
147 |     return QFrame.from_csv(data)
148 | 
149 | 
150 | @pytest.mark.parametrize("filter, expected_rows", [
151 |     (1, [1, 3, 5]),
152 |     (2, [2, 3]),
153 |     (3, [3]),
154 |     (4, [4, 5]),
155 |     (5, [5]),
156 |     (6, []),
157 | ])
158 | def test_bitwise_all_bits_with_constant(filter, expected_rows, bitwise_frame):
159 |     result = bitwise_frame.query({'where': ["all_bits", "foo", filter]})
160 |     assert_rows(result, expected_rows)
161 | 
162 | 
163 | @pytest.mark.parametrize("filter, expected_rows", [
164 |     (1, [1, 3, 5]),
165 |     (2, [2, 3]),
166 |     (3, [1, 2, 3, 5]),
167 |     (4, [4, 5]),
168 |     (5, [1, 3, 4, 5]),
169 |     (6, [2, 3, 4, 5]),
170 |     (8, []),
171 | ])
172 | def test_bitwise_any_bits_with_constant(filter, expected_rows, bitwise_frame):
173 |     result = bitwise_frame.query({'where': ["any_bits", "foo", filter]})
174 |     assert_rows(result, expected_rows)
175 | 
176 | 
177 | def test_bitwise_invalid_arg(bitwise_frame):
178 |     with pytest.raises(MalformedQueryException):
179 |         bitwise_frame.query({'where': ["any_bits", "foo", 1.3]})
180 | 
181 | 
182 | def test_bitwise_invalid_column_type(bitwise_frame):
183 |     with pytest.raises(MalformedQueryException):
184 |         bitwise_frame.query({'where': ["any_bits", "baz", 1]})
185 | 
186 | 
187 | def test_bitwise_column_missing(bitwise_frame):
188 |     with pytest.raises(MalformedQueryException):
189 |         bitwise_frame.query({'where': ["any_bits", "dont_exist", 1]})
190 | 
191 | 
192 | def test_bitwise_invalid_filter_length(bitwise_frame):
193 |     with pytest.raises(MalformedQueryException):
194 |         bitwise_frame.query({'where': ["any_bits", "foo", 1, 2]})
195 | 
196 | 
197 | @pytest.fixture
198 | def string_frame():
199 |     data = """foo,bar
200 | 1,abcd
201 | 2,defg
202 | 3,ghij
203 | 4,gxyj"""
204 | 
205 |     return QFrame.from_csv(data)
206 | 
207 | 
208 | @pytest.mark.parametrize("operator, filter, expected_rows", [
209 |     ("like", "'a%'", [1]),
210 |     ("like", "'%g'", [2]),
211 |     ("like", "'%d%'", [1, 2]),
212 |     ("like", "'%cc%'", []),
213 |     ("like", "''", []),
214 |     ("like", "'%'", [1, 2, 3, 4]),
215 |     ("like", "'%%'", [1, 2, 3, 4]),
216 |     ("like", "'%D%'", []),
217 |     ("ilike", "'%D%'", [1, 2]),
218 |     ("like", "'%g[a-z]{2}j%'", [3, 4]),
219 |     ("like", "'%g[a-z]{3}j%'", []),
220 |     ("like", "'g[a-z]{2}j'", [3, 4]),
221 |     ("like", "'g[a-z]{2}'", []),
222 |     ("like", "'g[a-z]{2}%'", [3, 4]),
223 |     ("like", "'g[a-z]{3}'", [3, 4]),
224 | ])
225 | def test_like(operator, filter, expected_rows, string_frame):
226 |     result = string_frame.query({'where': [operator, "bar", filter]})
227 |     assert_rows(result, expected_rows)
228 | 
229 | 
230 | def test_like_missing_quotes_on_argument(string_frame):
231 |     with
pytest.raises(MalformedQueryException): 232 | string_frame.query({'where': ['like', "bar", "%abc%"]}) 233 | 234 | 235 | def test_like_invalid_argument_type(string_frame): 236 | with pytest.raises(MalformedQueryException): 237 | string_frame.query({'where': ['like', "bar", 12]}) 238 | 239 | 240 | def test_like_invalid_column_type(string_frame): 241 | with pytest.raises(MalformedQueryException): 242 | string_frame.query({'where': ['like', "foo", "'%a%'"]}) 243 | 244 | 245 | ############### Sub select ################## 246 | 247 | 248 | @pytest.mark.parametrize("data", [ 249 | """foo,bar 250 | 1,1 251 | 2,1 252 | 3,2""", # Numbers 253 | """foo,bar 254 | 1,aa 255 | 2,aa 256 | 3,bb""", # Strings 257 | """foo,bar 258 | 1, 259 | 2, 260 | 3,bb""", # null/None 261 | ]) 262 | def test_sub_select(data): 263 | frame = QFrame.from_csv(data) 264 | 265 | result = frame.query({'where': ['in', 'bar', {'where': ['==', 'foo', 2]}]}) 266 | 267 | assert_rows(result, [1, 2]) 268 | 269 | 270 | def test_sub_select_in_column_missing_in_sub_select(): 271 | frame = QFrame.from_csv("""foo,bar 272 | 1,aa""") 273 | 274 | with pytest.raises(MalformedQueryException): 275 | frame.query({'where': ['in', 'bar', {'select': ['foo'], 276 | 'where': ['==', 'foo', 2]}]}) 277 | 278 | 279 | ############### Projections ####################### 280 | 281 | 282 | def test_select_subset(basic_frame): 283 | frame = basic_frame.query({'select': ['foo', 'baz']}) 284 | assert list(frame.columns) == ['foo', 'baz'] 285 | 286 | 287 | def test_select_subset_invalid_column(basic_frame): 288 | with pytest.raises(MalformedQueryException): 289 | basic_frame.query({'select': ['foof', 'baz']}) 290 | 291 | 292 | def test_select_distinct_without_columns(basic_frame): 293 | # Should not have any effect since all rows are unique with respect to all columns 294 | frame = basic_frame.query({'distinct': []}) 295 | assert_rows(frame, ['bbb', 'aaa', 'ccc']) 296 | 297 | 298 | def test_select_distinct_with_columns(basic_frame): 299 | frame = basic_frame.query({'distinct': ['qux']}) 300 | assert_rows(frame, ['bbb', 'ccc']) 301 | 302 | 303 | ################ Aggregation ##################### 304 | 305 | # TODO: More tests and error handling 306 | 307 | def test_basic_sum_aggregation(basic_frame): 308 | expected = QFrame.from_csv(""" 309 | qux,baz 310 | www,9 311 | qqq,12""") 312 | 313 | frame = basic_frame.query({ 314 | 'select': ['qux', ['sum', 'baz']], 315 | 'group_by': ['qux'], 316 | 'order_by': ['baz']}) 317 | 318 | assert frame.to_csv() == expected.to_csv() 319 | 320 | 321 | def test_basic_count_aggregation(basic_frame): 322 | expected = QFrame.from_csv(""" 323 | qux,baz 324 | qqq,2 325 | www,1""") 326 | 327 | frame = basic_frame.query({ 328 | 'select': ['qux', ['count', 'baz']], 329 | 'group_by': ['qux']}) 330 | 331 | assert frame.to_csv() == expected.to_csv() 332 | 333 | 334 | def test_unknown_aggregation_function(basic_frame): 335 | with pytest.raises(MalformedQueryException): 336 | basic_frame.query({ 337 | 'select': ['qux', ['foo_bar', 'baz']], 338 | 'group_by': ['qux']}) 339 | 340 | 341 | def test_missing_aggregation_function(basic_frame): 342 | with pytest.raises(MalformedQueryException): 343 | basic_frame.query({ 344 | 'select': ['qux'], 345 | 'group_by': ['qux']}) 346 | 347 | 348 | def test_count_without_aggregation(basic_frame): 349 | expected = QFrame.from_csv(""" 350 | count 351 | 3""") 352 | 353 | frame = basic_frame.query({'select': [['count']]}) 354 | assert frame.to_csv() == expected.to_csv() 355 | 356 | 357 | def 
test_max_without_aggregation(basic_frame): 358 | expected = QFrame.from_csv(""" 359 | baz 360 | 9""") 361 | 362 | frame = basic_frame.query({'select': [['max', 'baz']]}) 363 | assert frame.to_csv() == expected.to_csv() 364 | 365 | 366 | ############### Ordering ################ 367 | 368 | 369 | def test_single_column_ascending_ordering(basic_frame): 370 | frame = basic_frame.query({'order_by': ['foo']}) 371 | assert_rows(frame, ['aaa', 'bbb', 'ccc']) 372 | 373 | 374 | def test_single_column_descending_ordering(basic_frame): 375 | frame = basic_frame.query({'order_by': ['-foo']}) 376 | assert_rows(frame, ['ccc', 'bbb', 'aaa']) 377 | 378 | 379 | def test_sort_on_unknown_column(basic_frame): 380 | with pytest.raises(MalformedQueryException): 381 | basic_frame.query({'order_by': ['foof']}) 382 | 383 | 384 | ############## Slicing ################## 385 | 386 | 387 | def test_offset_and_limit(basic_frame): 388 | frame = basic_frame.query({"offset": 1, "limit": 1}) 389 | assert_rows(frame, ['aaa']) 390 | assert frame.unsliced_df_len == 3 391 | 392 | 393 | ############## Unicode ################# 394 | 395 | 396 | def test_unicode_content_from_csv(): 397 | data = u"""foo,bar 398 | aaa,Iñtërnâtiônàližætiøn 399 | bbb,räksmörgås 400 | ccc,""" 401 | 402 | input_frame = QFrame.from_csv(data) 403 | frame = input_frame.query({'where': ["==", "bar", u"'räksmörgås'"]}) 404 | 405 | assert_rows(frame, ['bbb']) 406 | 407 | 408 | def test_unicode_content_from_dicts(): 409 | data = [{'foo': 'aaa', 'bar': u'Iñtërnâtiônàližætiøn'}, 410 | {'foo': 'bbb', 'bar': u'räksmörgås'.encode(encoding='utf-8')}] 411 | input_frame = QFrame.from_dicts(data) 412 | frame = input_frame.query({'where': ["==", "bar", u"'räksmörgås'"]}) 413 | 414 | assert_rows(frame, ['bbb']) 415 | 416 | 417 | @pytest.fixture 418 | def calculation_frame(): 419 | data = """ 420 | foo,bar 421 | 1,10 422 | 1,11 423 | 2,20 424 | 3,30 425 | 3,33""" 426 | 427 | return QFrame.from_csv(data) 428 | 429 | 430 | def test_column_aliasing(calculation_frame): 431 | frame = calculation_frame.query({"select": [["=", "baz", "foo"]]}) 432 | 433 | assert frame.to_dicts() == [ 434 | {"baz": 1}, 435 | {"baz": 1}, 436 | {"baz": 2}, 437 | {"baz": 3}, 438 | {"baz": 3} 439 | ] 440 | 441 | 442 | def test_constant_int_aliasing(calculation_frame): 443 | frame = calculation_frame.query({"select": [["=", "baz", 55]], 444 | "limit": 2}) 445 | 446 | assert frame.to_dicts() == [ 447 | {"baz": 55}, 448 | {"baz": 55}, 449 | ] 450 | 451 | 452 | def test_constant_string_aliasing(calculation_frame): 453 | frame = calculation_frame.query({"select": [["=", "baz", "'qux'"]], 454 | "limit": 2}) 455 | 456 | assert frame.to_dicts() == [ 457 | {"baz": "qux"}, 458 | {"baz": "qux"}, 459 | ] 460 | 461 | 462 | def test_alias_as_sum_of_two_other_columns(calculation_frame): 463 | frame = calculation_frame.query({"select": [["=", "baz", ["+", "bar", "foo"]]], 464 | "limit": 2}) 465 | 466 | assert frame.to_dicts() == [ 467 | {"baz": 11}, 468 | {"baz": 12}, 469 | ] 470 | 471 | 472 | def test_alias_as_nested_expression(calculation_frame): 473 | frame = calculation_frame.query({"select": [["=", "baz", ["+", ["*", "bar", 2], "foo"]]], 474 | "limit": 2}) 475 | 476 | assert frame.to_dicts() == [ 477 | {"baz": 21}, 478 | {"baz": 23}, 479 | ] 480 | 481 | 482 | def test_alias_with_single_argument_function(calculation_frame): 483 | frame = calculation_frame.query({"select": [["=", "baz", ["sqrt", ["+", 3, "foo"]]]], 484 | "limit": 1}) 485 | 486 | assert frame.to_dicts() == [{"baz": 2}] 487 | 488 | 489 |
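# A summary of the calculated-column grammar exercised above, inferred from
# these tests rather than from a formal spec: a select entry ["=", target, expr]
# aliases `target` to `expr`, where `expr` may be a column name, a numeric or
# quoted string constant, or a nested [function, arg, ...] list built from the
# same pieces. A hypothetical combined example (not part of the original suite):
#
#   frame = calculation_frame.query(
#       {"select": [["=", "baz", ["+", ["*", "foo", 10], "bar"]]], "limit": 1})
#   assert frame.to_dicts() == [{"baz": 20}]  # first row: 1 * 10 + 10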
@pytest.fixture 490 | def frame_with_zero(): 491 | data = """ 492 | foo,bar 493 | 1,0 494 | 1,11""" 495 | 496 | return QFrame.from_csv(data) 497 | 498 | 499 | def test_alias_with_division_by_zero(frame_with_zero): 500 | frame = frame_with_zero.query({"select": [["=", "baz", ["/", "foo", "bar"]]], 501 | "limit": 1}) 502 | 503 | assert frame.to_dicts() == [{"baz": float("inf")}] 504 | 505 | 506 | def test_invalid_alias_target_string_with_invalid_character(calculation_frame): 507 | with pytest.raises(MalformedQueryException): 508 | calculation_frame.query({"select": [["=", "ba/r", 1]]}) 509 | 510 | 511 | def test_invalid_alias_target_non_string(calculation_frame): 512 | with pytest.raises(MalformedQueryException): 513 | calculation_frame.query({"select": [["=", 23, 1]]}) 514 | 515 | 516 | def test_aliasing_does_not_overwrite_original_qframe(calculation_frame): 517 | frame = calculation_frame.query({"select": [["=", "baz", "foo"]]}) 518 | assert list(frame.columns.values) == ['baz'] 519 | assert 'baz' not in list(calculation_frame.df.columns.values) 520 | 521 | 522 | def test_cannot_mix_aliasing_and_aggregation_expressions(calculation_frame): 523 | with pytest.raises(MalformedQueryException): 524 | calculation_frame.query({"select": [["=", "bar", 1], ["max", "foo"]], 525 | "group_by": ["bar"]}) 526 | 527 | 528 | def test_aliasing_with_wrong_number_of_parameters_in_function(calculation_frame): 529 | with pytest.raises(MalformedQueryException): 530 | calculation_frame.query({"select": [["=", "baz", ["+", "bar", "foo", "foo"]]]}) 531 | 532 | 533 | def test_aliasing_with_unknown_function(calculation_frame): 534 | with pytest.raises(MalformedQueryException): 535 | calculation_frame.query({"select": [["=", "baz", ["?", "bar", "foo"]]]}) 536 | 537 | 538 | def test_aliasing_with_unknown_function_2(calculation_frame): 539 | with pytest.raises(MalformedQueryException): 540 | calculation_frame.query({"select": [["=", "baz", ["zin", "bar"]]]}) 541 | 542 | 543 | def test_aliasing_with_invalid_arity(calculation_frame): 544 | with pytest.raises(MalformedQueryException): 545 | calculation_frame.query({"select": [["=", "baz", ["zin", "bar"], "foobar"]]}) 546 | 547 | 548 | def test_multiple_aggregation_functions_without_group_by(calculation_frame): 549 | frame = calculation_frame.query({"select": [["max", "bar"], ["min", "foo"]]}) 550 | assert frame.to_dicts() == [{"bar": 33, "foo": 1}] 551 | 552 | 553 | def test_cannot_mix_aggregation_functions_and_columns_without_group_by(calculation_frame): 554 | with pytest.raises(MalformedQueryException): 555 | calculation_frame.query({"select": [["max", "bar"], "foo"]}) 556 | 557 | 558 | ################# Sub queries ################### 559 | 560 | 561 | @pytest.fixture 562 | def subselect_frame(): 563 | data = """ 564 | foo,bar 565 | 1,10 566 | 1,15 567 | 5,50""" 568 | 569 | return QFrame.from_csv(data) 570 | 571 | 572 | def test_alias_aggregation_from_sub_select(subselect_frame): 573 | frame = subselect_frame.query({"select": [["=", "foo_pct", 574 | ["*", 100, ["/", "foo", "bar"]]]], 575 | "from": 576 | {"select": ["foo", ["sum", "bar"]], 577 | "group_by": ["foo"]}}) 578 | 579 | assert frame.to_dicts() == [ 580 | {"foo_pct": 4.0}, 581 | {"foo_pct": 10.0} 582 | ] 583 | 584 | 585 | ################ Enums ######################## 586 | 587 | @pytest.fixture 588 | def enum_data(): 589 | return """ 590 | foo,bar 591 | ccc,10 592 | ccc,11 593 | ccc,12 594 | ccc,13 595 | ccc,14 596 | ccc,15 597 | ccc,16 598 | bbb,20 599 | aaa,25""" 600 | 601 | 602 | @pytest.fixture 603 | def 
enum_frame(enum_data): 604 | return QFrame.from_csv(enum_data, column_types={'foo': 'category'}) 605 | 606 | 607 | def test_enum_basic_sorting(enum_frame): 608 | assert enum_frame.query({'order_by': ['foo']}).to_dicts() == [ 609 | {'foo': 'aaa', 'bar': 25}, 610 | {'foo': 'bbb', 'bar': 20}, 611 | {'foo': 'ccc', 'bar': 10}, 612 | {'foo': 'ccc', 'bar': 11}, 613 | {'foo': 'ccc', 'bar': 12}, 614 | {'foo': 'ccc', 'bar': 13}, 615 | {'foo': 'ccc', 'bar': 14}, 616 | {'foo': 'ccc', 'bar': 15}, 617 | {'foo': 'ccc', 'bar': 16}, 618 | ] 619 | 620 | 621 | def test_enum_filter_by_equality(enum_frame): 622 | assert enum_frame.query({'where': ['==', 'foo', '"bbb"']}).to_dicts() == [ 623 | {'foo': 'bbb', 'bar': 20}, 624 | ] 625 | 626 | 627 | def test_enum_filter_by_order_comparison_not_possible(enum_frame): 628 | with pytest.raises(MalformedQueryException): 629 | enum_frame.query({'where': ['<', 'foo', '"bbb"']}) 630 | 631 | 632 | def test_enum_size(enum_frame, enum_data): 633 | # Space savings should be possible using categoricals 634 | # when multiple rows containing the same value exist. 635 | frame = QFrame.from_csv(enum_data) 636 | assert enum_frame.byte_size() < frame.byte_size() 637 | 638 | 639 | def test_enum_from_dicts(enum_frame): 640 | cat_frame = QFrame.from_dicts(enum_frame.to_dicts(), column_types={'foo': 'category'}) 641 | frame = QFrame.from_dicts(enum_frame.to_dicts()) 642 | 643 | assert cat_frame.byte_size() < frame.byte_size() 644 | 645 | 646 | ############# NaN ############### 647 | 648 | 649 | def test_like_ignores_nan_values(): 650 | f = QFrame.from_csv(""" 651 | foo,bar 652 | aaa,xyz 653 | bbb,""") 654 | 655 | assert f.query({'where': ['ilike', 'bar', '"ccc"']}).to_dicts() == [] 656 | 657 | 658 | def test_only_empty_string_is_nan(): 659 | f = QFrame.from_csv(""" 660 | foo,bar 661 | aaa,N/A 662 | aaa,n/a 663 | aaa,NA 664 | aaa,na 665 | aaa,nan 666 | aaa,NaN 667 | aaa,-NaN 668 | aaa,null 669 | aaa,NULL 670 | bbb,""") 671 | 672 | assert json.loads(f.query({'select': ['bar']}).to_json()) == [ 673 | {"bar": "N/A"}, 674 | {"bar": "n/a"}, 675 | {"bar": "NA"}, 676 | {"bar": "na"}, 677 | {"bar": "nan"}, 678 | {"bar": "NaN"}, 679 | {"bar": "-NaN"}, 680 | {"bar": "null"}, 681 | {"bar": "NULL"}, 682 | {"bar": None}, 683 | ] 684 | 685 | 686 | ################# Update ###################### 687 | 688 | 689 | def assert_column(column, frame, expected): 690 | assert [d[column] for d in frame.to_dicts()] == expected 691 | 692 | 693 | def test_basic_update(basic_frame): 694 | basic_frame.query({'update': [['bar', 2.0], ['baz', 0]], 695 | 'where': ['==', 'foo', '"bbb"']}) 696 | 697 | assert basic_frame.to_dicts()[0]['bar'] == 2.0 698 | assert basic_frame.to_dicts()[0]['baz'] == 0 699 | 700 | 701 | def test_basic_update_function_based_on_current_value_of_column(basic_frame): 702 | basic_frame.query({'update': [['+', 'bar', 2.0]], 703 | 'where': ['==', 'foo', '"bbb"']}) 704 | 705 | assert basic_frame.to_dicts()[0]['bar'] == 3.25 706 | 707 | 708 | def test_unknown_update_function(basic_frame): 709 | with pytest.raises(MalformedQueryException): 710 | basic_frame.query({'update': [['_', 'bar', 2.0]], 711 | 'where': ['==', 'foo', '"bbb"']}) 712 | 713 | 714 | def test_update_is_null(basic_frame): 715 | basic_frame.query({'update': [['baz', 19]], 716 | 'where': ['isnull', 'bar']}) 717 | 718 | assert_column('baz', basic_frame, [5, 7, 19]) 719 | 720 | 721 | def test_update_is_null_invalid_argument_number(basic_frame): 722 | with pytest.raises(MalformedQueryException): 723 | basic_frame.query({'update': [['baz',
19]], 724 | 'where': ['isnull', 9]}) 725 | 726 | 727 | def test_update_in(basic_frame): 728 | basic_frame.query({'update': [['baz', 19]], 729 | 'where': ['in', 'foo', ["'aaa'", "'bbb'"]]}) 730 | 731 | assert_column('baz', basic_frame, [19, 19, 9]) 732 | 733 | 734 | def test_update_in_invalid_arg_count(basic_frame): 735 | with pytest.raises(MalformedQueryException): 736 | basic_frame.query({'update': [['baz', 19]], 737 | 'where': ['in', 'foo', 'bar', ["'aaa'", "'bbb'"]]}) 738 | 739 | 740 | def test_update_in_unknown_column(basic_frame): 741 | with pytest.raises(MalformedQueryException): 742 | basic_frame.query({'update': [['baz', 19]], 743 | 'where': ['in', 'unknown', ["'aaa'", "'bbb'"]]}) 744 | 745 | 746 | def test_update_in_second_arg_not_a_list(basic_frame): 747 | with pytest.raises(MalformedQueryException): 748 | basic_frame.query({'update': [['baz', 19]], 749 | 'where': ['in', 'foo', 'boo']}) 750 | 751 | 752 | def test_unknown_clause_in_query(basic_frame): 753 | try: 754 | basic_frame.query({'foo': []}) 755 | assert False 756 | except MalformedQueryException as e: 757 | print str(e) 758 | assert 'foo' in str(e) 759 | 760 | 761 | ################### Performance #################### 762 | 763 | 764 | @pytest.fixture 765 | def large_frame(): 766 | d = 1000000 * [{'aaa': 123456789, 'bbb': 'abcdefghijklmnopqrvwxyz', 'ccc': 1.23456789}] 767 | return QFrame.from_dicts(d) 768 | 769 | 770 | @contextmanager 771 | def timeit(name): 772 | t0 = time.time() 773 | yield 774 | print('\n{name} duration: {duration} s'.format(name=name, duration=time.time()-t0)) 775 | 776 | 777 | @pytest.mark.benchmark 778 | def test_large_frame_csv(large_frame): 779 | with timeit('to_csv'): 780 | csv_string = large_frame.to_csv() 781 | 782 | with timeit('from_csv'): 783 | QFrame.from_csv(csv_string) 784 | 785 | # Results: 786 | # to_csv duration: 2.43983101845 s 787 | # from_csv duration: 0.532874107361 s 788 | 789 | 790 | @pytest.mark.benchmark 791 | def test_large_frame_json(large_frame): 792 | with timeit('to_json'): 793 | large_frame.to_json() 794 | 795 | # with timeit('from_json'): 796 | # QFrame.from_json(json_string) 797 | 798 | # to_json duration: 0.792788982391 s 799 | # from_json duration: 3.07192707062 s (this implementation no longer exists) 800 | 801 | 802 | @pytest.mark.benchmark 803 | @pytest.mark.skipif(True, reason="No implementation") 804 | def test_large_frame_msgpack(large_frame): 805 | # NOTE: This implementation does not exist but once did as an experiment. 806 | # This test is left as a reference and reminder. 807 | with timeit('to_msgpack'): 808 | msgpack_string = large_frame.to_msgpack() 809 | 810 | with timeit('from_msgpack'): 811 | QFrame.from_msgpack(msgpack_string) 812 | 813 | # These numbers explain why there is no msgpack implementation 814 | # to_msgpack duration: 7.02977800369 s 815 | # from_msgpack duration: 1.52387404442 s 816 | 817 | # It's not because msgpack is slow (it's fast), it's because the 818 | # code has to first create a list of python dicts and then serialize 819 | # that using msgpack rather than serializing the dataframe to msgpack 820 | # directly. 821 | 822 | # Not yet covered: 823 | # Disjunction and conjunction (see the sketch below) 824 | # Refactor tests to check the complete column, not just the row that is supposed to be affected 825 | # Mix self-referring updates and assignments in same update 826 | # Any way to merge the filter code for select and update (is the update version as performant as the where)?
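# A sketch of what the conjunction case could look like once covered, reusing
# the '&' operator that the where clause already supports for selects
# (hypothetical expected values, mirroring test_basic_update above):
#
#   basic_frame.query({'update': [['bar', 2.0]],
#                      'where': ['&', ['==', 'foo', '"bbb"'], ['<', 'baz', 6]]})
#   assert basic_frame.to_dicts()[0]['bar'] == 2.0  # other rows unchanged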
827 | 828 | 829 | def xtest_update_with_conjunction(basic_frame): 830 | basic_frame.query({'update': [['bar', 2.0]], 831 | 'where': ['==', 'foo', '"bbb"']}) 832 | 833 | assert basic_frame.to_dicts()[0]['bar'] == 3.25 834 | -------------------------------------------------------------------------------- /test/test_statistics.py: -------------------------------------------------------------------------------- 1 | from qcache.statistics import Statistics 2 | 3 | 4 | def test_ring_buffer_size(): 5 | s = Statistics(buffer_size=3) 6 | s.append('foo', 1) 7 | s.append('foo', 2) 8 | s.append('foo', 3) 9 | 10 | assert list(s.stats['foo']) == [1, 2, 3] 11 | 12 | s.append('foo', 4) 13 | assert list(s.stats['foo']) == [2, 3, 4] 14 | -------------------------------------------------------------------------------- /tls/ca-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "signing": { 3 | "default": { 4 | "expiry": "2000000h", 5 | "usages": [ 6 | "signing", 7 | "key encipherment", 8 | "server auth", 9 | "client auth" 10 | ] 11 | } 12 | } 13 | } -------------------------------------------------------------------------------- /tls/ca-key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEowIBAAKCAQEA5lS6l+dBf1DfkY4RoUXnennEMkzTD+SdRnGMUt3LvnkPjHdz 3 | pb0YQFQ+xLUEEosq4B9S74l0GFM+Y6/vrg+lhpDu8VfWNv4Py7j8tPDA8sLzFqbu 4 | pi+uRLu5FQPd0+HER+vcDvFWrcEM7YNHBHZC+6WH7nnRewzKgQgaQSc9W5megV3h 5 | ToHvAGrcwMK78dQEWtG8+QaprREFEFE279q7K7pTychDMidA+LvW1nSOImwapBhv 6 | lH04PKJ26eWx6SbC3ssS6uMgjJJZVsJrRs4vUz3D86gDbm2/+uSBPBh7zumuKTSH 7 | f78pFIOSd5Ayj9YAnlyft4laGfOFYj0nWyH+SQIDAQABAoIBACdQOvEGhcVnVmGP 8 | sC6osdPTyMsVFclWqgmGA/943/fIzgCZuuGFHwiL2JLWmrVXj5/leNBt06T7QGDf 9 | fLdm8EfBoScMaQHF54hMMMXpeeV1pOwu3fj7lnEg4XxWxpwNouTruwnJ45OQqdY4 10 | W/zE+rXdERCBNmIcUswnR+PrAKwLf2tAH8g9DN0jqilO8KJixmyYM/stfpEC08Q2 11 | N8wLY8zcHMEX6+IIgG4Ok76fKhGQ/gJKaow8fVIAw35Qe0BpV30fbS+mhjHVuPnl 12 | YW1idBDPItCGvBJs3kVS+tTZB9AFmJbHZj9GwOo+0D6ZVPSeXUDFy+czVHI9wL2T 13 | JZtrcgECgYEA/bfGb/siIZ/yG6YND2iSQNO3ip3QLvRzRhzLZCfTdfOax4dPUaFh 14 | xf9ZDB7Qn9Ki7JL4iSpCxbjOUhjAOwjE5N+hdGlqwMWUSJ4eGNQM0Vb98VDLrgsq 15 | FkOxatvUCaW0nqzIQdYMLkef0TnYQw0M+j3hg+t+9dcJzA4zS7A2TakCgYEA6Gca 16 | DQgyyJy9nFPaYk9qTH2RR+3I//NPI3ETBzrAGWkCWsDJS8g814Ln/dERibBuVU86 17 | D+ougtgXk6agDOUhAuC/1YtRVadSCZbyJityMeydHR033OP/fkYpPjOBiSjyOFn1 18 | D0VYWQVgvt/Z0RLvbh1m0PvMPfXf/3cxj4lkT6ECgYAFNvjCJnQ+Iq50OQZ9sZWH 19 | 9ZIJLFMyE94mq8LWbScgfoBI55QOxnVe/2+SGzQIhOjKWf73usGilLjQ4SdaT0TU 20 | u2/zF5OVILp6f514vysARnxzsEhvbFVSHdQQsTH7fMdol36KM98OOHSldT3nquYA 21 | YrM25ek3HlNaOVR+ksGa+QKBgQDEUFNxLru3Oq/wneSbpvnkIy3V1Mc1bhIrnhi0 22 | wqwCyvFyN+fSXBMI+Ut+3Fw0MxUAeyxQxUEExgUkdFw+iE6aX7+sY0MRV4W1FAz4 23 | sTqFcZpGPagyr2XjBOFR6bBCbJQvhc28WJeIm0Jd+jnEonoeSjfP0ON2c3wEEGuN 24 | FEHoAQKBgBF7DEcTo0xeUan/FCLYfdg8Hg/VnEYJUYttg4nOG7dVj35wvCnRTZ+9 25 | mmKpAjnh8Nqk2gTJHBBtzZ3I4L2uoUZocJnOfPjduhRi2A9KIrU0gg2clKKE+xAp 26 | DeEvLUKeaf4hPFRdAgy+yS9+pYwZdObMtr8SWm3FgRNmt4LRrenC 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /tls/ca.csr: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE REQUEST----- 2 | MIIC2zCCAcMCAQAwaTELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWEx 3 | FjAUBgNVBAcTDVNhbiBGcmFuY2lzY28xHzAdBgNVBAoTFkludGVybmV0IFdpZGdl 4 | dHMsIEluYy4xDDAKBgNVBAsTA1dXVzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCC 5 | 
AQoCggEBAOZUupfnQX9Q35GOEaFF53p5xDJM0w/knUZxjFLdy755D4x3c6W9GEBU 6 | PsS1BBKLKuAfUu+JdBhTPmOv764PpYaQ7vFX1jb+D8u4/LTwwPLC8xam7qYvrkS7 7 | uRUD3dPhxEfr3A7xVq3BDO2DRwR2Qvulh+550XsMyoEIGkEnPVuZnoFd4U6B7wBq 8 | 3MDCu/HUBFrRvPkGqa0RBRBRNu/auyu6U8nIQzInQPi71tZ0jiJsGqQYb5R9ODyi 9 | dunlsekmwt7LEurjIIySWVbCa0bOL1M9w/OoA25tv/rkgTwYe87prik0h3+/KRSD 10 | kneQMo/WAJ5cn7eJWhnzhWI9J1sh/kkCAwEAAaAtMCsGCSqGSIb3DQEJDjEeMBww 11 | GgYDVR0RBBMwEYIJbG9jYWxob3N0hwR/AAABMA0GCSqGSIb3DQEBCwUAA4IBAQDE 12 | eszCFstYVQlcetz3nY/MzJke+I6v4KCo6oyAzzB2OeMXedI0QFLyVqBKVMKaO1yB 13 | 49HmUbrubJaWJlJKNh067KnndcFgZiU1pTZ6yTC4UIVWxGOJoF3SFkgAjl57CVrE 14 | KdhaJH/+YM5c65ck6IzLH7EymYhnC0n4xDT6nP6kiq0b6jxjxx6P1Fz6+iw16Y5R 15 | 8f1HFA8RniZK5bmN1OY/ivPowJZKdobKRqWyIF9oynsiawRAOG20Z+Qo1P51bYNc 16 | jx3nc70vsZ1JfbkSLvrImC/7nLMIuxMS076m3e7WU1p2IQXKPf4tPD8VFz0gsANn 17 | yViBz/BDo+SoElTFV6yG 18 | -----END CERTIFICATE REQUEST----- 19 | -------------------------------------------------------------------------------- /tls/ca.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDszCCApugAwIBAgIUOjMlWz73JcqtOoRMekH2Ff8sj6owDQYJKoZIhvcNAQEL 3 | BQAwaTELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWExFjAUBgNVBAcT 4 | DVNhbiBGcmFuY2lzY28xHzAdBgNVBAoTFkludGVybmV0IFdpZGdldHMsIEluYy4x 5 | DDAKBgNVBAsTA1dXVzAeFw0xOTAxMDUxOTM5MDBaFw0yNDAxMDQxOTM5MDBaMGkx 6 | CzAJBgNVBAYTAlVTMRMwEQYDVQQIEwpDYWxpZm9ybmlhMRYwFAYDVQQHEw1TYW4g 7 | RnJhbmNpc2NvMR8wHQYDVQQKExZJbnRlcm5ldCBXaWRnZXRzLCBJbmMuMQwwCgYD 8 | VQQLEwNXV1cwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDmVLqX50F/ 9 | UN+RjhGhRed6ecQyTNMP5J1GcYxS3cu+eQ+Md3OlvRhAVD7EtQQSiyrgH1LviXQY 10 | Uz5jr++uD6WGkO7xV9Y2/g/LuPy08MDywvMWpu6mL65Eu7kVA93T4cRH69wO8Vat 11 | wQztg0cEdkL7pYfuedF7DMqBCBpBJz1bmZ6BXeFOge8AatzAwrvx1ARa0bz5Bqmt 12 | EQUQUTbv2rsrulPJyEMyJ0D4u9bWdI4ibBqkGG+UfTg8onbp5bHpJsLeyxLq4yCM 13 | kllWwmtGzi9TPcPzqANubb/65IE8GHvO6a4pNId/vykUg5J3kDKP1gCeXJ+3iVoZ 14 | 84ViPSdbIf5JAgMBAAGjUzBRMA4GA1UdDwEB/wQEAwIBBjAPBgNVHRMBAf8EBTAD 15 | AQH/MB0GA1UdDgQWBBS8CeQioSIBM+9UI/Fu+d4YjH9CYjAPBgNVHREECDAGhwR/ 16 | AAABMA0GCSqGSIb3DQEBCwUAA4IBAQBfJygLqN/A/rwjoD5gsdbALG26G1/Ei3wV 17 | V4HliQdExQBKCpb02TS2EPKZn0CYEW3WgnEuQ8TZMubwH07OrKFpQqjGnGHNbO7E 18 | u7thzvK1Sj2Wyr+Gml3EDRJw//cFTi11/Mu9zxj9uZyDt3z96y9GIrEZt1uAaNnf 19 | +HoWd5VnXVbZtzDlPxzSU/943XpHz5nPOdRw4zHYZ2ftcmL5ihecONRHPrTze5F1 20 | hyCIWbaTJSe7D90uO4RoA6jiCtiweF01SBVjN6ELT5Deyohxi2e+ctBffefIa7IJ 21 | hQ8fXTjRY+SOcCdYo3d7PnsnhMITysJYmnF3EPjKs8UZo3niuC3A 22 | -----END CERTIFICATE----- 23 | -------------------------------------------------------------------------------- /tls/csr.json: -------------------------------------------------------------------------------- 1 | { 2 | "hosts": [ 3 | "localhost", 4 | "127.0.0.1" 5 | ], 6 | "key": { 7 | "algo": "rsa", 8 | "size": 2048 9 | }, 10 | "names": [ 11 | { 12 | "C": "US", 13 | "L": "San Francisco", 14 | "O": "Internet Widgets, Inc.", 15 | "OU": "WWW", 16 | "ST": "California" 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /tls/generate_test_certs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Small script to generate certs for testing of TLS 4 | # Uses cfssl for cert generation: https://github.com/cloudflare/cfssl 5 | # go get -u github.com/cloudflare/cfssl/cmd/cfssl 6 | # go get -u github.com/cloudflare/cfssl/cmd/cfssljson 7 | 8 | cfssl genkey -initca csr.json | cfssljson -bare ca 9 | cfssl gencert -ca ca.pem -ca-key ca-key.pem 
-config ca-conf.json csr.json | cfssljson -bare host 10 | cat host-key.pem >> host.pem 11 | -------------------------------------------------------------------------------- /tls/host-key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEowIBAAKCAQEArqbqq7q9RFEB8qP+sY41rJN5S8s3SJesrneCuo2fpbJQnvqZ 3 | +d8JRhb48PpNQg1xUNzAZi2yGNgjD8qvkq29T/bQmqxCvd7sddZt5tIPfL6nGNea 4 | GXDobiIXgk3OuDwF6LINbdpaTi50s3mxiZEpFWabr9106ZYqvoWmliWegYOKPKf/ 5 | In1Bc/DvoeG5nc2xOT3VjzAz1D4PjdoEn2N99MDwzEkTVs4ErfG+bpsJtaIgNeJf 6 | M0V9TEiQ49/njQygBomdQodw9wy6d1JaaEG6T0sxSscTPeissgScU66JsJdzBiyb 7 | fhjsJFXceMWx52z7BOnpaT61HrY5AVLFmCZsowIDAQABAoIBAQCdsI+JZh2lJQ8B 8 | KRi475F+s9h8morbvDZpf/ZEwcI3NPydzd5gQQR6iBtUWGqRDNoXDHmax/9ZdyyR 9 | AYvsf3nCTTKjtIy+KhyNeIHaOidJlkoAoAm+lrcFWTqop9/RcEBVjQ2a9d87X652 10 | rZWig7H4ZKCE9QquLKuDQeDbCDRI8iCSUwHw69Os0o0xjVjwIEUfrrTAqdCW68db 11 | JxU+2qbJp6QLnQRA2tg6EEXTRuu1YydDQpVk6LWSQl4aDJvt+NmUdLQ9641oq9+U 12 | vEJStGRI/7dUvkIXHWmSjCWYUPufVFMnpiGCnVdOx+pT+7kyX4qkFkd+NRZCCRv7 13 | pjaaPYnhAoGBAOJYImG6o2ZUf6dpC7Qaju0ht/zUwzpnHhxyUEFJMn01E7n98vBS 14 | gwQjacuyEpuiRbZ2eCzXIVYUvJXRjZ6r49ICUjV3wCDcFJGgotovG5ROLDJ4uePZ 15 | j6pExC6PWOYkYBbMDMDcHINMPUMrSCt1q+IWLYwBnw0s3rrcU081G7MVAoGBAMWI 16 | 9d/XeyP1cyRzfZjYXZlUiu656aenmTMFdo1eq+qqZc3nROJAZhkKsXtZDstMhqK5 17 | Zt8JRgJgyTkpbgi7928kZQ3lZTGYRde+E+28n9PiQ/DwhQNprhAdlkxHYLbyIOMS 18 | TIgiYVyFp/hPa2+TIiRj8R0hTxjVB+eeP3c3MG7XAoGAZ4jdIUsYV0Srp85bNiU3 19 | 36ye1p4UN3DXyzdXEqYC3FcdEOTi7Z8wowH38N8ht+NAdDg4vHojm5actQNH/p6t 20 | 9XBuXlrKnb1OA1cxZxBJU0t22Bd1II5yMvaFrq6PgaZ99a+c3dNRj5WeKE1yE78d 21 | wqtWZkaPlJ1DvMHvsbmfVAkCgYBaiA7H7CYPsOp4hkXFy0P6hfi2uJYRtIpOC+7t 22 | k7oO7tGBsMCtQP6J85CsC4DwQ75gzcAL8GAZruoKPKalciBQ28lEuSHLvRIlcoQH 23 | rS9DGKwyvMj5a3HPCQBdLUlyDz5rU3On4LnmgYseDqgalsBLFVLoGt/5bYkV6j0E 24 | ElMsOQKBgHpJ3GIHB0GBW3knMU7UQypMB3C7845YPRMvXexJuINSar0Uw50HgyYL 25 | gPx+lWV23pMO2dksb+BcrbdrlFAlrOZuPg7rzLKP2J/wEQiM2OqdjOXDZ+C1D6yn 26 | 0tPFFneXGUe31tgrMuSWrUoHU6zpurAfYZuE0TJ0+7ud3Nm0q++P 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /tls/host.csr: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE REQUEST----- 2 | MIIC2zCCAcMCAQAwaTELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWEx 3 | FjAUBgNVBAcTDVNhbiBGcmFuY2lzY28xHzAdBgNVBAoTFkludGVybmV0IFdpZGdl 4 | dHMsIEluYy4xDDAKBgNVBAsTA1dXVzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCC 5 | AQoCggEBAK6m6qu6vURRAfKj/rGONayTeUvLN0iXrK53grqNn6WyUJ76mfnfCUYW 6 | +PD6TUINcVDcwGYtshjYIw/Kr5KtvU/20JqsQr3e7HXWbebSD3y+pxjXmhlw6G4i 7 | F4JNzrg8BeiyDW3aWk4udLN5sYmRKRVmm6/ddOmWKr6FppYlnoGDijyn/yJ9QXPw 8 | 76HhuZ3NsTk91Y8wM9Q+D43aBJ9jffTA8MxJE1bOBK3xvm6bCbWiIDXiXzNFfUxI 9 | kOPf540MoAaJnUKHcPcMundSWmhBuk9LMUrHEz3orLIEnFOuibCXcwYsm34Y7CRV 10 | 3HjFseds+wTp6Wk+tR62OQFSxZgmbKMCAwEAAaAtMCsGCSqGSIb3DQEJDjEeMBww 11 | GgYDVR0RBBMwEYIJbG9jYWxob3N0hwR/AAABMA0GCSqGSIb3DQEBCwUAA4IBAQCS 12 | fUoC/KrOQDiYEk+z/RXGDzDDZIQu02d9hJVhu8+UIgCUInwfgi4L43/cIJWdrdVp 13 | NmJ+EK+0tHHvNYbaNHeQggYVz/b+geyb7rH8dv+6VFRzfvidHJk9l7I1wqKn4CMQ 14 | BrfQMfeqbFJrOoJjEoKkNB5SgW8SGjTl2DRFgmmYFeb/Y3YBWa5sf0/otmVNpk2O 15 | G7Rw7aXrJPNloX+4tPoIuHM6A3u1h3NKhCmkuSESBd3/VOQP3wBdQCEaVsNGSyQr 16 | +/Rx3HQmJiVEBo+tjaKI9mx5IzMm5khH7jdKBmuOIZVTzhFHZoE82Tl01HlXSN7J 17 | ZIRHwKYJ3E9NEWMgJ7SE 18 | -----END CERTIFICATE REQUEST----- 19 | -------------------------------------------------------------------------------- /tls/host.pem: 
-------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIID3DCCAsSgAwIBAgIUD5T+wDbRhV3OEwkguwxlfTXVTTMwDQYJKoZIhvcNAQEL 3 | BQAwaTELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWExFjAUBgNVBAcT 4 | DVNhbiBGcmFuY2lzY28xHzAdBgNVBAoTFkludGVybmV0IFdpZGdldHMsIEluYy4x 5 | DDAKBgNVBAsTA1dXVzAgFw0xOTAxMDUxOTM5MDBaGA8yMjQ3MDMwNTAzMzkwMFow 6 | aTELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWExFjAUBgNVBAcTDVNh 7 | biBGcmFuY2lzY28xHzAdBgNVBAoTFkludGVybmV0IFdpZGdldHMsIEluYy4xDDAK 8 | BgNVBAsTA1dXVzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAK6m6qu6 9 | vURRAfKj/rGONayTeUvLN0iXrK53grqNn6WyUJ76mfnfCUYW+PD6TUINcVDcwGYt 10 | shjYIw/Kr5KtvU/20JqsQr3e7HXWbebSD3y+pxjXmhlw6G4iF4JNzrg8BeiyDW3a 11 | Wk4udLN5sYmRKRVmm6/ddOmWKr6FppYlnoGDijyn/yJ9QXPw76HhuZ3NsTk91Y8w 12 | M9Q+D43aBJ9jffTA8MxJE1bOBK3xvm6bCbWiIDXiXzNFfUxIkOPf540MoAaJnUKH 13 | cPcMundSWmhBuk9LMUrHEz3orLIEnFOuibCXcwYsm34Y7CRV3HjFseds+wTp6Wk+ 14 | tR62OQFSxZgmbKMCAwEAAaN6MHgwDgYDVR0PAQH/BAQDAgWgMB0GA1UdJQQWMBQG 15 | CCsGAQUFBwMBBggrBgEFBQcDAjAMBgNVHRMBAf8EAjAAMB0GA1UdDgQWBBQIYEWF 16 | Sah+UG/boeaUl2LQs/qSLTAaBgNVHREEEzARgglsb2NhbGhvc3SHBH8AAAEwDQYJ 17 | KoZIhvcNAQELBQADggEBADgI4NmlGugR8n3qgjQtmKZI97vtFlcKjLn2cx2K7XOr 18 | LOHMRXpxfpQH7X5VJE3sppPPenCYg7x23S5Cs4Lwy6PTgBZQEZwr5sEFpVcuhEbW 19 | jrF7IK43kXZuZz0qInEFvC2MW6jS6crYn0g8401mrmD24Zkg211HeGOhgDIRa9pZ 20 | lAD4fEynoXUmmEF6wrhMWOpZBY7VAecTXko+j96IyfrVW+1kl7mstCmgcIUQHEFl 21 | 1XnxbEp1HwQLdkA32KgDP07pAIeDMhFQjSZ7Fcqr8y96KNbuL3tmS+EAQ+JbTCLG 22 | F88HXUhSFfop+++aPuNdo/QdkHED9zJ0j8oVueBWPMg= 23 | -----END CERTIFICATE----- 24 | -----BEGIN RSA PRIVATE KEY----- 25 | MIIEowIBAAKCAQEArqbqq7q9RFEB8qP+sY41rJN5S8s3SJesrneCuo2fpbJQnvqZ 26 | +d8JRhb48PpNQg1xUNzAZi2yGNgjD8qvkq29T/bQmqxCvd7sddZt5tIPfL6nGNea 27 | GXDobiIXgk3OuDwF6LINbdpaTi50s3mxiZEpFWabr9106ZYqvoWmliWegYOKPKf/ 28 | In1Bc/DvoeG5nc2xOT3VjzAz1D4PjdoEn2N99MDwzEkTVs4ErfG+bpsJtaIgNeJf 29 | M0V9TEiQ49/njQygBomdQodw9wy6d1JaaEG6T0sxSscTPeissgScU66JsJdzBiyb 30 | fhjsJFXceMWx52z7BOnpaT61HrY5AVLFmCZsowIDAQABAoIBAQCdsI+JZh2lJQ8B 31 | KRi475F+s9h8morbvDZpf/ZEwcI3NPydzd5gQQR6iBtUWGqRDNoXDHmax/9ZdyyR 32 | AYvsf3nCTTKjtIy+KhyNeIHaOidJlkoAoAm+lrcFWTqop9/RcEBVjQ2a9d87X652 33 | rZWig7H4ZKCE9QquLKuDQeDbCDRI8iCSUwHw69Os0o0xjVjwIEUfrrTAqdCW68db 34 | JxU+2qbJp6QLnQRA2tg6EEXTRuu1YydDQpVk6LWSQl4aDJvt+NmUdLQ9641oq9+U 35 | vEJStGRI/7dUvkIXHWmSjCWYUPufVFMnpiGCnVdOx+pT+7kyX4qkFkd+NRZCCRv7 36 | pjaaPYnhAoGBAOJYImG6o2ZUf6dpC7Qaju0ht/zUwzpnHhxyUEFJMn01E7n98vBS 37 | gwQjacuyEpuiRbZ2eCzXIVYUvJXRjZ6r49ICUjV3wCDcFJGgotovG5ROLDJ4uePZ 38 | j6pExC6PWOYkYBbMDMDcHINMPUMrSCt1q+IWLYwBnw0s3rrcU081G7MVAoGBAMWI 39 | 9d/XeyP1cyRzfZjYXZlUiu656aenmTMFdo1eq+qqZc3nROJAZhkKsXtZDstMhqK5 40 | Zt8JRgJgyTkpbgi7928kZQ3lZTGYRde+E+28n9PiQ/DwhQNprhAdlkxHYLbyIOMS 41 | TIgiYVyFp/hPa2+TIiRj8R0hTxjVB+eeP3c3MG7XAoGAZ4jdIUsYV0Srp85bNiU3 42 | 36ye1p4UN3DXyzdXEqYC3FcdEOTi7Z8wowH38N8ht+NAdDg4vHojm5actQNH/p6t 43 | 9XBuXlrKnb1OA1cxZxBJU0t22Bd1II5yMvaFrq6PgaZ99a+c3dNRj5WeKE1yE78d 44 | wqtWZkaPlJ1DvMHvsbmfVAkCgYBaiA7H7CYPsOp4hkXFy0P6hfi2uJYRtIpOC+7t 45 | k7oO7tGBsMCtQP6J85CsC4DwQ75gzcAL8GAZruoKPKalciBQ28lEuSHLvRIlcoQH 46 | rS9DGKwyvMj5a3HPCQBdLUlyDz5rU3On4LnmgYseDqgalsBLFVLoGt/5bYkV6j0E 47 | ElMsOQKBgHpJ3GIHB0GBW3knMU7UQypMB3C7845YPRMvXexJuINSar0Uw50HgyYL 48 | gPx+lWV23pMO2dksb+BcrbdrlFAlrOZuPg7rzLKP2J/wEQiM2OqdjOXDZ+C1D6yn 49 | 0tPFFneXGUe31tgrMuSWrUoHU6zpurAfYZuE0TJ0+7ud3Nm0q++P 50 | -----END RSA PRIVATE KEY----- 51 | -------------------------------------------------------------------------------- /util/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'tobias' 2 | -------------------------------------------------------------------------------- /util/memory_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Rough script used to measure insert and query performance + memory usage. 3 | 4 | For memory usage estimates, ps_mem 5 | (http://github.com/pixelb/scripts/commits/master/scripts/ps_mem.py) was used. 6 | 7 | Results 8 | 9 | Sizes: 1000, 5000, 10000, 20000, 50000, 100000, 200000, 400000 rows 10 | Cache size: 1 Gb 11 | Insert only, 1 Gb cache configured => 1.2 Gb used 12 | Insert followed by 0 - 5 queries against some of the latest 40 inserted datasets 13 | * 2.2 - 2.5 Gb used 14 | * Query response time 7 - 55 ms observed 15 | * Cache eviction 1 - 15 ms, datasets in cache 60 - 100, dropped 1 - 11 at a time 16 | 17 | Sizes: 1000, 5000, 10000, 20000, 50000, 100000, 200000, 400000 rows 18 | Cache size: 1 Gb 19 | gc.collect() after every query 20 | Insert followed by 0 - 5 queries against some of the latest 40 inserted datasets 21 | * 1.2 - 1.3 Gb used 22 | * Query response time 22 - 65 ms observed 23 | * Cache eviction 1 - 15 ms, datasets in cache 60 - 100, dropped 1 - 11 at a time 24 | 25 | Sizes: 1000, 5000, 10000, 20000, 50000, 100000, 200000, 400000 rows 26 | Cache size: 1 Gb 27 | gc.collect() after every 10th insert 28 | Insert followed by 0 - 5 queries against some of the latest 40 inserted datasets 29 | * 1.2 - 1.3 Gb used 30 | * Insert times 600 ms - 850 ms observed 31 | * Query response time 7 - 70 ms observed 32 | * Insert times 90 ms - 1150 ms observed 33 | * Cache eviction 1 - 15 ms, datasets in cache 60 - 100, dropped 1 - 13 at a time 34 | 35 | Sizes: 1000, 5000, 10000, 15000, 20000, 30000, 40000, 50000 rows 36 | Cache size: 1 Gb 37 | gc.collect() after every 10th insert 38 | Insert followed by 0 - 5 queries against some of the latest 40 inserted datasets 39 | * 1.2 - 1.3 Gb used 40 | * Query response time 7 - 70 ms observed 41 | * Insert times 600 ms - 850 ms observed 42 | * Cache eviction 1 - 15 ms, datasets in cache 400 - 430, dropped 1 - 13 at a time 43 | Performance is quite similar to the other runs with larger datasets.
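The script below assumes a QCache instance listening on http://localhost:9401
(the URL used in main()) and runs an endless insert/query loop; stop it with
Ctrl-C once enough samples have been collected.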
44 | """ 45 | 46 | from StringIO import StringIO 47 | import csv 48 | import json 49 | import random 50 | import string 51 | import requests 52 | import time 53 | 54 | example_data_row = { 55 | 'text1': '123abc123', 'text2': 'asdfghjkl', 'some_text': 'aaaaaaaaaaaaaaa', 'a_status': 'b', 56 | 'some_number': 1234567, 'a_float': 1234.1234, 57 | 'a_class': 'qwertyuuer', 'some_label': '1234yzx', 'another_label': '1234yzx', 58 | 'classifier': 'long_classifier', 'another_class': '1', 'float1': 98765432.123, 59 | 'float2': 12345568.9876, 'description': 'a/b/c'} 60 | 61 | 62 | SELECTION = ['aaaaaaaaaaaaaaaaaaa', 63 | 'bbbbbbbbbbbbbbbbbbb', 64 | 'ccccccccccccccccccc', 65 | 'ddddddddddddddddddd', 66 | 'eeeeeeeeeeeeeeeeeee', 67 | 'fffffffffffffffffff', 68 | 'ggggggggggggggggggg', 69 | 'hhhhhhhhhhhhhhhhhhh', 70 | 'iiiiiiiiiiiiiiiiiii', 71 | 'jjjjjjjjjjjjjjjjjjj'] 72 | 73 | SOME_NUMBER = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 100, 74 | 51, 52, 53, 54, 455, 56, 57, 58, 59, 50, 511, 522, 533, 544, 555, 566, 577, 588, 599, 5100] 75 | 76 | def example_data(length): 77 | out = StringIO() 78 | writer = csv.DictWriter(out, example_data_row.keys()) 79 | writer.writeheader() 80 | for i in range(length): 81 | example_data_row['text1'] = random.choice(SELECTION) 82 | example_data_row['classifier'] = random.choice(SELECTION) 83 | example_data_row['some_number'] = random.choice(SOME_NUMBER) 84 | writer.writerow(example_data_row) 85 | 86 | return out.getvalue() 87 | 88 | 89 | def main(): 90 | print "Building datasets" 91 | datasets = [example_data(l) for l in (1000, 5000, 10000, 20000, 50000, 100000, 200000, 400000)] 92 | 93 | latest_datasets = [] 94 | while True: 95 | ds = random.choice(datasets) 96 | key = ''.join(random.choice(string.ascii_uppercase) for _ in range(6)) 97 | t0 = time.time() 98 | response = requests.post("http://localhost:9401/qcache/dataset/{key}".format(key=key), 99 | headers={'Content-type': 'text/csv'}, data=ds) 100 | print "Posted {key}={size}, response={response}, duration={duration}".format( 101 | key=key, size=len(ds), response=response.status_code, duration=time.time()-t0) 102 | 103 | # Keep the last 40 inserted 104 | latest_datasets.append(key) 105 | latest_datasets = latest_datasets[-40:] 106 | 107 | for _ in range(random.randint(0, 5)): 108 | query = dict(select=['text1', 'text2', 'a_status', 'some_number'], 109 | distinct=['text1', 'text2', 'a_status', 'some_number'], 110 | where=['==', 'classifier', "'{}'".format(random.choice(SELECTION))], 111 | limit=50) 112 | params = {'q': json.dumps(query)} 113 | 114 | ds_key = random.choice(latest_datasets) 115 | 116 | t0 = time.time() 117 | response = requests.get("http://localhost:9401/qcache/dataset/{key}".format(key=ds_key), 118 | params=params, headers={'Accept': 'application/json'}) 119 | 120 | if response.status_code == 200: 121 | print "Success length: {length}, duration: {duration}".format( 122 | status=response.status_code, length=len(json.loads(response.content)), 123 | duration=time.time()-t0) 124 | else: 125 | print "Response status: {status}, content: {content}, duration: {duration}".format( 126 | status=response.status_code, content=response.content, duration=time.time()-t0) 127 | 128 | time.sleep(0.5) 129 | 130 | if __name__ == '__main__': 131 | main() --------------------------------------------------------------------------------