├── .gitignore
├── .travis.yml
├── CHANGELOG.rst
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.rst
├── dev-requirements.txt
├── qcache
│   ├── __init__.py
│   ├── app.py
│   ├── compression.py
│   ├── dataset_cache.py
│   ├── qframe
│   │   ├── __init__.py
│   │   ├── common.py
│   │   ├── constants.py
│   │   ├── context.py
│   │   ├── pandas_filter.py
│   │   ├── query.py
│   │   └── update.py
│   └── statistics.py
├── setup.cfg
├── setup.py
├── tasks.py
├── test
│   ├── performance_run.py
│   ├── test_api.py
│   ├── test_qframe.py
│   └── test_statistics.py
├── tls
│   ├── ca-conf.json
│   ├── ca-key.pem
│   ├── ca.csr
│   ├── ca.pem
│   ├── csr.json
│   ├── generate_test_certs.sh
│   ├── host-key.pem
│   ├── host.csr
│   └── host.pem
└── util
    ├── __init__.py
    └── memory_benchmark.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
3 | # C extensions
4 | *.so
5 |
6 | # Packages
7 | *.egg
8 | *.egg-info
9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | .idea
21 |
22 | # Installer logs
23 | pip-log.txt
24 |
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | nosetests.xml
29 |
30 | # Translations
31 | *.mo
32 |
33 | # Mr Developer
34 | .mr.developer.cfg
35 | .project
36 | .pydevproject
37 |
38 | # Complexity
39 | output/*.html
40 | output/*/index.html
41 |
42 | # Sphinx
43 | docs/_build
44 | README.html
45 |
46 | .cache
47 | ps_mem
48 | htmlcov
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # Config file for automatic testing at travis-ci.org
2 |
3 | language: python
4 |
5 | python:
6 |   - "2.7"
7 |
8 | # command to install dependencies
9 | install:
10 |   - pip install -U .
11 |   - pip install -r dev-requirements.txt
12 |
13 | # command to run tests, e.g. python setup.py test
14 | script:
15 |   - invoke coverage
16 |
17 | before_install:
18 |   - pip install codecov
19 |
20 | after_success:
21 |   - codecov
--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
1 | Changelog
2 | =========
3 | 0.9.3 (2019-01-05)
4 | ------------------
5 | * Update dependencies on lz4 and tornado
6 | * Allow float type hinting
7 |
8 | 0.9.2 (2018-05-23)
9 | ------------------
10 | * Fix #15, fix cache item size at creation
11 |
12 | 0.9.1 (2017-11-15)
13 | ------------------
14 | * Downgrade to Pandas 0.20.3 in an attempt to fix #14.
15 |
16 | 0.9.0 (2017-11-14)
17 | ------------------
18 | * Numexpr filter engine is not available anymore, only Pandas. Numexpr is no longer a requirement of qcache.
19 | NB!
Major backwards incompatibility 20 | * Fix #12, like now ignores NaN 21 | * Fix #13, only empty string is considered as NaN when reading CSV 22 | * Fix #8, integer standins remain integers 23 | * Upgrade Pandas to 0.21.0 and Numpy to 0.13.1 24 | 25 | 0.8.1 (2017-04-06) 26 | ------------------ 27 | * Bump Pandas to 0.19.2 28 | 29 | 0.8.0 (2017-01-08) 30 | ------------------ 31 | * Support client cert verification 32 | 33 | 0.7.2 (2016-12-18) 34 | ------------------ 35 | * Fix #10 & #11, minor statistics improvements 36 | 37 | 0.7.1 (2016-11-30) 38 | ------------------ 39 | * Fix #9, df overwritten by mistake 40 | 41 | 0.7.0 (2016-11-09) 42 | ------------------ 43 | * Compression using LZ4 or GZIP in requests and responses (#3) 44 | * Sub queries in "in" filter (#7) 45 | * Enum type based on Pandas category type (#6) 46 | * Support for stand in columns in queries (#5) 47 | * Additional metrics/statistics for complete request duration for stores and queries 48 | * Update size estimates to do deep inspection of objects contained in dataframe. This should 49 | be more accurate than the previous shallow inspection. 50 | * Update Pandas to 0.19.1 51 | * Update Tornado to 4.4.2 52 | 53 | 0.6.1 (2016-09-18) 54 | ------------------ 55 | * Fix packaging, the new qcache.qframe package was missing from the submitted package. 56 | 57 | 0.6.0 (2016-09-18) 58 | ------------------ 59 | * New filter engine based on Pandas rather than Numexpr. This enables new types of filters in the where 60 | clause (see below). By default the old engine is still used but the new one can be enabled either 61 | by default on server startup or on a per-query basis by setting the new 'X-QCache-filter-engine' header 62 | to 'pandas'. 63 | * New bitwise filters in the 'pandas' filter engine, 'all_bits' and 'any_bits'. 64 | * New string filters, 'like' and 'ilike' which corresponds roughly to LIKE in SQL with the addition 65 | of regex support. 'like' is case sensitive while 'ilike' is case insensitive. 66 | 67 | 0.5.0 (2016-06-19) 68 | ------------------ 69 | * New header when uploading data, 'X-QCache-stand-in-columns', that let you specify default values 70 | for columns that may not be present in the uploaded data. 71 | 72 | 0.4.2 (2016-06-04) 73 | ------------------ 74 | * Additional statistics to measure for how long data remains in the cache before it's evicted. 75 | * Bump dependency versions of Pandas, Numexpr and Tornado. 76 | 77 | 0.4.1 (2016-01-31) 78 | ------------------ 79 | * Provide the duration for which statistics were collected and statistics buffer size 80 | 81 | 0.4.0 (2016-01-24) 82 | ------------------ 83 | * Sub query support with new 'from' clause 84 | * Column aliasing + support for calculated columns 85 | * Error message improvements 86 | 87 | 0.3.0 (2015-12-23) 88 | ------------------ 89 | * Accepts conjunctions and disjunctions with only one clause 90 | * Accept POST queries, good for large queries 91 | * Improved performance for "in" queries, up to 30x faster for large lists 92 | 93 | 0.2.1 (2015-12-15) 94 | ------------------ 95 | * More efficient cache size tracking 96 | * Check against unknown query clauses 97 | 98 | 0.2.0 (2015-12-06) 99 | ------------------ 100 | * Report the unsliced result length as part of the result, nice for pagination for example 101 | * Use connection pooling 102 | * SSL and basic auth support 103 | 104 | 0.1.0 (2015-10-25) 105 | ------------------ 106 | * First release that actually does something sensible. 
107 |
108 | 0.0.1 (2015-10-15)
109 | ------------------
110 | * First release on PyPI.
111 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:2.7.15-slim-jessie
2 |
3 | RUN pip install qcache==0.9.3
4 |
5 | EXPOSE 9401 9402 9403 9404 9405 9406 9407 9408
6 | ENV QCACHE_PORT 9401
7 |
8 | # Start container like this:
9 | # - docker run -p 9401:9401 qcache
10 | # - docker run --env QCACHE_PORT=9402 -p 9402:9402 qcache
11 | CMD [ "sh", "-c", "qcache -p $QCACHE_PORT"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2019 Tobias Gustafsson
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.rst LICENSE
2 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ======
2 | QCache
3 | ======
4 |
5 | .. image:: https://travis-ci.org/tobgu/qcache.png?branch=master
6 |     :target: https://travis-ci.org/tobgu/qcache
7 |
8 | .. image:: https://badge.fury.io/py/qcache.svg
9 |     :target: https://badge.fury.io/py/qcache
10 |
11 | .. image:: http://codecov.io/github/tobgu/qcache/coverage.svg?branch=master
12 |     :target: http://codecov.io/github/tobgu/qcache?branch=master
13 |
14 | .. _Memcached: http://memcached.org/
15 |
16 | QCache is a key-table cache, an in-memory cache server with analytical query capabilities.
17 |
18 | While the more commonly known key-value caches (such as Memcached_) let you fetch a value
19 | based on a key, QCache lets you run queries against a table based on a key.
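
A minimal interaction sketch (the endpoints are described in detail under
"API examples using curl" below; my-key is an arbitrary dataset key):

.. code::

   # Store a CSV table under a key
   curl -X POST --data-binary @my_csv.csv http://localhost:8888/qcache/dataset/my-key

   # Run a query against the table stored under that key
   curl -G http://localhost:8888/qcache/dataset/my-key --data-urlencode 'q={"where": ["<", "foo", 1]}'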
20 |
21 | **********
22 | Motivation
23 | **********
24 | You are working with table data that you want to run flexible queries against but do not want to
25 | load it into an SQL database or similar because of any of the following:
26 |
27 | - The operational cost and complexity of bringing in an SQL server
28 | - The tables do not have a homogeneous format
29 | - The data is short lived
30 | - Not all data available is ever used, you only want to use resources on demand
31 | - You want to treat queries as data and build them dynamically using data structures
32 |   that you are used to (dictionaries and lists or objects and arrays depending on your
33 |   language background)
34 | - Expensive JOINs are required to create the table.
35 | - ...
36 |
37 | Or, you are building server software and want to add the possibility for your clients to run
38 | queries directly against the data without the need for dreadful translations into a REST
39 | interface with some home-grown filter language.
40 |
41 |
42 | .. _QCache-client: https://github.com/tobgu/qcache-client
43 | .. _Go-QCache-client: https://github.com/tobgu/go-qcache-client
44 |
45 | ********
46 | Features
47 | ********
48 | - Simple, single-threaded, single-process server.
49 | - Expressive JSON-based query language with format and features similar to SQL SELECT. Queries
50 |   are data that can easily be transformed or enriched.
51 | - Support for JSON or CSV input and output format
52 | - Performant queries on tables as large as 10 x 1000000 cells out of the box
53 | - No need for table definitions, tables are created dynamically based on the data inserted
54 | - Statistics about hit and miss count, query and insert performance and more available
55 |   through HTTP API
56 | - Scales linearly in query capacity with the number of servers. A Python client library that
57 |   uses consistent hashing for key distribution among servers is available
58 |   here: QCache-client_. There's also a basic Go client here: Go-QCache-client_.
59 |   More clients are welcome!
60 |
61 |
62 | ************
63 | Requirements
64 | ************
65 | Python 2.7 (2.7.9+ if using TLS) for now
66 |
67 |
68 | ************
69 | Installation
70 | ************
71 | .. code::
72 |
73 |    pip install qcache
74 |
75 | *******
76 | Running
77 | *******
78 | .. code::
79 |
80 |    qcache
81 |
82 | This will start qcache on the default port using the default cache size. To get help on available parameters:
83 |
84 | .. code::
85 |
86 |    qcache --help
87 |
88 |
89 | ******
90 | Docker
91 | ******
92 | You can also get the latest version as a Docker image. This is probably the easiest way to try it out if you
93 | are running Linux or if you have Docker Machine installed.
94 |
95 | .. code::
96 |
97 |    docker run -p 9401:9401 tobgu/qcache
98 |
99 |
100 | *******
101 | License
102 | *******
103 | MIT licensed. See the bundled LICENSE file for more details.
104 |
105 | **************
106 | Query examples
107 | **************
108 | Below are examples of the major features of the query language. A JSON object is used to
109 | describe the query. The query should be URL encoded and passed in using the 'q' GET-parameter.
110 |
111 | The query language uses LISP-style prefix notation for simplicity. This makes it easy
112 | to parse and build queries dynamically since no rules for operator precedence
113 | ever need to be applied.
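
Because queries are plain data structures, they can also be assembled and
encoded programmatically. A minimal sketch (Python 2 to match the codebase;
the dataset key my-key is illustrative):

.. code:: python

   import json
   import urllib

   query = {"select": ["foo", "bar"],
            "where": ["<", "foo", 1],
            "limit": 10}
   url = ("http://localhost:8888/qcache/dataset/my-key?q=" +
          urllib.quote(json.dumps(query)))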
114 |
115 | Like so:
116 | `http://localhost:8888/qcache/dataset/<dataset-key>?q=<URL-encoded query>`
117 |
118 | You can also POST queries as JSON against:
119 | `http://localhost:8888/qcache/dataset/<dataset-key>/q`
120 |
121 | This is a good alternative to GET if your queries are too large to fit in the query string.
122 |
123 | Select all
124 | ==========
125 | An empty object will return all rows in the table:
126 |
127 | .. code:: python
128 |
129 |    {}
130 |
131 | Projection
132 | ==========
133 | .. code:: python
134 |
135 |    {"select": ["foo", "bar"]}
136 |
137 | Not specifying select is equivalent to SELECT * in SQL.
138 |
139 | Column aliasing
140 | ---------------
141 | .. code:: python
142 |
143 |    {"select": [["=", "foo", "bar"]]}
144 |
145 | This will rename column bar to foo in the result.
146 |
147 | You can also make more elaborate calculations in the aliasing expression.
148 |
149 | .. code:: python
150 |
151 |    {"select": [["=", "baz", ["+", ["*", "bar", 2], "foo"]]]}
152 |
153 | As well as simple constant assignments.
154 |
155 | .. code:: python
156 |
157 |    {"select": [["=", "baz", 55]]}
158 |
159 |
160 | Filtering
161 | =========
162 |
163 | Comparison
164 | ----------
165 | .. code:: python
166 |
167 |    {"where": ["<", "foo", 1]}
168 |
169 | The following operators are supported:
170 |
171 | .. code::
172 |
173 |    ==, !=, <=, <, >, >=
174 |
175 | In
176 | --
177 | .. code:: python
178 |
179 |    {"where": ["in", "foo", [1, 2]]}
180 |
181 |
182 | Like/ilike
183 | ----------
184 | Like and ilike are used for string matching and work similarly to LIKE in SQL. Like is case sensitive
185 | while ilike is case insensitive. In addition to string matching using % as a wildcard, like/ilike also
186 | support regexps.
187 |
188 | .. code:: python
189 |
190 |    {"where": ["like", "foo", "'%bar%'"]}
191 |
192 |
193 | Bitwise operators
194 | -----------------
195 | There are two operators for bitwise filtering on integers: `all_bits` and `any_bits`.
196 |
197 | * all_bits - evaluates to true if all bits in the supplied argument are set in the value tested against.
198 | * any_bits - evaluates to true if any of the bits in the supplied argument are set in the value tested against.
199 |
200 | .. code:: python
201 |
202 |    {"where": ["any_bits", "foo", 31]}
203 |
204 |
205 | Clauses
206 | -------
207 | .. code:: python
208 |
209 |    {"where": ["&", [">", "foo", 1],
210 |               ["==", "bar", 2]]}
211 |
212 | The following operators are supported:
213 |
214 | .. code::
215 |
216 |    &, |
217 |
218 |
219 | Negation
220 | --------
221 | .. code:: python
222 |
223 |    {"where": ["!", ["==", "foo", 1]]}
224 |
225 |
226 | Ordering
227 | ========
228 |
229 | Ascending
230 |
231 | .. code:: python
232 |
233 |    {"order_by": ["foo"]}
234 |
235 |
236 | Descending
237 |
238 | .. code:: python
239 |
240 |    {"order_by": ["-foo"]}
241 |
242 |
243 | Offset
244 | ======
245 | Great for pagination of long results!
246 |
247 | .. code:: python
248 |
249 |    {"offset": 5}
250 |
251 |
252 | Limit
253 | =====
254 | Great for pagination of long results!
255 |
256 | .. code:: python
257 |
258 |    {"limit": 10}
259 |
260 |
261 | Group by
262 | ========
263 | .. code:: python
264 |
265 |    {"group_by": ["foo"]}
266 |
267 |
268 | Aggregation
269 | ===========
270 | Aggregation is done as part of the select, just like in SQL.
271 |
272 | .. code:: python
273 |
274 |    {"select": ["foo", ["sum", "bar"]],
275 |     "group_by": ["foo"]}
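
Aggregation functions are applied via the underlying Pandas dataframe, so common
functions such as sum, max, min, mean and count are available. Without a group_by
clause the aggregation is applied to the whole table, for example (the column
name is illustrative):

.. code:: python

   {"select": [["max", "bar"]]}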
276 |
277 |
278 | Distinct
279 | ========
280 | Unlike in SQL, distinct has its own query clause.
281 |
282 | .. code:: python
283 |
284 |    {"select": ["foo", "bar"],
285 |     "distinct": ["foo"]}
286 |
287 |
288 | Sub queries using from
289 | ======================
290 | Filter, transform and select your data in multiple steps.
291 |
292 | .. code:: python
293 |
294 |    {"select": [["=", "foo_pct", ["*", 100, ["/", "foo", "bar"]]]],
295 |     "from": {"select": ["foo", ["sum", "bar"]],
296 |              "group_by": ["foo"]}}
297 |
298 |
299 | Sub queries using in
300 | ====================
301 | Filter your data using the result of a query as filter input.
302 |
303 | .. code:: python
304 |
305 |    {"where": ["in", "foo", {"where": ["==", "bar", 10]}]}
306 |
307 |
308 | All together now!
309 | =================
310 | A slightly more elaborate example: get the top 10 foo values with the most bar.
311 |
312 | .. code:: python
313 |
314 |    {"select": ["foo", ["sum", "bar"]],
315 |     "where": [">", "bar", 0],
316 |     "order_by": ["-bar"],
317 |     "group_by": ["foo"],
318 |     "limit": 10}
319 |
320 |
321 | ***********************
322 | API examples using curl
323 | ***********************
324 | Upload table data to the cache (a 404 will be returned if you query a key that does not exist).
325 |
326 | .. code::
327 |
328 |    curl -X POST --data-binary @my_csv.csv http://localhost:8888/qcache/dataset/my-key
329 |
330 |
331 | Query table
332 |
333 | .. code::
334 |
335 |    curl -G localhost:8888/qcache/dataset/my-key --data-urlencode "q={\"select\": [[\"count\"]], \"where\": [\"<\", \"baz\", 99999999999915], \"offset\": 100, \"limit\": 50}"
336 |    curl -G localhost:8888/qcache/dataset/my-key --data-urlencode "q={\"select\": [[\"count\"]], \"where\": [\"in\", \"baz\", [779889,8958854,8281368,6836605,3080972,4072649,7173075,4769116,4766900,4947128,7314959,683531,6395813,7834211,12051932,3735224,12368089,9858334,4424629,4155280]], \"offset\": 0, \"limit\": 50}"
337 |    curl -G localhost:8888/qcache/dataset/my-key --data-urlencode "q={\"where\": [\"==\", \"foo\", \"\\\"95d9f671\\\"\"], \"offset\": 0, \"limit\": 50}"
338 |    curl -G localhost:8888/qcache/dataset/my-key --data-urlencode "q={\"select\": [[\"max\", \"baz\"]], \"offset\": 0, \"limit\": 500000000000}"
339 |
340 |
341 | ***************************
342 | Custom request HTTP headers
343 | ***************************
344 |
345 | There are a couple of custom HTTP headers that can be used to control the behaviour of QCache.
346 |
347 | Posting tables
348 | ==============
349 |
350 | X-QCache-types
351 | --------------
352 | QCache will usually recognize the data types of submitted data automatically. There may be times when
353 | strings are mistaken for numbers because all of the data submitted for a column in a dataset happens
354 | to be numeric.
355 |
356 | This header makes it possible to explicitly type a column as string. In the example below, columns
357 | foo and bar are both typed as string. The recognized type names are string, enum and float.
358 |
359 | .. code::
360 |
361 |    X-QCache-types: foo=string;bar=string
362 |
363 | Explicitly setting the type to string is only relevant when submitting data in CSV. With JSON the data
364 | has an unambiguous (well...) data type that is used by QCache.
365 |
366 | Enums
367 | -----
368 | The `X-QCache-types` header can also be used to specify columns with enum types.
369 |
370 | .. code::
371 |
372 |    X-QCache-types: foo=enum;bar=enum
373 |
374 | Enums are a good way to store low cardinality string columns space efficiently. They can be compared
375 | for equality and inequality but currently do not have a well-defined order, so filtering by
376 | greater than or less than, for example, is not possible.
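
Putting it together, a store request with type hints might look like this (the
dataset key and column names are illustrative):

.. code::

   curl -X POST -H "X-QCache-types: foo=string;bar=enum" --data-binary @my_csv.csv http://localhost:8888/qcache/dataset/my-key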
377 |
378 |
379 | X-QCache-stand-in-columns
380 | -------------------------
381 | It may be that your submitted data varies a little from dataset to dataset with respect to the columns
382 | available in the dataset. You still want to be able to query the datasets in the same way and make
383 | some assumptions about which columns are available. This header lets you do that.
384 |
385 | In the example below, column foo will be set to 10 if it does not exist in the submitted data, and bar will
386 | be set to the value of the baz column if it is not submitted.
387 |
388 | This header can be used in requests both for storing and for querying data.
389 |
390 | .. code::
391 |
392 |    X-QCache-stand-in-columns: foo=10;bar=baz
393 |
394 |
395 | Query responses
396 | ===============
397 |
398 | X-QCache-unsliced-length
399 | ------------------------
400 | This header is added to responses and states how many rows the filtered result contained before applying
401 | any limit or offset for pagination.
402 |
403 | .. code::
404 |
405 |    X-QCache-unsliced-length: 8324
406 |
407 |
408 | *************
409 | More examples
410 | *************
411 | Please look at the tests in the project or QCache-client_ for some further examples of queries.
412 | The unit tests in this project are also a good source of examples.
413 |
414 | If you still have questions don't hesitate to contact the author or open an issue!
415 |
416 | **********
417 | Statistics
418 | **********
419 |
420 | .. code::
421 |
422 |    http://localhost:8888/qcache/statistics
423 |
424 | A GET against the above endpoint will return a JSON object containing cache statistics:
425 | hit & miss counts, query & upload durations, and more. Statistics are reset when queried.
426 |
427 | *************
428 | Data encoding
429 | *************
430 | Just use UTF-8 when uploading data and in queries and you'll be fine. All responses are UTF-8.
431 | No other codecs are supported.
432 |
433 | ****************
434 | Data compression
435 | ****************
436 | QCache supports request and response body compression with LZ4 or GZIP using standard HTTP headers.
437 |
438 | In a query request, set the following header to receive a compressed response:
439 |
440 | .. code::
441 |
442 |    Accept-Encoding: lz4,gzip
443 |
444 |
445 | The response will contain the following header indicating the encoding used:
446 |
447 | .. code::
448 |
449 |    Content-Encoding: lz4
450 |
451 | LZ4 will always be preferred if present.
452 |
453 | The Content-Encoding header should be set in the same way to indicate the compression algorithm if you are
454 | submitting compressed data.
455 |
456 |
457 | **************************
458 | Performance & dimensioning
459 | **************************
460 | Since QCache is single-threaded and single-process, the way to scale capacity is by adding more servers.
461 | If you have 8 Gb of RAM available on a 4-core machine, don't start one server using all 8 Gb. Instead
462 | start 4 servers with 2 Gb memory each, or even 8 servers with 1 Gb each or 16 servers with 512 Mb each,
463 | depending on your use case. Assign them to different ports and use a client library to do the key
464 | balancing between them. That way you will have 4 - 16 times the query capacity.
465 |
466 | QCache is ideal for container deployment. Start one container running one QCache instance.
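
For example, on a 4-core machine with 8 Gb to spare, something along these lines
(flags as listed by `qcache --help`; the port numbers are arbitrary):

.. code::

   qcache --port 9401 --size 2000000000 &
   qcache --port 9402 --size 2000000000 &
   qcache --port 9403 --size 2000000000 &
   qcache --port 9404 --size 2000000000 &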
467 |
468 | Expect a memory overhead of about 20% - 30% of the configured cache size for querying and table loading.
469 | To be on the safe side you should probably assume a 50% overhead. E.g. if you have 3 Gb available, set the
470 | cache size to 2 Gb.
471 |
472 | When choosing between CSV and JSON as upload format, prefer CSV as the amount of data can be large and it's
473 | more compact and faster to insert than JSON.
474 |
475 | For query responses, prefer JSON as the amount of data is often small and it's easier to work with than CSV.
476 |
477 | .. _Pandas: http://pandas.pydata.org/
478 | .. _NumPy: http://www.numpy.org/
479 | .. _Tornado: http://www.tornadoweb.org/en/stable/
480 |
481 | ***********************************
482 | Standing on the shoulders of giants
483 | ***********************************
484 | QCache makes heavy use of the fantastic Python libraries Pandas_, NumPy_ and Tornado_.
485 |
486 |
487 | *********************
488 | Ideas for coming work
489 | *********************
490 | These may or may not be realized; it's far from certain that all of the ideas are good.
491 |
492 | * Improve documentation
493 | * Stream data into the dataframe rather than waiting for complete input, chunked HTTP upload or similar.
494 | * Streaming proxy to allow clients to only know about one endpoint.
495 | * Configurable URL prefix to allow being mounted at an arbitrary position behind a proxy.
496 | * Make it possible to execute multiple queries and return multiple responses in one request (qs=,/qs/).
497 | * Allow post with data and query in one request, this will guarantee progress
498 |   as long as the dataset fits in memory. {"query": ..., "dataset": ...}
499 | * Possibility to specify indexes when uploading data (how do the indexes affect size? write performance? read performance?)
500 | * Possibility to upload files as a way to prime the cache without taking up memory.
501 | * Namespaces for more diverse statistics based on namespace?
502 | * Publish performance numbers
503 | * Other table formats in addition to CSV and JSON?
504 | * Break out all things dataframe into a package of its own and provide the possibility to update
505 |   and insert into a dataframe based on a predicate, just like querying is done now.
506 | * Investigate type hints for Pandas categoricals on enum-like values to improve storage
507 |   layout and filter speed. Check new import options from CSV when Pandas 0.19 is available.
508 | * Support math functions as part of the where clause (see pandas expr.py/ops.py)
509 | * Some kind of lightweight joining? Could create dataset groups that all are allocated to
510 |   the same cache. Sub queries could then be used to query datasets based on data selected
511 |   from other datasets in the same dataset group.
512 |
513 | ************
514 | Contributing
515 | ************
516 | Want to contribute? That's great!
517 |
518 | If you experience problems, please log them on GitHub. If you want to contribute code,
519 | please fork the code and submit a pull request.
520 |
521 | If you intend to implement major features or make major changes please raise an issue
522 | so that we can discuss it first.
523 |
524 | Running tests
525 | =============
526 | .. code::
527 |
528 |    pip install -r dev-requirements.txt
529 |    invoke test
530 |
531 | TLS
532 | ===
533 | Some tests rely on a couple of certs found under `tls/`. If these have expired
534 | they have to be regenerated. This is done by executing `generate_test_certs.sh`
535 | from the `tls` directory.
536 |
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | invoke<=0.12.0
3 | freezegun
4 | pytest-cov
5 | flake8
--------------------------------------------------------------------------------
/qcache/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """QCache
4 |
5 | Usage:
6 |   qcache [-hd] [--port=PORT] [--size=MAX_SIZE] [--age=MAX_AGE] [--statistics-buffer-size=BUFFER_SIZE]
7 |          [--cert-file=PATH_TO_CERT] [--ca-file=PATH_TO_CA] [--basic-auth=<user>:<password>]
8 |
9 | Options:
10 |   -h --help                                 Show this screen
11 |   -p PORT --port=PORT                       Port [default: 8888]
12 |   -s MAX_SIZE --size=MAX_SIZE               Max cache size, bytes [default: 1000000000]
13 |   -a MAX_AGE --age=MAX_AGE                  Max age of cached item, seconds. 0 = never expire. [default: 0]
14 |   -b BUFFER_SIZE --statistics-buffer-size=BUFFER_SIZE  Number of entries to store in statistics
15 |                                             ring buffer. [default: 1000]
16 |   -c PATH_TO_CERT --cert-file=PATH_TO_CERT  Path to PEM file containing private key and certificate for SSL
17 |   -ca PATH_TO_CA --ca-file=PATH_TO_CA       Path to CA file, if provided client certificates will be checked against this ca
18 |   -d --debug                                Run in debug mode
19 |   -ba <user>:<password> --basic-auth=<user>:<password>  Enable basic auth, requires that SSL is enabled.
20 | """
21 |
22 | from docopt import docopt
23 | from qcache.app import run
24 |
25 | __version__ = "0.9.3"
26 | __author__ = "Tobias Gustafsson"
27 | __license__ = "MIT"
28 |
29 |
30 | def main():
31 |     """
32 |     Main entry point for the qcache server.
33 |     """
34 |     args = docopt(__doc__, version=__version__)
35 |
36 |     # Should be possible to solve this without casting to int...
37 |     if '--version' in args:
38 |         print __version__
39 |     else:
40 |         run(port=int(args['--port']),
41 |             max_cache_size=int(args['--size']),
42 |             max_age=int(args['--age']),
43 |             statistics_buffer_size=int(args['--statistics-buffer-size']),
44 |             debug=args['--debug'],
45 |             certfile=args['--cert-file'],
46 |             cafile=args['--ca-file'],
47 |             basic_auth=args['--basic-auth'])
48 |
49 | if __name__ == '__main__':
50 |     main()
--------------------------------------------------------------------------------
/qcache/app.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import json
3 | import re
4 | import ssl
5 | import time
6 | import gc
7 |
8 | from tornado.ioloop import IOLoop
9 | from tornado.web import RequestHandler, Application, url, HTTPError
10 |
11 | from qcache.dataset_cache import DatasetCache
12 | from qcache.compression import CompressedContentEncoding, decoded_body
13 | from qcache.qframe import MalformedQueryException, QFrame
14 | from qcache.statistics import Statistics
15 |
16 |
17 | class ResponseCode(object):
18 |     OK = 200
19 |     CREATED = 201
20 |
21 |     BAD_REQUEST = 400
22 |     UNAUTHORIZED = 401
23 |     NOT_FOUND = 404
24 |     NOT_ACCEPTABLE = 406
25 |     UNSUPPORTED_MEDIA_TYPE = 415
26 |
27 |
28 | CONTENT_TYPE_JSON = 'application/json'
29 | CONTENT_TYPE_CSV = 'text/csv'
30 | ACCEPTED_TYPES = {CONTENT_TYPE_JSON, CONTENT_TYPE_CSV}  # text/*, */*?
31 | CHARSET_REGEX = re.compile('charset=([A-Za-z0-9_-]+)')
32 |
33 | auth_user = None
34 | auth_password = None
35 |
36 |
37 | def auth_enabled():
38 |     return auth_user is not None and auth_password is not None
39 |
40 |
41 | def credentials_correct(provided_user, provided_password):
42 |     return provided_user == auth_user and provided_password == auth_password
43 |
44 |
45 | def http_auth(handler_class):
46 |     """
47 |     Basic auth decorator. Based on the decorator found here:
48 |     https://simplapi.wordpress.com/2014/03/26/python-tornado-and-decorator/
49 |     """
50 |
51 |     def set_401(handler):
52 |         handler.set_status(ResponseCode.UNAUTHORIZED)
53 |         handler.set_header('WWW-Authenticate', 'Basic realm=Restricted')
54 |         handler._transforms = []
55 |         handler.finish()
56 |
57 |     def wrap_execute(handler_execute):
58 |         def is_authenticated(handler):
59 |             if not auth_enabled():
60 |                 return True
61 |
62 |             auth_header = handler.request.headers.get('Authorization')
63 |             if auth_header is None or not auth_header.startswith('Basic '):
64 |                 set_401(handler)
65 |                 return False
66 |
67 |             auth_decoded = base64.decodestring(auth_header[6:])
68 |             user, password = auth_decoded.split(':', 1)  # maxsplit=1: passwords may contain ':'
69 |
70 |             if not credentials_correct(user, password):
71 |                 set_401(handler)
72 |                 return False
73 |
74 |             return True
75 |
76 |         def _execute(self, transforms, *args, **kwargs):
77 |             if not is_authenticated(self):
78 |                 return False
79 |
80 |             return handler_execute(self, transforms, *args, **kwargs)
81 |
82 |         return _execute
83 |
84 |     handler_class._execute = wrap_execute(handler_class._execute)
85 |     return handler_class
86 |
87 |
88 | class UTF8JSONDecoder(json.JSONDecoder):
89 |     def decode(self, json_string):
90 |         obj = super(UTF8JSONDecoder, self).decode(json_string)
91 |         assert isinstance(obj, list), "Must pass a list of objects"
92 |
93 |         for r in obj:
94 |             yield {k: v.encode(encoding='utf-8') if isinstance(v, unicode) else v for k, v in r.items()}
95 |
96 |
97 | class AppState(object):
98 |     def __init__(self):
99 |         self.query_count = 0
100 |
101 |
102 | @http_auth
103 | class DatasetHandler(RequestHandler):
104 |     def initialize(self, dataset_cache, state, stats):
105 |         self.dataset_cache = dataset_cache
106 |         self.state = state
107 |         self.stats = stats
108 |
109 |     def prepare(self):
110 |         self.request_start = time.time()
111 |
112 |     def on_finish(self):
113 |         if hasattr(self, 'operation'):
114 |             self.stats.append('{}_request_durations'.format(self.operation), time.time() - self.request_start)
115 |
116 |     def accept_type(self):
117 |         accept_types = [t.strip() for t in self.request.headers.get('Accept', CONTENT_TYPE_JSON).split(',')]
118 |         for t in accept_types:
119 |             if t in ACCEPTED_TYPES:
120 |                 return t
121 |
122 |         raise HTTPError(ResponseCode.NOT_ACCEPTABLE)
123 |
124 |     def content_type(self):
125 |         header = self.request.headers.get("Content-Type", CONTENT_TYPE_CSV).split(';')
126 |         content_type = header[0]
127 |         if content_type not in ACCEPTED_TYPES:
128 |             raise HTTPError(ResponseCode.UNSUPPORTED_MEDIA_TYPE,
129 |                             "Content-Type '{content_type}' not supported".format(content_type=content_type))
130 |
131 |         if len(header) > 1:
132 |             m = CHARSET_REGEX.match(header[1].strip())
133 |             if m and m.group(1) != 'utf-8':
134 |                 raise HTTPError(ResponseCode.UNSUPPORTED_MEDIA_TYPE,
135 |                                 "charset={charset} not supported, only utf-8".format(charset=m.group(1)))
136 |
137 |         return content_type
138 |
139 |     def header_to_key_values(self, header_name):
140 |         header_value = self.request.headers.get(header_name, None)
141 |         if not header_value:
142 |             return None
143 |
144 |         key_values = []
145 |         for key_value in header_value.split(';'):
146 |             key_values.append(tuple(s.strip() for s in key_value.split('=')))
147 |
148 |         return key_values
149 |
150 |     def dtypes(self):
151 |         types = self.header_to_key_values('X-QCache-types')
152 |         if not types:
153 |             return None
154 |
155 |         dtypes = {}
156 |         for column_name, type_name in types:
157 |             if type_name == 'string':
158 |                 dtypes[column_name] = 'object'
159 |             elif type_name == 'enum':
160 |                 dtypes[column_name] = 'category'
161 |             elif type_name == 'float':
162 |                 dtypes[column_name] = 'float64'
163 |             else:
164 |                 raise HTTPError(ResponseCode.BAD_REQUEST,
165 |                                 'Unrecognized type name "{type_name}" for column "{column_name}"'.format(
166 |                                     type_name=type_name, column_name=column_name))
167 |
168 |         return dtypes
169 |
170 |     def stand_in_columns(self):
171 |         return self.header_to_key_values('X-QCache-stand-in-columns')
172 |
173 |     def query(self, dataset_key, q):
174 |         t0 = time.time()
175 |         self.operation = 'query'
176 |         accept_type = self.accept_type()
177 |         if dataset_key not in self.dataset_cache:
178 |             self.stats.inc('miss_count')
179 |             raise HTTPError(ResponseCode.NOT_FOUND)
180 |
181 |         if self.dataset_cache.evict_if_too_old(dataset_key):
182 |             self.stats.inc('miss_count')
183 |             self.stats.inc('age_evict_count')
184 |             raise HTTPError(ResponseCode.NOT_FOUND)
185 |
186 |         qf = self.dataset_cache[dataset_key]
187 |         try:
188 |             result_frame = qf.query(q, stand_in_columns=self.stand_in_columns())
189 |         except MalformedQueryException as e:
190 |             self.write(json.dumps({'error': str(e)}))
191 |             self.set_status(ResponseCode.BAD_REQUEST)
192 |             return
193 |
194 |         self.set_header("Content-Type", "{content_type}; charset=utf-8".format(content_type=accept_type))
195 |         self.set_header("X-QCache-unsliced-length", result_frame.unsliced_df_len)
196 |         if accept_type == CONTENT_TYPE_CSV:
197 |             self.write(result_frame.to_csv())
198 |         else:
199 |             self.write(result_frame.to_json())
200 |
201 |         self.post_query_processing()
202 |         self.stats.inc('hit_count')
203 |         self.stats.append('query_durations', time.time() - t0)
204 |
205 |     def q_json_to_dict(self, q_json):
206 |         try:
207 |             return json.loads(q_json)
208 |         except ValueError:
209 |             self.write(json.dumps({'error': 'Could not load JSON: {json}'.format(json=q_json)}))
210 |             self.set_status(ResponseCode.BAD_REQUEST)
211 |
212 |         return None
213 |
214 |     def get(self, dataset_key, optional_q):
215 |         if optional_q:
216 |             # There should not be a q URL for the GET method, it's supposed to take
217 |             # q as a query parameter
218 |             raise HTTPError(ResponseCode.NOT_FOUND)
219 |
220 |         q_dict = self.q_json_to_dict(self.get_argument('q', default=''))
221 |         if q_dict is not None:
222 |             self.query(dataset_key, q_dict)
223 |
224 |     def post_query_processing(self):
225 |         if self.state.query_count % 10 == 0:
226 |             # Run a collect every now and then. It reduces the process memory consumption
227 |             # considerably but always doing it will impact query performance negatively.
228 |             gc.collect()
229 |
230 |         self.state.query_count += 1
231 |
232 |     def post(self, dataset_key, optional_q):
233 |         if optional_q:
234 |             q_dict = self.q_json_to_dict(decoded_body(self.request))
235 |             if q_dict is not None:
236 |                 self.query(dataset_key, q_dict)
237 |             return
238 |
239 |         t0 = time.time()
240 |         self.operation = 'store'
241 |         if dataset_key in self.dataset_cache:
242 |             self.stats.inc('replace_count')
243 |             del self.dataset_cache[dataset_key]
244 |
245 |         content_type = self.content_type()
246 |         input_data = decoded_body(self.request)
247 |         if content_type == CONTENT_TYPE_CSV:
248 |             durations_until_eviction = self.dataset_cache.ensure_free(len(input_data))
249 |             qf = QFrame.from_csv(input_data, column_types=self.dtypes(),
250 |                                  stand_in_columns=self.stand_in_columns())
251 |         else:
252 |             # This is a waste of CPU cycles, first the JSON decoder decodes all strings
253 |             # from UTF-8 then we immediately encode them back into UTF-8. Couldn't
254 |             # find an easy solution to this though.
255 |             durations_until_eviction = self.dataset_cache.ensure_free(len(input_data) / 2)
256 |             data = json.loads(input_data, cls=UTF8JSONDecoder)
257 |             qf = QFrame.from_dicts(data, stand_in_columns=self.stand_in_columns())
258 |
259 |         self.dataset_cache[dataset_key] = qf
260 |         self.set_status(ResponseCode.CREATED)
261 |         self.stats.inc('size_evict_count', count=len(durations_until_eviction))
262 |         self.stats.inc('store_count')
263 |         self.stats.append('store_row_counts', len(qf))
264 |         self.stats.append('store_durations', time.time() - t0)
265 |         self.stats.extend('durations_until_eviction', durations_until_eviction)
266 |         self.write("")
267 |
268 |     def delete(self, dataset_key, optional_q):
269 |         if optional_q:
270 |             # There should not be a q parameter for the delete method
271 |             raise HTTPError(ResponseCode.NOT_FOUND)
272 |
273 |         if dataset_key in self.dataset_cache:
274 |             del self.dataset_cache[dataset_key]
275 |
276 |         self.write("")
277 |
278 |
279 | @http_auth
280 | class StatusHandler(RequestHandler):
281 |     def get(self):
282 |         self.write("OK")
283 |
284 |
285 | @http_auth
286 | class StatisticsHandler(RequestHandler):
287 |     def initialize(self, dataset_cache, stats):
288 |         self.dataset_cache = dataset_cache
289 |         self.stats = stats
290 |
291 |     def get(self):
292 |         self.set_header("Content-Type", "application/json; charset=utf-8")
293 |         stats = self.stats.snapshot()
294 |         stats['dataset_count'] = len(self.dataset_cache)
295 |         stats['cache_size'] = self.dataset_cache.size
296 |         self.write(json.dumps(stats))
297 |
298 |
299 | def make_app(url_prefix='/qcache', debug=False, max_cache_size=1000000000, max_age=0,
300 |              statistics_buffer_size=1000, basic_auth=None):
301 |     if basic_auth:
302 |         global auth_user, auth_password
303 |         auth_user, auth_password = basic_auth.split(':', 1)  # maxsplit=1: passwords may contain ':'
304 |
305 |     stats = Statistics(buffer_size=statistics_buffer_size)
306 |     cache = DatasetCache(max_size=max_cache_size, max_age=max_age)
307 |     return Application([
308 |         url(r"{url_prefix}/dataset/([A-Za-z0-9\-_]+)/?(q)?".format(url_prefix=url_prefix),
309 |             DatasetHandler,
310 |             dict(dataset_cache=cache, state=AppState(), stats=stats),
311 |             name="dataset"),
312 |         url(r"{url_prefix}/status".format(url_prefix=url_prefix),
313 |             StatusHandler,
314 |             dict(),
315 |             name="status"),
316 |         url(r"{url_prefix}/statistics".format(url_prefix=url_prefix),
317 |             StatisticsHandler,
318 |             dict(dataset_cache=cache, stats=stats),
319 |             name="statistics")
320 |     ], debug=debug, transforms=[CompressedContentEncoding])
321 |
322 |
323 | def
ssl_options(certfile, cafile=None): 324 | if certfile: 325 | print "Enabling TLS" 326 | ssl_context = ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH, cafile=cafile) 327 | ssl_context.load_cert_chain(certfile) 328 | 329 | if cafile: 330 | print "Enabling client certificate verification" 331 | ssl_context.verify_mode = ssl.CERT_REQUIRED 332 | return dict(ssl_options=ssl_context) 333 | 334 | return {} 335 | 336 | 337 | def run(port=8888, max_cache_size=1000000000, max_age=0, statistics_buffer_size=1000, 338 | debug=False, certfile=None, cafile=None, basic_auth=None): 339 | if basic_auth and not certfile: 340 | print "TLS must be enabled to use basic auth!" 341 | return 342 | 343 | print("Starting on port {port}, max cache size {max_cache_size} bytes, max age {max_age} seconds," 344 | " statistics_buffer_size {statistics_buffer_size}, debug={debug},".format( 345 | port=port, max_cache_size=max_cache_size, max_age=max_age, 346 | statistics_buffer_size=statistics_buffer_size, debug=debug)) 347 | 348 | app = make_app( 349 | debug=debug, max_cache_size=max_cache_size, max_age=max_age, 350 | statistics_buffer_size=statistics_buffer_size, basic_auth=basic_auth) 351 | 352 | args = {} 353 | args.update(ssl_options(certfile=certfile, cafile=cafile)) 354 | app.listen(port, max_buffer_size=max_cache_size, **args) 355 | IOLoop.current().start() 356 | 357 | 358 | if __name__ == "__main__": 359 | run() 360 | -------------------------------------------------------------------------------- /qcache/compression.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | from io import BytesIO 3 | 4 | import lz4.block 5 | from tornado.web import OutputTransform, HTTPError 6 | 7 | 8 | GZIP_LEVEL = 6 9 | 10 | def gzip_dumps(string): 11 | buffer = BytesIO() 12 | file = gzip.GzipFile(mode='w', fileobj=buffer, compresslevel=GZIP_LEVEL) 13 | file.write(string) 14 | file.close() 15 | return buffer.getvalue() 16 | 17 | 18 | def gzip_loads(string): 19 | buffer = BytesIO(string) 20 | file = gzip.GzipFile(mode='r', fileobj=buffer) 21 | return file.read() 22 | 23 | 24 | ENCODINGS = { 25 | 'lz4': (lz4.block.decompress, lz4.block.compress), 26 | 'gzip': (gzip_loads, gzip_dumps), 27 | None: (lambda c: c, lambda c: c) 28 | } 29 | 30 | 31 | def decoded_body(request): 32 | encoding = request.headers.get('Content-Encoding') 33 | if encoding not in ENCODINGS: 34 | raise HTTPError(400, 35 | 'Unrecognized encoding "{encoding}"'.format(encoding=encoding)) 36 | 37 | return ENCODINGS[encoding][0](request.body) 38 | 39 | 40 | class CompressedContentEncoding(OutputTransform): 41 | """Applies compression to response. Prefers lz4 if accepted else uses gzip. 
42 | """ 43 | def __init__(self, request): 44 | accept_coding = request.headers.get("Accept-Encoding", "") 45 | if 'lz4' in accept_coding: 46 | self.encoding = 'lz4' 47 | elif 'gzip' in accept_coding: 48 | self.encoding = 'gzip' 49 | else: 50 | self.encoding = None 51 | 52 | super(CompressedContentEncoding, self).__init__(request) 53 | 54 | def transform_first_chunk(self, status_code, headers, chunk, finishing): 55 | if status_code != 200: 56 | # Only compress responses containing query data 57 | self.encoding = None 58 | 59 | if self.encoding: 60 | if not finishing: 61 | raise Exception("Multi chunk not accepted by QCache when applying compression") 62 | 63 | chunk = ENCODINGS[self.encoding][1](chunk) 64 | headers['Content-Encoding'] = self.encoding 65 | headers['Content-Length'] = str(len(chunk)) 66 | 67 | return status_code, headers, chunk 68 | 69 | def transform_chunk(self, chunk, finishing): 70 | return chunk 71 | -------------------------------------------------------------------------------- /qcache/dataset_cache.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | 4 | class CacheItem(object): 5 | def __init__(self, qframe): 6 | self.creation_time = time() 7 | self.last_access_time = self.creation_time 8 | self._qframe = qframe 9 | self.access_count = 0 10 | 11 | # 100 bytes is just a very rough estimate of the object overhead of this instance 12 | self.size = 100 + qframe.byte_size() 13 | 14 | @property 15 | def dataset(self): 16 | self.last_access_time = time() 17 | self.access_count += 1 18 | return self._qframe 19 | 20 | 21 | class DatasetCache(object): 22 | def __init__(self, max_size, max_age): 23 | self.max_size = max_size 24 | self.max_age = max_age 25 | self._cache_dict = {} 26 | self.size = 0.0 27 | 28 | def has_expired(self, item): 29 | return self.max_age and time() > item.creation_time + self.max_age 30 | 31 | def evict_if_too_old(self, key): 32 | if self.has_expired(self._cache_dict[key]): 33 | del self[key] 34 | return True 35 | 36 | return False 37 | 38 | def __contains__(self, key): 39 | return key in self._cache_dict 40 | 41 | def __getitem__(self, item): 42 | return self._cache_dict[item].dataset 43 | 44 | def __setitem__(self, key, qframe): 45 | current_size = 0.0 46 | if key in self._cache_dict: 47 | current_size = self._cache_dict[key].size 48 | 49 | new_item = CacheItem(qframe) 50 | self.size += new_item.size - current_size 51 | self._cache_dict[key] = new_item 52 | 53 | def __delitem__(self, key): 54 | self.size -= self._cache_dict[key].size 55 | del self._cache_dict[key] 56 | 57 | def __len__(self): 58 | return len(self._cache_dict) 59 | 60 | def ensure_free(self, byte_count): 61 | """ 62 | :return: A list of durations in seconds that the dataset spent in the cache before 63 | being evicted. 
64 | """ 65 | if byte_count > self.max_size: 66 | raise Exception('Impossible to allocate') 67 | 68 | if self.max_size - self.size >= byte_count: 69 | return [] 70 | 71 | # This is not very efficient but good enough for now 72 | lru_datasets = sorted(self._cache_dict.items(), key=lambda item: item[1].last_access_time) 73 | now = time() 74 | durations_until_eviction = [] 75 | for key, _ in lru_datasets: 76 | durations_until_eviction.append(now - self._cache_dict[key].creation_time) 77 | del self[key] 78 | if self.max_size - self.size >= byte_count: 79 | break 80 | 81 | return durations_until_eviction 82 | -------------------------------------------------------------------------------- /qcache/qframe/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from StringIO import StringIO 4 | 5 | import numpy 6 | from pandas import DataFrame, pandas 7 | 8 | from qcache.qframe.common import unquote, MalformedQueryException 9 | from qcache.qframe.context import set_current_qframe 10 | from qcache.qframe.query import query 11 | from qcache.qframe.update import update_frame 12 | 13 | 14 | def _get_dtype(obj): 15 | try: 16 | try: 17 | int(obj) 18 | return numpy.int64 19 | except ValueError: 20 | float(obj) 21 | return numpy.float64 22 | except ValueError: 23 | return numpy.object 24 | 25 | 26 | def _add_stand_in_columns(df, stand_in_columns): 27 | if not stand_in_columns: 28 | return df 29 | 30 | for column_name, stand_in_value in stand_in_columns: 31 | if column_name not in df: 32 | if stand_in_value in df: 33 | df.loc[:, column_name] = df[stand_in_value] 34 | else: 35 | dtype = _get_dtype(stand_in_value) 36 | stand_in_value = unquote(stand_in_value) 37 | arr = numpy.full(len(df), stand_in_value, dtype=dtype) 38 | df.loc[:, column_name] = pandas.Series(arr, index=df.index) 39 | 40 | 41 | class QFrame(object): 42 | """ 43 | Thin wrapper around a Pandas dataframe. 44 | """ 45 | __slots__ = ('df', 'unsliced_df_len') 46 | 47 | def __init__(self, pandas_df, unsliced_df_len=None): 48 | self.unsliced_df_len = len(pandas_df) if unsliced_df_len is None else unsliced_df_len 49 | self.df = pandas_df 50 | 51 | @staticmethod 52 | def from_csv(csv_string, column_types=None, stand_in_columns=None): 53 | df = pandas.read_csv(StringIO(csv_string), dtype=column_types, na_values=[''], keep_default_na=False) 54 | _add_stand_in_columns(df, stand_in_columns) 55 | return QFrame(df) 56 | 57 | @staticmethod 58 | def from_dicts(d, column_types=None, stand_in_columns=None): 59 | df = DataFrame.from_records(d) 60 | 61 | # Setting columns to categorials is slightly awkward from dicts 62 | # than from CSV... 63 | if column_types: 64 | for name, type in column_types.items(): 65 | if type == 'category': 66 | df[name] = df[name].astype("category") 67 | 68 | _add_stand_in_columns(df, stand_in_columns=stand_in_columns) 69 | return QFrame(df) 70 | 71 | def query(self, q, stand_in_columns=None): 72 | _add_stand_in_columns(self.df, stand_in_columns) 73 | set_current_qframe(self) 74 | if 'update' in q: 75 | # In place operation, should it be? 
76 | update_frame(self.df, q) 77 | return None 78 | 79 | new_df, unsliced_df_len = query(self.df, q) 80 | return QFrame(new_df, unsliced_df_len=unsliced_df_len) 81 | 82 | def to_csv(self): 83 | return self.df.to_csv(index=False) 84 | 85 | def to_json(self): 86 | return self.df.to_json(orient='records') 87 | 88 | def to_dicts(self): 89 | return self.df.to_dict(orient='records') 90 | 91 | @property 92 | def columns(self): 93 | return self.df.columns 94 | 95 | def __len__(self): 96 | return len(self.df) 97 | 98 | def byte_size(self): 99 | # Estimate of the number of bytes consumed by this QFrame 100 | return self.df.memory_usage(index=True, deep=True).sum() 101 | -------------------------------------------------------------------------------- /qcache/qframe/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | 4 | class MalformedQueryException(Exception): 5 | pass 6 | 7 | 8 | def raise_malformed(message, q): 9 | raise MalformedQueryException(message + ': {q}'.format(q=q)) 10 | 11 | 12 | def assert_integer(name, i): 13 | if not isinstance(i, (int, long)): 14 | raise_malformed('Invalid type for {name}'.format(name=name), i) 15 | 16 | 17 | def assert_list(name, l): 18 | if not isinstance(l, list): 19 | raise_malformed('Invalid format for {name}'.format(name=name), l) 20 | 21 | 22 | def assert_len(q, expected, error_message="Invalid number of arguments"): 23 | if len(q) != expected: 24 | raise_malformed(error_message, q) 25 | 26 | 27 | def is_quoted(string): 28 | l = len(string) 29 | return (l >= 2) and \ 30 | ((string[0] == "'" and string[-1] == "'") or 31 | (string[0] == '"' and string[-1] == '"')) 32 | 33 | 34 | def unquote(s): 35 | if s.startswith("'") or s.startswith('"'): 36 | s = s[1:] 37 | 38 | if s.endswith("'") or s.endswith('"'): 39 | s = s[:-1] 40 | 41 | return s 42 | -------------------------------------------------------------------------------- /qcache/qframe/constants.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | COMPARISON_OPERATORS = {'==': operator.eq, 4 | '!=': operator.ne, 5 | '<': operator.lt, 6 | '<=': operator.le, 7 | '>': operator.gt, 8 | '>=': operator.ge} -------------------------------------------------------------------------------- /qcache/qframe/context.py: -------------------------------------------------------------------------------- 1 | """ 2 | Context to keep track of the qframe that is currently being operated on. 3 | 4 | NB! Not thread safe and not safe for interleaved operations on multiple frames. 
5 | """ 6 | 7 | _current_qframe = None 8 | 9 | 10 | def set_current_qframe(qframe): 11 | global _current_qframe 12 | _current_qframe = qframe 13 | 14 | 15 | def get_current_qframe(): 16 | return _current_qframe 17 | -------------------------------------------------------------------------------- /qcache/qframe/pandas_filter.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import operator 4 | 5 | import numpy 6 | 7 | from qcache.qframe.common import assert_list, raise_malformed, is_quoted, unquote, assert_len 8 | from qcache.qframe.constants import COMPARISON_OPERATORS 9 | from qcache.qframe.context import get_current_qframe 10 | 11 | JOINING_OPERATORS = {'&': operator.and_, 12 | '|': operator.or_} 13 | 14 | 15 | def _leaf_node(df, q): 16 | if isinstance(q, basestring): 17 | if is_quoted(q): 18 | return q[1:-1].encode('utf-8') 19 | 20 | try: 21 | return df[q] 22 | except KeyError: 23 | raise_malformed("Unknown column", q) 24 | 25 | return q 26 | 27 | 28 | def _bitwise_filter(df, q): 29 | assert_len(q, 3) 30 | op, column, arg = q 31 | if not isinstance(arg, (int, long)): 32 | raise_malformed('Invalid argument type, must be an integer:'.format(t=type(arg)), q) 33 | 34 | try: 35 | series = df[column] & arg 36 | if op == "any_bits": 37 | return series > 0 38 | return series == arg 39 | except TypeError: 40 | raise_malformed("Invalid column type, must be an integer", q) 41 | 42 | 43 | def _not_filter(df, q): 44 | assert_len(q, 2, "! is a single arity operator, invalid number of arguments") 45 | return ~_do_pandas_filter(df, q[1]) 46 | 47 | 48 | def _isnull_filter(df, q): 49 | assert_len(q, 2, "isnull is a single arity operator, invalid number of arguments") 50 | 51 | # Slightly hacky but the only way I've come up with so far. 52 | return df[q[1]] != df[q[1]] 53 | 54 | 55 | def _comparison_filter(df, q): 56 | assert_len(q, 3) 57 | op, col_name, arg = q 58 | return COMPARISON_OPERATORS[op](df[col_name], _do_pandas_filter(df, arg)) 59 | 60 | 61 | def _join_filter(df, q): 62 | result = None 63 | if len(q) < 2: 64 | raise_malformed("Invalid number of arguments", q) 65 | elif len(q) == 2: 66 | # Conjunctions and disjunctions with only one clause are OK 67 | result = _do_pandas_filter(df, q[1]) 68 | else: 69 | result = reduce(lambda l, r: JOINING_OPERATORS[q[0]](l, _do_pandas_filter(df, r)), 70 | q[2:], _do_pandas_filter(df, q[1])) 71 | 72 | return result 73 | 74 | 75 | def prepare_in_clause(q): 76 | """ 77 | The arguments to an in expression may be either a list of values or 78 | a sub query which is then executed to produce a list of values. 
79 | """ 80 | assert_len(q, 3) 81 | _, col_name, args = q 82 | 83 | if isinstance(args, dict): 84 | # Sub query, circular dependency on query by nature so need to keep the import local 85 | from qcache.qframe import query 86 | current_qframe = get_current_qframe() 87 | sub_df, _ = query(current_qframe.df, args) 88 | try: 89 | args = sub_df[col_name].values 90 | except KeyError: 91 | raise_malformed('Unknown column "{}"'.format(col_name), q) 92 | 93 | if not isinstance(args, (list, numpy.ndarray)): 94 | raise_malformed("Second argument must be a list", q) 95 | 96 | return col_name, args 97 | 98 | 99 | def _in_filter(df, q): 100 | col_name, args = prepare_in_clause(q) 101 | return df[col_name].isin(args) 102 | 103 | 104 | def _like_filter(df, q): 105 | assert_len(q, 3) 106 | op, column, raw_expr = q 107 | 108 | if not is_quoted(raw_expr): 109 | raise_malformed("like expects a quoted string as second argument", q) 110 | 111 | regexp = unquote(raw_expr) 112 | 113 | if not regexp.startswith('%'): 114 | regexp = '^' + regexp 115 | else: 116 | regexp = regexp[1:] 117 | 118 | if not regexp.endswith('%'): 119 | regexp += '$' 120 | else: 121 | regexp = regexp[:-1] 122 | 123 | # 'like' is case sensitive, 'ilike' is case insensitive 124 | case = op == 'like' 125 | 126 | try: 127 | return df[column].str.contains(regexp, case=case, na=False) 128 | except AttributeError: 129 | raise_malformed("Invalid column type for (i)like", q) 130 | 131 | 132 | def _do_pandas_filter(df, q): 133 | if not isinstance(q, list): 134 | return _leaf_node(df, q) 135 | 136 | if not q: 137 | raise_malformed("Empty expression not allowed", q) 138 | 139 | result = None 140 | op = q[0] 141 | try: 142 | if op in ('any_bits', 'all_bits'): 143 | result = _bitwise_filter(df, q) 144 | elif op == "!": 145 | result = _not_filter(df, q) 146 | elif op == "isnull": 147 | result = _isnull_filter(df, q) 148 | elif op in COMPARISON_OPERATORS: 149 | result = _comparison_filter(df, q) 150 | elif op in JOINING_OPERATORS: 151 | result = _join_filter(df, q) 152 | elif op == 'in': 153 | result = _in_filter(df, q) 154 | elif op in ('like', 'ilike'): 155 | result = _like_filter(df, q) 156 | else: 157 | raise_malformed("Unknown operator", q) 158 | except KeyError: 159 | raise_malformed("Column is not defined", q) 160 | except TypeError: 161 | raise_malformed("Invalid type in argument", q) 162 | 163 | return result 164 | 165 | 166 | def pandas_filter(df, filter_q): 167 | if filter_q: 168 | assert_list('where', filter_q) 169 | return df[_do_pandas_filter(df, filter_q)] 170 | 171 | return df 172 | -------------------------------------------------------------------------------- /qcache/qframe/query.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | import re 3 | 4 | from pandas import DataFrame 5 | from pandas.core.computation.ops import UndefinedVariableError 6 | from pandas.core.groupby import DataFrameGroupBy 7 | from qcache.qframe.pandas_filter import pandas_filter 8 | from qcache.qframe.common import assert_list, assert_integer, raise_malformed, MalformedQueryException 9 | 10 | 11 | CLAUSE_WHERE = 'where' 12 | CLAUSE_GROUP_BY = 'group_by' 13 | CLAUSE_DISTINCT = 'distinct' 14 | CLAUSE_SELECT = 'select' 15 | CLAUSE_ORDER_BY = 'order_by' 16 | CLAUSE_OFFSET = 'offset' 17 | CLAUSE_LIMIT = 'limit' 18 | CLAUSE_FROM = 'from' 19 | QUERY_CLAUSES = {CLAUSE_WHERE, CLAUSE_GROUP_BY, CLAUSE_DISTINCT, CLAUSE_SELECT, 20 | CLAUSE_ORDER_BY, CLAUSE_OFFSET, CLAUSE_LIMIT, CLAUSE_FROM} 21 | 22 
| 23 | def _group_by(dataframe, group_by_q): 24 | if not group_by_q: 25 | return dataframe 26 | 27 | assert_list('group_by', group_by_q) 28 | 29 | try: 30 | return dataframe.groupby(group_by_q, as_index=False) 31 | except KeyError: 32 | raise_malformed('Group by column not in table', group_by_q) 33 | 34 | 35 | def is_aggregate_function(expr): 36 | return type(expr) is list and len(expr) == 2 37 | 38 | 39 | def is_alias_assignment(expr): 40 | """ 41 | Examples: 42 | ['=', 'column_name', 1] Constant assignment 43 | ['=', 'column_name', 'other_column'] Basic aliasing 44 | ['=', 'column_name', ['sin', 'column_name']] 45 | ['=', 'column_name', ['+', 'column_name', 'other_column']] Complex calculations 46 | """ 47 | return type(expr) is list and len(expr) == 3 and expr[0] == '=' 48 | 49 | 50 | def _aggregate(dataframe_group_by, project_q, aggregate_fns): 51 | if not aggregate_fns: 52 | raise_malformed("Aggregate function required when group_by is specified", project_q) 53 | 54 | try: 55 | return dataframe_group_by.agg(aggregate_fns) 56 | except AttributeError as e: 57 | functions = [fn_name for fn_name in aggregate_fns.values() if fn_name in str(e)] 58 | raise_malformed("Unknown aggregation function '{fn}'".format(fn=functions[0]), project_q) 59 | 60 | 61 | def _aggregate_without_group_by(dataframe, project_q, aggregate_fns): 62 | if len(aggregate_fns) != len(project_q): 63 | raise_malformed('Cannot mix aggregation functions and columns without group_by clause', project_q) 64 | 65 | results = {} 66 | for column_name, fn_name in aggregate_fns.items(): 67 | # Intricate, apply the selected function to the selected column 68 | temp_dataframe = dataframe[[column_name]] 69 | fn = getattr(temp_dataframe, fn_name, None) 70 | if not fn or not callable(fn): 71 | raise_malformed('Unknown aggregation function', project_q) 72 | 73 | results[column_name] = [fn(axis=0)[0]] 74 | 75 | # The result must be a data frame 76 | return DataFrame.from_dict(results) 77 | 78 | ALIAS_STRING = "^([A-Za-z0-9_-]+)$" 79 | ALIAS_RE = re.compile(ALIAS_STRING) 80 | 81 | 82 | def _build_eval_expression(expr): 83 | if type(expr) is list: 84 | if len(expr) == 3: 85 | arg1 = _build_eval_expression(expr[1]) 86 | arg2 = _build_eval_expression(expr[2]) 87 | op = expr[0] 88 | return "({arg1} {op} {arg2})".format(arg1=arg1, op=op, arg2=arg2) 89 | 90 | if len(expr) == 2: 91 | arg1 = _build_eval_expression(expr[1]) 92 | op = expr[0] 93 | return "{op}({arg1})".format(op=op, arg1=arg1) 94 | 95 | raise_malformed('Invalid number of arguments', expr) 96 | 97 | return expr 98 | 99 | 100 | def _alias(dataframe, expressions): 101 | result_frame = dataframe 102 | for expression in expressions: 103 | destination, source = expression[1], expression[2] 104 | if not isinstance(destination, basestring): 105 | raise_malformed('Invalid alias, must be a string', expression) 106 | 107 | if not re.match(ALIAS_RE, destination): 108 | raise_malformed('Invalid alias, must match {alias}'.format(alias=ALIAS_STRING), expression) 109 | 110 | eval_expr = _build_eval_expression(source) 111 | try: 112 | result_frame = result_frame.eval('{destination} = {expr}'.format(destination=destination, expr=eval_expr), inplace=False) 113 | except (SyntaxError, ValueError): 114 | raise_malformed('Unknown function in alias', source) 115 | 116 | return result_frame 117 | 118 | 119 | def classify_expressions(project_q): 120 | aggregate_functions = {} 121 | alias_expressions = [] 122 | for expression in project_q: 123 | if is_aggregate_function(expression): 124 | 
aggregate_functions[expression[1]] = expression[0] 125 | elif is_alias_assignment(expression): 126 | alias_expressions.append(expression) 127 | elif type(expression) is list: 128 | raise_malformed('Invalid expression in select', expression) 129 | 130 | return aggregate_functions, alias_expressions 131 | 132 | 133 | def _project(dataframe, project_q): 134 | if not project_q: 135 | return dataframe 136 | 137 | assert_list('project', project_q) 138 | 139 | if project_q == [['count']]: 140 | # Special case for count only, ~equal to SQL count(*) 141 | return DataFrame.from_dict({'count': [len(dataframe)]}) 142 | 143 | aggregate_fns, alias_expressions = classify_expressions(project_q) 144 | 145 | if aggregate_fns and alias_expressions: 146 | raise_malformed("Cannot mix aliasing and aggregation functions", project_q) 147 | 148 | if isinstance(dataframe, DataFrameGroupBy): 149 | dataframe = _aggregate(dataframe, project_q, aggregate_fns) 150 | elif aggregate_fns: 151 | return _aggregate_without_group_by(dataframe, project_q, aggregate_fns) 152 | elif alias_expressions: 153 | dataframe = _alias(dataframe, alias_expressions) 154 | else: 155 | # Nothing to do here 156 | pass 157 | 158 | columns = [e if type(e) is not list else e[1] for e in project_q] 159 | 160 | try: 161 | return dataframe[columns] 162 | except KeyError: 163 | missing_columns = set(columns) - set(dataframe.columns.values) 164 | raise_malformed("Selected columns not in table", list(missing_columns)) 165 | 166 | 167 | def _order_by(dataframe, order_q): 168 | if not order_q: 169 | return dataframe 170 | 171 | assert_list('order_by', order_q) 172 | if not all(isinstance(c, basestring) for c in order_q): 173 | raise_malformed("Invalid order by format", order_q) 174 | 175 | columns = [e[1:] if e.startswith('-') else e for e in order_q] 176 | ascending = [not e.startswith('-') for e in order_q] 177 | 178 | try: 179 | return dataframe.sort_values(by=columns, ascending=ascending) 180 | except KeyError: 181 | raise_malformed("Order by column not in table", columns) 182 | 183 | 184 | def _do_slice(dataframe, offset, limit): 185 | if offset: 186 | assert_integer('offset', offset) 187 | dataframe = dataframe[offset:] 188 | 189 | if limit: 190 | assert_integer('limit', limit) 191 | dataframe = dataframe[:limit] 192 | 193 | return dataframe 194 | 195 | 196 | def _distinct(dataframe, columns): 197 | if columns is None: 198 | return dataframe 199 | 200 | args = {} 201 | if columns: 202 | args['subset'] = columns 203 | 204 | return dataframe.drop_duplicates(**args) 205 | 206 | 207 | def query(dataframe, q): 208 | if not isinstance(q, dict): 209 | raise MalformedQueryException('Query must be a dictionary, not "{q}"'.format(q=q)) 210 | 211 | key_set = set(q.keys()) 212 | if not key_set.issubset(QUERY_CLAUSES): 213 | raise MalformedQueryException('Unknown query clauses: {keys}'.format( 214 | keys=', '.join(key_set.difference(QUERY_CLAUSES)))) 215 | 216 | try: 217 | if CLAUSE_FROM in q: 218 | dataframe, _ = query(dataframe, q[CLAUSE_FROM]) 219 | 220 | filtered_df = pandas_filter(dataframe, q.get('where')) 221 | grouped_df = _group_by(filtered_df, q.get('group_by')) 222 | distinct_df = _distinct(grouped_df, q.get('distinct')) 223 | projected_df = _project(distinct_df, q.get('select')) 224 | ordered_df = _order_by(projected_df, q.get('order_by')) 225 | sliced_df = _do_slice(ordered_df, q.get('offset'), q.get('limit')) 226 | return sliced_df, len(ordered_df) 227 | except UndefinedVariableError as e: 228 | raise MalformedQueryException(str(e)) 229 | 
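A minimal usage sketch of query() above; the frame contents and column names
are invented and only pandas plus this module are assumed:

    import pandas as pd
    from qcache.qframe.query import query

    df = pd.DataFrame([{'color': 'red', 'size': 10},
                       {'color': 'red', 'size': 20},
                       {'color': 'blue', 'size': 5}])

    # where -> group_by -> select (aggregation) -> order_by -> slice
    result, unsliced_len = query(df, {'where': ['>', 'size', 1],
                                      'group_by': ['color'],
                                      'select': ['color', ['sum', 'size']],
                                      'order_by': ['-size'],
                                      'limit': 1})
    # result holds the single 'red' row (summed size 30); unsliced_len is 2,
    # the row count before the limit was applied, useful for pagination.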
-------------------------------------------------------------------------------- /qcache/qframe/update.py: --------------------------------------------------------------------------------
1 | from qcache.qframe.common import assert_len, raise_malformed, is_quoted, unquote
2 | from qcache.qframe.constants import COMPARISON_OPERATORS
3 | 
4 | 
5 | def _prepare_arg(df, arg):
6 |     if isinstance(arg, basestring):
7 |         if is_quoted(arg):
8 |             return unquote(arg)
9 | 
10 |         return getattr(df, arg)
11 | 
12 |     return arg
13 | 
14 | 
15 | def _build_update_filter(df, update_q):
16 |     if type(update_q) is not list:
17 |         raise_malformed("Expressions must be lists", update_q)
18 | 
19 |     if not update_q:
20 |         raise_malformed("Empty expression not allowed", update_q)
21 | 
22 |     operator = update_q[0]
23 |     if operator == "isnull":
24 |         assert_len(update_q, 2, 'Invalid length of isnull query')
25 |         try:
26 |             return getattr(_prepare_arg(df, update_q[1]), 'isnull')()
27 |         except AttributeError:
28 |             raise_malformed("Unknown column for 'isnull'", update_q)
29 | 
30 |     if operator == "in":
31 |         if len(update_q) != 3:
32 |             raise_malformed("Invalid length of 'in' query", update_q)
33 | 
34 |         _, column, values = update_q
35 |         if column not in df:
36 |             raise_malformed("First argument to 'in' must be a column present in frame", update_q)
37 | 
38 |         if not isinstance(values, (list, tuple)):
39 |             raise_malformed("Second argument to 'in' must be a list", update_q)
40 | 
41 |         return getattr(df, column).isin([_prepare_arg(df, val) for val in values])
42 | 
43 |     if operator in COMPARISON_OPERATORS:
44 |         arg1 = _prepare_arg(df, update_q[1])
45 |         arg2 = _prepare_arg(df, update_q[2])
46 |         return COMPARISON_OPERATORS[operator](arg1, arg2)
47 | 
48 |     raise_malformed("Unknown operator '{operator}'".format(operator=operator), update_q)
49 | 
50 | 
51 | def _build_update_values(df, updates):
52 |     columns, values = zip(*updates)
53 |     return columns, [_prepare_arg(df, val) for val in values]
54 | 
55 | 
56 | def classify_updates(q):
57 |     # Updates can be either simple assignments or self-referring updates (e.g. column += 1).
58 |     # The former can be applied all at once, while pandas only supports updates of one column
59 |     # at a time for the latter. All updates are performed in the order they are declared
60 |     # in the query.
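    # As an illustration, a hypothetical update list (not from the code base)
    #     [['x', 1], ['y', 2], ['+', 'z', 3], ['w', 4]]
    # yields ('simple', [['x', 1], ['y', 2]]), then ('self-referring',
    # ['+', 'z', 3]) and finally ('simple', [['w', 4]]).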
61 | simple_run = [] 62 | for update in q['update']: 63 | if not isinstance(update, (list, tuple)): 64 | raise_malformed("Invalid update clause", update) 65 | 66 | if len(update) == 2: 67 | simple_run.append(update) 68 | else: 69 | if simple_run: 70 | yield ('simple', simple_run) 71 | simple_run = [] 72 | yield ('self-referring', update) 73 | 74 | if simple_run: 75 | yield ('simple', simple_run) 76 | 77 | 78 | def apply_operation(df, update_filter, column, op, value): 79 | # This is repetitive and ugly but the only way I've found to do in place updates 80 | if op == '+': 81 | df.ix[update_filter, column] += value 82 | elif op == '-': 83 | df.ix[update_filter, column] -= value 84 | elif op == '*': 85 | df.ix[update_filter, column] *= value 86 | elif op == '/': 87 | df.ix[update_filter, column] /= value 88 | elif op == '<<': 89 | df.ix[update_filter, column] <<= value 90 | elif op == '>>': 91 | df.ix[update_filter, column] >>= value 92 | elif op == '&': 93 | df.ix[update_filter, column] &= value 94 | elif op == '|': 95 | df.ix[update_filter, column] |= value 96 | elif op == '^': 97 | df.ix[update_filter, column] ^= value 98 | elif op == '%': 99 | df.ix[update_filter, column] %= value 100 | elif op == '**': 101 | df.ix[update_filter, column] **= value 102 | else: 103 | raise_malformed('Invalid update operator', (op, value, column)) 104 | 105 | 106 | def update_frame(df, q): 107 | update_filter = _build_update_filter(df, q['where']) 108 | for update_type, updates in classify_updates(q): 109 | if update_type == 'simple': 110 | columns, values = _build_update_values(df, updates) 111 | df.ix[update_filter, columns] = values 112 | else: 113 | op, column, value = updates 114 | apply_operation(df, update_filter, column, op, value) 115 | -------------------------------------------------------------------------------- /qcache/statistics.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import json 3 | import time 4 | 5 | 6 | class Statistics(object): 7 | def __init__(self, buffer_size): 8 | self.buffer_size = buffer_size 9 | self.reset() 10 | 11 | def inc(self, stat_name, count=1): 12 | if stat_name not in self.stats: 13 | self.stats[stat_name] = 0 14 | 15 | self.stats[stat_name] += count 16 | 17 | def append(self, stat_name, value): 18 | if stat_name not in self.stats: 19 | self.stats[stat_name] = deque(maxlen=self.buffer_size) 20 | 21 | self.stats[stat_name].append(value) 22 | 23 | def extend(self, stat_name, values): 24 | if stat_name not in self.stats: 25 | self.stats[stat_name] = deque(maxlen=self.buffer_size) 26 | 27 | self.stats[stat_name].extend(values) 28 | 29 | def reset(self, timestamp=None): 30 | if timestamp is None: 31 | timestamp = time.time() 32 | self.stats = {'since': timestamp, 33 | 'statistics_buffer_size': self.buffer_size} 34 | 35 | def snapshot(self): 36 | """ 37 | Create a statistics snapshot. This will reset the statistics. 
38 | """ 39 | snapshot = self.stats.copy() 40 | for k, v in snapshot.items(): 41 | if isinstance(v, deque): 42 | snapshot[k] = list(v) 43 | 44 | timestamp = time.time() 45 | snapshot['statistics_duration'] = timestamp - snapshot['since'] 46 | del snapshot['since'] 47 | self.reset() 48 | return snapshot 49 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E122, E126, E127, E266, E241 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from setuptools import setup 4 | 5 | 6 | REQUIRES = [ 7 | 'docopt==0.6.2', 8 | 'numpy==1.13.3', 9 | 'pandas==0.20.3', 10 | 'tornado==5.1.1', 11 | 'lz4==2.1.6' 12 | ] 13 | 14 | 15 | def find_version(fname): 16 | '''Attempts to find the version number in the file names fname. 17 | Raises RuntimeError if not found. 18 | ''' 19 | version = '' 20 | with open(fname, 'r') as fp: 21 | reg = re.compile(r'__version__ = [\'"]([^\'"]*)[\'"]') 22 | for line in fp: 23 | m = reg.match(line) 24 | if m: 25 | version = m.group(1) 26 | break 27 | if not version: 28 | raise RuntimeError('Cannot find version information') 29 | return version 30 | 31 | __version__ = find_version("qcache/__init__.py") 32 | 33 | 34 | def read(fname): 35 | with open(fname) as fp: 36 | content = fp.read() 37 | return content 38 | 39 | setup( 40 | name='qcache', 41 | version=__version__, 42 | description='In memory cache server with analytical query capabilities', 43 | long_description=read("README.rst"), 44 | author='Tobias Gustafsson', 45 | author_email='tobias.l.gustafsson@gmail.com', 46 | url='https://github.com/tobgu/qcache', 47 | install_requires=REQUIRES, 48 | license="MIT", 49 | zip_safe=False, 50 | keywords='qcache', 51 | classifiers=[ 52 | 'Development Status :: 4 - Beta', 53 | 'Intended Audience :: Developers', 54 | 'License :: OSI Approved :: MIT License', 55 | 'Natural Language :: English', 56 | "Programming Language :: Python :: 2", 57 | 'Programming Language :: Python :: 2.7', 58 | 'Programming Language :: Python :: Implementation :: CPython', 59 | ], 60 | packages=["qcache", "qcache.qframe"], 61 | entry_points={ 62 | 'console_scripts': [ 63 | "qcache = qcache:main" 64 | ] 65 | } 66 | ) 67 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from invoke import task, run 5 | from qcache import __version__ as qcache_version 6 | 7 | docs_dir = 'docs' 8 | build_dir = os.path.join(docs_dir, '_build') 9 | 10 | 11 | @task 12 | def test(): 13 | run('python -m pytest -s -v -m "not benchmark"', pty=True) 14 | 15 | 16 | @task 17 | def test_limited(limit_by): 18 | run('python -m pytest -s -v -m "not benchmark" -k{}'.format(limit_by), pty=True) 19 | 20 | 21 | @task 22 | def benchmark(): 23 | run('python -m pytest -s -v -m "benchmark"', pty=True) 24 | 25 | 26 | @task 27 | def coverage(): 28 | run('python -m pytest --cov=qcache', pty=True) 29 | run('coverage report -m', pty=True) 30 | run('coverage html', pty=True) 31 | 32 | 33 | @task 34 | def flake8(): 35 | run("flake8 qcache test") 36 | 37 | 38 | @task 39 | def clean(): 40 | run("rm -rf build") 41 | run("rm -rf dist") 42 | run("rm -rf 
qcache.egg-info") 43 | clean_docs() 44 | print("Cleaned up.") 45 | 46 | 47 | @task 48 | def clean_docs(): 49 | run("rm -rf %s" % build_dir) 50 | 51 | 52 | @task 53 | def browse_docs(): 54 | run("open %s" % os.path.join(build_dir, 'index.html')) 55 | 56 | 57 | @task 58 | def build_docs(clean=False, browse=False): 59 | if clean: 60 | clean_docs() 61 | run("sphinx-build %s %s" % (docs_dir, build_dir), pty=True) 62 | if browse: 63 | browse_docs() 64 | 65 | 66 | @task 67 | def readme(browse=False): 68 | run('rst2html.py README.rst > README.html') 69 | 70 | 71 | @task 72 | def publish(test=False): 73 | """Publish to the cheeseshop.""" 74 | if test: 75 | run('python setup.py register -r pypitest sdist upload -r pypitest') 76 | else: 77 | run("python setup.py register sdist upload") 78 | 79 | 80 | @task 81 | def install(): 82 | run('python setup.py sdist install') 83 | 84 | 85 | @task 86 | def build_image(): 87 | run("sudo docker build -t tobgu/qcache:{version} .".format(version=qcache_version)) 88 | run("sudo docker tag tobgu/qcache:{version} tobgu/qcache:latest".format(version=qcache_version)) 89 | 90 | 91 | @task 92 | def push_image(): 93 | run("sudo docker push tobgu/qcache:{version}".format(version=qcache_version)) 94 | run("sudo docker push tobgu/qcache:latest") 95 | 96 | 97 | @task 98 | def tag(): 99 | run("git tag -fa v{version} -m 'v{version}'".format(version=qcache_version)) 100 | -------------------------------------------------------------------------------- /test/performance_run.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from time import time 3 | 4 | if __name__ == '__main__': 5 | i = 0 6 | results = [] 7 | t0 = time() 8 | while i <= 1000: 9 | requests.get('http://localhost:8088/status') 10 | t1 = time() 11 | results.append(t1 - t0) 12 | t0 = t1 13 | i += 1 14 | 15 | results.sort() 16 | print "Median: %s, 90perc: %s, 99perc: %s" % (results[500], results[900], results[990]) 17 | -------------------------------------------------------------------------------- /test/test_api.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import json 3 | import os 4 | 5 | import lz4 as lz4 6 | import ssl 7 | 8 | from tornado.httputil import url_concat 9 | from tornado.testing import AsyncHTTPTestCase 10 | from freezegun import freeze_time 11 | 12 | import qcache 13 | import qcache.app as app 14 | import csv 15 | from StringIO import StringIO 16 | 17 | 18 | def to_json(data): 19 | return json.dumps(data) 20 | 21 | 22 | def to_csv(data): 23 | if not data: 24 | return "" 25 | 26 | out = StringIO() 27 | writer = csv.DictWriter(out, data[0].keys()) 28 | writer.writeheader() 29 | 30 | for entry in data: 31 | writer.writerow(entry) 32 | 33 | return out.getvalue() 34 | 35 | 36 | def from_csv(text): 37 | input_data = StringIO(text) 38 | return list(csv.DictReader(input_data)) 39 | 40 | 41 | class SharedTest(AsyncHTTPTestCase): 42 | def get_app(self): 43 | return app.make_app(url_prefix='', debug=True) 44 | 45 | def post_json(self, url, data, extra_headers=None): 46 | if not isinstance(data, basestring): 47 | body = to_json(data) 48 | else: 49 | # Data already prepared by calling function 50 | body = data 51 | 52 | headers = {'Content-Type': 'application/json'} 53 | 54 | if extra_headers: 55 | headers.update(extra_headers) 56 | 57 | return self.fetch(url, method='POST', body=body, headers=headers, use_gzip=False) 58 | 59 | def query_json(self, url, query, extra_headers=None): 60 | url = 
url_concat(url, {'q': json.dumps(query)}) 61 | headers = {'Accept': 'application/json, text/csv'} 62 | if extra_headers: 63 | headers.update(extra_headers) 64 | return self.fetch(url, headers=headers, use_gzip=False) 65 | 66 | def post_csv(self, url, data, types=None, extra_headers=None): 67 | headers = {'Content-Type': 'text/csv'} 68 | if types: 69 | headers['X-QCache-types'] = '; '.join('{column_name}={type_name}'.format(column_name=c, type_name=t) 70 | for c, t in types.items()) 71 | if extra_headers: 72 | headers.update(extra_headers) 73 | 74 | body = to_csv(data) 75 | return self.fetch(url, method='POST', body=body, headers=headers, use_gzip=False) 76 | 77 | def query_csv(self, url, query): 78 | url = url_concat(url, {'q': json.dumps(query)}) 79 | return self.fetch(url, headers={'Accept': 'text/csv, application/json'}, use_gzip=False) 80 | 81 | def get_statistics(self): 82 | response = self.fetch('/statistics', use_gzip=False) 83 | assert response.code == 200 84 | return json.loads(response.body) 85 | 86 | 87 | class TestBaseCases(SharedTest): 88 | def test_404_when_item_is_missing(self): 89 | url = url_concat('/dataset/abc', {'q': json.dumps('{}')}) 90 | response = self.fetch(url) 91 | assert response.code == 404 92 | 93 | def test_upload_json_query_json(self): 94 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 95 | assert response.code == 201 96 | 97 | response = self.query_json('/dataset/abc', {'where': ['==', 'foo', 1]}) 98 | assert response.code == 200 99 | assert json.loads(response.body) == [{'foo': 1, 'bar': 10}] 100 | 101 | def test_upload_csv_query_csv(self): 102 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}, {'baz': 2, 'bar': 20}]) 103 | assert response.code == 201 104 | 105 | response = self.query_csv('/dataset/cba', {'where': ['==', 'baz', 1]}) 106 | assert response.code == 200 107 | assert from_csv(response.body) == [{'baz': '1', 'bar': '10'}] # NB: Strings for numbers here 108 | 109 | def test_division_by_zero(self): 110 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 0}]) 111 | assert response.code == 201 112 | 113 | # Result of division by 0 will be transmitted as null/None 114 | response = self.query_json('/dataset/abc', {'select': [['=', 'baz', ['/', 'foo', 'bar']]]}) 115 | assert response.code == 200 116 | assert json.loads(response.body) == [{'baz': None}] 117 | 118 | 119 | class TestQueryWithPost(SharedTest): 120 | def post_query_json(self, url, query): 121 | return self.fetch(url, headers={'Accept': 'application/json, text/csv', 'Content-Type': 'application/json'}, 122 | method="POST", body=to_json(query)) 123 | 124 | def test_upload_json_post_query_json(self): 125 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 126 | assert response.code == 201 127 | 128 | response = self.post_query_json('/dataset/abc/q', {'where': ['==', 'foo', 1]}) 129 | assert response.code == 200 130 | assert json.loads(response.body) == [{'foo': 1, 'bar': 10}] 131 | 132 | def test_upload_json_post_query_json_malformed_query(self): 133 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 134 | assert response.code == 201 135 | 136 | response = self.post_query_json('/dataset/abc/q', {'blabb': ['==', 'foo', 1]}) 137 | assert response.code == 400 138 | 139 | def test_delete_against_q_endpoint_is_404(self): 140 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 141 | assert response.code == 201 
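        # /dataset/abc/q only accepts POSTed queries; DELETE has to target
        # the dataset resource itself, as the assertions below verify.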
142 | 143 | response = self.fetch('/dataset/abc/q', method='DELETE') 144 | assert response.code == 404 145 | 146 | response = self.fetch('/dataset/abc', method='DELETE') 147 | assert response.code == 200 148 | 149 | def test_get_against_q_endpoint_is_404(self): 150 | response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 151 | assert response.code == 201 152 | 153 | response = self.query_json('/dataset/abc/q', query={}) 154 | assert response.code == 404 155 | 156 | response = self.query_json('/dataset/abc', query={}) 157 | assert response.code == 200 158 | 159 | 160 | class TestSlicing(SharedTest): 161 | def test_unsliced_size_header_indicates_the_dataset_size_before_slicing_it(self): 162 | # This helps out in pagination of data 163 | self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}, {'baz': 2, 'bar': 20}]) 164 | 165 | # Fetch all data, the header value should be the same as the length of the response 166 | response = self.query_json('/dataset/cba', {}) 167 | assert response.code == 200 168 | assert len(json.loads(response.body)) == 2 169 | assert response.headers['X-QCache-unsliced-length'] == '2' 170 | 171 | response = self.query_json('/dataset/cba', {"offset": 1}) 172 | assert response.code == 200 173 | assert len(json.loads(response.body)) == 1 174 | assert response.headers['X-QCache-unsliced-length'] == '2' 175 | 176 | 177 | class TestCharacterEncoding(SharedTest): 178 | def test_upload_json_query_json_unicode_characters(self): 179 | response = self.post_json('/dataset/abc', [{'foo': u'Iñtërnâtiônàližætiøn'}, {'foo': 'qux'}]) 180 | assert response.code == 201 181 | 182 | response = self.query_json('/dataset/abc', {'where': ['==', 'foo', u'"Iñtërnâtiônàližætiøn"']}) 183 | 184 | assert response.code == 200 185 | response_data = json.loads(response.body) 186 | assert response_data == [{'foo': u'Iñtërnâtiônàližætiøn'}] 187 | assert type(response_data[0]['foo']) is unicode 188 | 189 | def test_upload_csv_query_csv_unicode_characters_encoded_as_utf8(self): 190 | response = self.post_csv('/dataset/abc', [{'foo': u'Iñtërnâtiônàližætiønåäö'.encode('utf-8')}, {'foo': 'qux'}]) 191 | assert response.code == 201 192 | 193 | response = self.query_csv('/dataset/abc', {'where': ['==', 'foo', u'"Iñtërnâtiônàližætiønåäö"']}) 194 | assert response.code == 200 195 | assert from_csv(response.body) == [{'foo': u'Iñtërnâtiônàližætiønåäö'.encode('utf-8')}] 196 | 197 | def test_upload_csv_query_json_unicode_characters_encoded_as_utf8(self): 198 | response = self.post_csv('/dataset/abc', [{'foo': u'Iñtërnâtiônàližætiønåäö'.encode('utf-8')}, {'foo': 'qux'}]) 199 | assert response.code == 201 200 | 201 | response = self.query_json('/dataset/abc', {'where': ['==', 'foo', u'"Iñtërnâtiônàližætiønåäö"']}) 202 | 203 | assert response.code == 200 204 | response_data = json.loads(response.body) 205 | assert json.loads(response.body) == [{'foo': u'Iñtërnâtiônàližætiønåäö'}] 206 | assert type(response_data[0]['foo']) is unicode 207 | 208 | def test_upload_invalid_content_type(self): 209 | response = self.fetch('/dataset/abc', method='POST', body='', headers={'Content-Type': 'text/html'}) 210 | assert response.code == 415 211 | 212 | def test_upload_invalid_charset(self): 213 | response = self.fetch('/dataset/abc', method='POST', body='', 214 | headers={'Content-Type': 'text/csv; charset=iso-123'}) 215 | assert response.code == 415 216 | 217 | 218 | class TestInvalidQueries(SharedTest): 219 | def setUp(self): 220 | super(TestInvalidQueries, self).setUp() 221 | response = 
self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}]) 222 | assert response.code == 201 223 | 224 | def do_invalid(self, q): 225 | response = self.query_json('/dataset/abc', q) 226 | assert response.code == 400 227 | return response 228 | 229 | def test_list_instead_of_dict(self): 230 | self.do_invalid(['where', ['==', 'foo', 1]]) 231 | 232 | def test_json_not_possible_to_parse(self): 233 | response = self.fetch(url_concat('/dataset/abc', {'q': 'foo'})) 234 | assert response.code == 400 235 | 236 | def test_invalid_filter_format(self): 237 | response = self.do_invalid({'where': ['==', 'foo', 1, 2]}) 238 | assert 'Invalid number of arguments' in json.loads(response.body)['error'] 239 | 240 | def test_unknown_filter_operator(self): 241 | response = self.query_json('/dataset/abc', {'where': ['<>', 'foo', 1]}) 242 | assert 'Unknown operator' in json.loads(response.body)['error'] 243 | 244 | def test_unknown_select_operator(self): 245 | response = self.query_json('/dataset/abc', {'select': [['baz', 'foo']]}) 246 | assert 'Unknown aggregation function' in json.loads(response.body)['error'] 247 | 248 | def test_missing_column_in_select(self): 249 | response = self.query_json('/dataset/abc', {'select': ['baz', 'foo']}) 250 | assert 'Selected columns not in table' in json.loads(response.body)['error'] 251 | 252 | def test_missing_column_in_filter(self): 253 | response = self.query_json('/dataset/abc', {'where': ['>', 'baz', 1]}) 254 | assert 'is not defined' in json.loads(response.body)['error'] 255 | 256 | def test_missing_column_in_group_by(self): 257 | response = self.query_json('/dataset/abc', {'group_by': ['baz']}) 258 | assert 'Group by column not in table' in json.loads(response.body)['error'] 259 | 260 | def test_missing_column_in_order_by(self): 261 | response = self.query_json('/dataset/abc', {'order_by': ['baz']}) 262 | assert 'Order by column not in table' in json.loads(response.body)['error'] 263 | 264 | def test_malformed_order_by(self): 265 | response = self.query_json('/dataset/abc', {'order_by': [['baz']]}) 266 | assert 'Invalid order by format' in json.loads(response.body)['error'] 267 | 268 | def test_wrong_type_for_offset(self): 269 | response = self.query_json('/dataset/abc', {'offset': 4.3}) 270 | assert 'Invalid type' in json.loads(response.body)['error'] 271 | 272 | def test_group_by_not_list(self): 273 | response = self.query_json('/dataset/abc', {'group_by': {'foo': 4.3}}) 274 | assert 'Invalid format' in json.loads(response.body)['error'] 275 | 276 | 277 | # Error cases: 278 | # - Malformed query 279 | # * Still some edge cases left in projection and filter. 280 | # - Malformed input data 281 | # * No data sent in => error 282 | # * Wrong format specified 283 | # * Accepted format specified but cannot be encoded 284 | # * Non-uniform JSON and CSV 285 | # - Non fitting data 286 | # * The data is too large to be fitted into memory of the current instance. 
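# For orientation before the bitwise tests below: 'all_bits' keeps rows in
# which every bit of the mask is set in the column value, 'any_bits' keeps
# rows in which at least one mask bit is set. A hedged sketch (dataset name
# and values invented):
#
#     self.post_json('/dataset/bits', [{'foo': v} for v in range(1, 6)])
#     response = self.query_json('/dataset/bits', {'where': ['any_bits', 'foo', 3]})
#     # survivors: foo in (1, 2, 3, 5); 4 == 0b100 shares no bit with 0b011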
287 | 
288 | 
289 | class TestBitwiseQueries(SharedTest):
290 |     def test_bitwise_query_succeeds(self):
291 |         response = self.post_json('/dataset/abc', [{'foo': 1, 'bar': 10}, {'foo': 2, 'bar': 20}])
292 |         assert response.code == 201
293 | 
294 |         response = self.query_json('/dataset/abc', {'where': ['all_bits', 'foo', 1]})
295 |         assert response.code == 200
296 | 
297 | 
298 | class TestCacheEvictionOnSize(SharedTest):
299 |     def get_app(self):
300 |         # A max cache size of 315 is trimmed to just fit the smaller datasets in the test cases below
301 |         just_enough_to_fit_smaller_values = 315
302 |         return app.make_app(url_prefix='', max_cache_size=just_enough_to_fit_smaller_values, debug=True)
303 | 
304 |     def test_evicts_entry_when_too_much_space_occupied(self):
305 |         data = [{'some_longish_key': 'some_fairly_longish_value_that_needs_to_be_stuffed_in'},
306 |                 {'some_longish_key': 'another_fairly_longish_value_that_also_should_be_fitted'}]
307 | 
308 |         # Post data and ensure it is available
309 |         response = self.post_json('/dataset/abc', data)
310 |         assert response.code == 201
311 |         assert self.query_json('/dataset/abc', {}).code == 200
312 | 
313 |         response = self.post_json('/dataset/cba', data)
314 |         assert response.code == 201
315 | 
316 |         # The old dataset has been evicted, the new one has taken its place
317 |         assert self.query_json('/dataset/abc', {}).code == 404
318 |         assert self.query_json('/dataset/cba', {}).code == 200
319 | 
320 |         # Check statistics
321 |         stats = self.get_statistics()
322 | 
323 |         assert stats['dataset_count'] == 1
324 |         assert stats['cache_size'] == 370
325 |         assert stats['hit_count'] == 2
326 |         assert stats['miss_count'] == 1
327 |         assert stats['size_evict_count'] == 1
328 |         assert stats['store_count'] == 2
329 |         assert stats['statistics_duration'] > 0.0
330 |         assert stats['statistics_buffer_size'] == 1000
331 |         assert len(stats['store_durations']) == 2
332 |         assert len(stats['store_row_counts']) == 2
333 |         assert sum(stats['store_row_counts']) == 4
334 |         assert len(stats['query_durations']) == 2
335 |         assert len(stats['durations_until_eviction']) == 1
336 |         assert stats['durations_until_eviction'][0] > 0.0
337 | 
338 |         # Check stats again, this time they should have been cleared
339 |         assert set(self.get_statistics().keys()) == \
340 |             {'dataset_count', 'cache_size', 'statistics_duration', 'statistics_buffer_size'}
341 | 
342 |     def test_can_insert_more_entries_with_smaller_values(self):
343 |         data = [{'some_longish_key': 'short'},
344 |                 {'some_longish_key': 'another_short'}]
345 | 
346 |         self.post_json('/dataset/abc', data)
347 |         self.post_json('/dataset/cba', data)
348 | 
349 |         # Both datasets co-exist in the cache
350 |         assert self.query_json('/dataset/abc', {}).code == 200
351 |         assert self.query_json('/dataset/cba', {}).code == 200
352 | 
353 |     def test_query_stand_in_columns_do_not_interfere_with_cache_eviction(self):
354 |         # Executing a query with stand-in columns can increase the dataset
355 |         # size after insert. This should not lead to faulty bookkeeping of
356 |         # the current cache size, where the measured cache size gets smaller
57 |         # and smaller and causes the actual cache size to grow. See #15.
358 |         data = [{'some_longish_key': 'some_fairly_longish_value_that_needs_to_be_stuffed_in'},
359 |                 {'some_longish_key': 'another_fairly_longish_value_that_also_should_be_fitted'}]
360 | 
361 |         repetitions = 10
362 |         for i in range(repetitions):
363 |             response = self.post_json('/dataset/{i}'.format(i=i), data)
364 |             assert response.code == 201
365 |             assert self.query_json('/dataset/{i}'.format(i=i),
366 |                                    {},
367 |                                    extra_headers={'X-QCache-stand-in-columns': 'foo="bar_baz_qux"'}).code == 200
368 | 
369 |         stats = self.get_statistics()
370 |         assert stats['dataset_count'] == 1
371 |         assert stats['size_evict_count'] == repetitions - 1
372 |         assert stats['cache_size'] == 370
373 | 
374 | 
375 | class TestCacheEvictionOnAge(SharedTest):
376 |     def get_app(self):
377 |         # A max age of 5 seconds is used for the test cases below
378 |         return app.make_app(url_prefix='', max_age=5, debug=True)
379 | 
380 |     def test_evicts_dataset_when_data_too_old(self):
381 |         with freeze_time('2015-10-22 00:00:00'):
382 |             data = [{'some_longish_key': 'short'}]
383 |             self.post_json('/dataset/abc', data)
384 | 
385 |         with freeze_time('2015-10-22 00:00:04'):
386 |             assert self.query_json('/dataset/abc', {}).code == 200
387 | 
388 |         with freeze_time('2015-10-22 00:00:06'):
389 |             assert self.query_json('/dataset/abc', {}).code == 404
390 | 
391 | 
392 | class TestStatusEndpoint(SharedTest):
393 |     def test_status_endpoint_returns_200_ok(self):
394 |         response = self.fetch('/status')
395 | 
396 |         assert response.code == 200
397 |         assert response.body == "OK"
398 | 
399 | 
400 | class TestDatasetDelete(SharedTest):
401 |     def test_post_data_then_delete(self):
402 |         data = [{'some_key': '123456'}]
403 |         self.post_json('/dataset/abc', data)
404 | 
405 |         assert self.query_json('/dataset/abc', {}).code == 200
406 |         assert self.fetch('/dataset/abc', method='DELETE').code == 200
407 |         assert self.query_json('/dataset/abc', {}).code == 404
408 | 
409 | 
410 | class TestColumnTyping(SharedTest):
411 |     def get(self, q, response_code=200):
412 |         response = self.query_json('/dataset/abc', q)
413 |         assert response.code == response_code
414 |         return json.loads(response.body)
415 | 
416 |     def test_type_hint_string_on_column_with_only_integers(self):
417 |         data = [
418 |             {'some_key': '123456', 'another_key': 1111},
419 |             {'some_key': 'abcdef', 'another_key': 2222}]
420 | 
421 |         self.post_csv('/dataset/abc', data, types={'another_key': 'string'})
422 | 
423 |         assert self.get({'where': ['==', 'another_key', '"2222"']}) == \
424 |             [{'some_key': 'abcdef', 'another_key': '2222'}]
425 | 
426 |         # No matching item when querying by integer
427 |         assert not self.get({'where': ['==', 'another_key', 2222]})
428 | 
429 |     def test_type_hinting_with_invalid_type_results_in_bad_request(self):
430 |         # It's currently only possible to type hint strings, enums and floats.
431 |         # Is there ever a need for other type hints?
432 | 433 | data = [{'some_key': '123456', 'another_key': 1111}] 434 | response = self.post_csv('/dataset/abc', data, types={'another_key': 'int'}) 435 | assert response.code == 400 436 | 437 | def test_type_hinting_with_enum(self): 438 | data = [{'some_key': 'aaa'}] 439 | response = self.post_csv('/dataset/abc', data, types={'some_key': 'enum'}) 440 | assert response.code == 201 441 | 442 | assert self.get({'where': ['==', 'some_key', '"aaa"']}) == [ 443 | {'some_key': 'aaa'} 444 | ] 445 | 446 | def test_type_int_to_string(self): 447 | def get(q, response_code=200): 448 | response = self.query_json('/dataset/abc', q) 449 | assert response.code == response_code 450 | return json.loads(response.body) 451 | 452 | data = [ 453 | {'some_key': '123456', 'another_key': 1111}, 454 | {'some_key': 'abcdef', 'another_key': 2222}] 455 | 456 | self.post_csv('/dataset/abc', data) 457 | 458 | # Querying on integer field 459 | assert get({'where': ['==', 'another_key', 2222]}) == \ 460 | [{'some_key': 'abcdef', 'another_key': 2222}] 461 | 462 | get({'where': ['==', 'another_key', '2222']}, response_code=400) 463 | get({'where': ['==', 'another_key', '"2222"']}, response_code=400) 464 | 465 | # Querying on string field 466 | assert not get({'where': ['==', 'some_key', 123456]}) 467 | get({'where': ['==', 'some_key', '123456']}, response_code=400) 468 | 469 | # Matching string 470 | assert get({'where': ['==', 'some_key', '"123456"']}) == \ 471 | [{'some_key': '123456', 'another_key': 1111}] 472 | 473 | # Here abcdef is interpreted as another column. Since column abcdef 474 | # doesn't exist a 400, Bad request will be returned. 475 | get({'where': ['==', 'some_key', 'abcdef']}, response_code=400) 476 | 477 | def test_type_hinting_with_float(self): 478 | data = [{'some_key': 12}] 479 | response = self.post_csv('/dataset/abc', data, types={'some_key': 'float'}) 480 | assert response.code == 201 481 | 482 | result = self.get({'where': ['==', 'some_key', 12.0]}) 483 | assert result == [{'some_key': 12.0}] 484 | assert type(result[0]['some_key']) == float 485 | 486 | 487 | class TestStandInColumns(SharedTest): 488 | def test_stand_in_column_with_numeric_value(self): 489 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}], 490 | extra_headers={'X-QCache-stand-in-columns': 'foo=13'}) 491 | assert response.code == 201 492 | 493 | response = self.query_json('/dataset/cba', {'where': ['==', 'foo', 13]}) 494 | assert response.code == 200 495 | result = json.loads(response.body) 496 | assert result == [{'baz': 1, 'bar': 10, 'foo': 13}] 497 | assert type(result[0]['foo']) == int 498 | 499 | response = self.query_json('/dataset/cba', {'where': ['==', 'foo', 14]}) 500 | assert response.code == 200 501 | assert json.loads(response.body) == [] 502 | 503 | def test_stand_in_column_with_string_value(self): 504 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}], 505 | extra_headers={'X-QCache-stand-in-columns': 'foo="13"'}) 506 | assert response.code == 201 507 | 508 | response = self.query_json('/dataset/cba', {'where': ['==', 'foo', '"13"']}) 509 | assert response.code == 200 510 | assert json.loads(response.body) == [{'baz': 1, 'bar': 10, 'foo': "13"}] 511 | 512 | def test_stand_in_column_with_other_column(self): 513 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}, {'baz': 2, 'bar': 20}], 514 | extra_headers={'X-QCache-stand-in-columns': 'foo=bar'}) 515 | assert response.code == 201 516 | 517 | response = self.query_json('/dataset/cba', {'where': ['==', 'foo', 20]}) 518 | 
assert response.code == 200 519 | assert json.loads(response.body) == [{'baz': 2, 'bar': 20, 'foo': 20}] 520 | 521 | def test_multiple_stand_in_columns(self): 522 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}], 523 | extra_headers={'X-QCache-stand-in-columns': 'foo=bar; qux=13'}) 524 | assert response.code == 201 525 | 526 | response = self.query_json('/dataset/cba', {}) 527 | assert response.code == 200 528 | assert json.loads(response.body) == [{'baz': 1, 'bar': 10, 'foo': 10, 'qux': 13}] 529 | 530 | def test_chained_stand_in_columns(self): 531 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}], 532 | extra_headers={'X-QCache-stand-in-columns': 'foo=13; qux=foo'}) 533 | assert response.code == 201 534 | 535 | response = self.query_json('/dataset/cba', {}) 536 | assert response.code == 200 537 | assert json.loads(response.body) == [{'baz': 1, 'bar': 10, 'foo': 13, 'qux': 13}] 538 | 539 | def test_json_stand_in_columns(self): 540 | response = self.post_json('/dataset/cba', [{'baz': 1, 'bar': 10}], 541 | extra_headers={'X-QCache-stand-in-columns': 'foo=13'}) 542 | assert response.code == 201 543 | 544 | response = self.query_json('/dataset/cba', {}) 545 | assert json.loads(response.body) == [{'baz': 1, 'bar': 10, 'foo': 13}] 546 | 547 | def test_stand_in_column_not_applied_when_column_exists_in_submitted_data(self): 548 | response = self.post_csv('/dataset/cba', [{'baz': 1, 'bar': 10}], 549 | extra_headers={'X-QCache-stand-in-columns': 'bar=13'}) 550 | assert response.code == 201 551 | 552 | response = self.query_json('/dataset/cba', {}) 553 | assert json.loads(response.body) == [{'baz': 1, 'bar': 10}] 554 | 555 | def test_stand_in_columns_in_query(self): 556 | response = self.post_csv('/dataset/cba', [{'foo': 1}]) 557 | assert response.code == 201 558 | 559 | response = self.query_json('/dataset/cba', {}, extra_headers={'X-QCache-stand-in-columns': 'bar=13;baz=foo'}) 560 | assert json.loads(response.body) == [{'foo': 1, 'bar': 13, 'baz': 1}] 561 | 562 | 563 | class TestCompression(SharedTest): 564 | def call_api_with_compression(self, accept_encoding, content_encoding, decoding_fn, encoding_fn, expected_encoding): 565 | input_data = 10000 * [{'foo': 1, 'bar': 10}] 566 | data = encoding_fn(to_json(input_data)) 567 | 568 | response = self.post_json('/dataset/abc', data, extra_headers={'Content-Encoding': content_encoding}) 569 | assert response.code == 201 570 | 571 | response = self.query_json('/dataset/abc', query={}, extra_headers={'Accept-Encoding': accept_encoding}) 572 | 573 | assert response.code == 200 574 | assert response.headers.get('Content-Encoding') == expected_encoding 575 | assert json.loads(decoding_fn(response.body)) == input_data 576 | 577 | def test_upload_gzip_accept_gzip(self): 578 | self.call_api_with_compression(accept_encoding='gzip', 579 | content_encoding='gzip', 580 | decoding_fn=qcache.compression.gzip_loads, 581 | encoding_fn=qcache.compression.gzip_dumps, 582 | expected_encoding='gzip') 583 | 584 | def test_upload_lz4_accept_lz4(self): 585 | self.call_api_with_compression(accept_encoding='lz4', 586 | content_encoding='lz4', 587 | decoding_fn=lz4.block.decompress, 588 | encoding_fn=lz4.block.compress, 589 | expected_encoding='lz4') 590 | 591 | def test_upload_lz4_accept_gzip(self): 592 | self.call_api_with_compression(accept_encoding='lz4', 593 | content_encoding='gzip', 594 | decoding_fn=lz4.block.decompress, 595 | encoding_fn=qcache.compression.gzip_dumps, 596 | expected_encoding='lz4') 597 | 598 | def 
test_prefer_lz4_if_multiple_supported_encodings_exists(self): 599 | self.call_api_with_compression(accept_encoding='compress,gzip,lz4', 600 | content_encoding='gzip', 601 | decoding_fn=lz4.block.decompress, 602 | encoding_fn=qcache.compression.gzip_dumps, 603 | expected_encoding='lz4') 604 | 605 | def test_unknown_accept_encoding_results_in_no_response_compression(self): 606 | self.call_api_with_compression(accept_encoding='foo,bar', 607 | content_encoding='lz4', 608 | decoding_fn=lambda x: x, 609 | encoding_fn=lz4.block.compress, 610 | expected_encoding=None) 611 | 612 | def test_upload_with_unknown_encoding_results_in_400(self): 613 | data = to_json([{'foo': 'bar'}]) 614 | response = self.post_json('/dataset/abc', data, extra_headers={'Content-Encoding': 'baz'}) 615 | assert response.code == 400 616 | assert 'Unrecognized encoding' in response.body 617 | 618 | def test_only_200_responses_are_compressed(self): 619 | data = to_json([{'foo': 'bar'}]) 620 | response = self.post_json('/dataset/abc', data) 621 | assert response.code == 201 622 | 623 | response = self.query_json('/dataset/non_present_dataset', query={}, extra_headers={'Accept-Encoding': 'lz4'}) 624 | assert response.code == 404 625 | assert response.headers.get('Content-Encoding') is None 626 | 627 | 628 | class TestStatistics(SharedTest): 629 | def test_store_and_query_durations(self): 630 | assert self.post_json('/dataset/abc', [{'foo': 123}]).code == 201 631 | assert self.query_json('/dataset/abc', query={}).code == 200 632 | 633 | stats = self.get_statistics() 634 | 635 | assert len(stats['query_durations']) == 1 636 | assert len(stats['store_durations']) == 1 637 | assert len(stats['query_request_durations']) == 1 638 | assert len(stats['store_request_durations']) == 1 639 | 640 | assert stats['query_durations'][0] < stats['query_request_durations'][0] 641 | assert stats['store_durations'][0] < stats['store_request_durations'][0] 642 | 643 | 644 | class SSLTestBase(AsyncHTTPTestCase): 645 | TLS_DIR = os.path.join(os.path.dirname(__file__), '../tls/') 646 | 647 | def get_app(self): 648 | return app.make_app(url_prefix='', debug=True) 649 | 650 | def get_protocol(self): 651 | return 'https' 652 | 653 | def get_url(self, path): 654 | """Returns an absolute url for the given path on the test server.""" 655 | return '%s://localhost:%s%s' % (self.get_protocol(), 656 | self.get_http_port(), path) 657 | 658 | def get_ssl_version(self): 659 | raise NotImplementedError() 660 | 661 | def get_httpserver_options(self): 662 | # By default don't require client certificate. Override in subclasses where client 663 | # certs are tested. 
664 | return app.ssl_options(certfile=self.TLS_DIR + 'host.pem') 665 | 666 | def fetch(self, path, **kwargs): 667 | if 'validate_cert' not in kwargs: 668 | ssl_context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, 669 | cafile=os.path.join(self.TLS_DIR, 'ca.pem')) 670 | 671 | if 'client_cert' in kwargs: 672 | ssl_context.load_cert_chain(kwargs['client_cert']) 673 | 674 | kwargs['ssl_options'] = ssl_context 675 | 676 | return super(SSLTestBase, self).fetch(path=path, **kwargs) 677 | 678 | 679 | class TestSSLServerWithSSL(SSLTestBase): 680 | def test_fetch_status(self): 681 | response = self.fetch('/status') 682 | assert response.code == 200 683 | 684 | def test_fetch_status_no_cert_validation(self): 685 | response = self.fetch('/status', validate_cert=False) 686 | assert response.code == 200 687 | 688 | 689 | class TestSSLServerWithSSLClientCertVerification(SSLTestBase): 690 | def get_httpserver_options(self): 691 | return app.ssl_options(certfile=self.TLS_DIR + 'host.pem', 692 | cafile=self.TLS_DIR + 'ca.pem') 693 | 694 | def test_fetch_status(self): 695 | response = self.fetch('/status', client_cert=self.TLS_DIR + 'host.pem') 696 | assert response.code == 200 697 | 698 | def test_fetch_status_no_client_cert_supplied(self): 699 | response = self.fetch('/status') 700 | assert response.code == 599 701 | 702 | 703 | class TestSSLServerWithoutSSL(SSLTestBase): 704 | def get_protocol(self): 705 | return 'http' 706 | 707 | def test_fetch_status(self): 708 | response = self.fetch('/status') 709 | assert response.code == 599 710 | 711 | 712 | class TestSSLServerWithSSLAndBasicAuth(SSLTestBase): 713 | def get_app(self): 714 | return app.make_app(url_prefix='', debug=True, basic_auth='foo:bar') 715 | 716 | def test_fetch_status_correct_credentials(self): 717 | response = self.fetch('/status', auth_username='foo', auth_password='bar') 718 | assert response.code == 200 719 | 720 | def test_fetch_status_incorrect_password(self): 721 | response = self.fetch('/status', auth_username='foo', auth_password='ba') 722 | assert response.code == 401 723 | 724 | def test_fetch_status_unknown_user(self): 725 | response = self.fetch('/status', auth_username='fo', auth_password='bar') 726 | assert response.code == 401 727 | 728 | def test_fetch_status_missing_credentials(self): 729 | response = self.fetch('/status') 730 | assert response.code == 401 731 | 732 | def test_fetch_data_missing_credentials(self): 733 | response = self.fetch('/dataset/XYZ') 734 | assert response.code == 401 735 | 736 | def test_fetch_data_correct_credentials(self): 737 | url = url_concat('/dataset/XYZ', {'q': json.dumps('{}')}) 738 | response = self.fetch(url, auth_username='foo', auth_password='bar') 739 | assert response.code == 404 740 | 741 | def test_fetch_statistics_missing_credentials(self): 742 | response = self.fetch('/statistics') 743 | assert response.code == 401 744 | 745 | def test_fetch_statistics_correct_credentials(self): 746 | response = self.fetch('/statistics', auth_username='foo', auth_password='bar') 747 | assert response.code == 200 748 | 749 | # Delete against a Q endpoint is a 404 750 | # Get against a Q endpoint is a 404 751 | -------------------------------------------------------------------------------- /test/test_qframe.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import json 3 | from contextlib import contextmanager 4 | import pytest 5 | import time 6 | 7 | from qcache.qframe import MalformedQueryException, QFrame 8 | 9 | 10 | def 
query(df, q): 11 | return QFrame(df).query(q).df 12 | 13 | ######################### Filtering ########################## 14 | 15 | 16 | @pytest.fixture 17 | def basic_frame(): 18 | data = """ 19 | foo,bar,baz,qux 20 | bbb,1.25,5,qqq 21 | aaa,3.25,7,qqq 22 | ccc,,9,www""" 23 | 24 | return QFrame.from_csv(data) 25 | 26 | 27 | def assert_rows(qframe, rows, column='foo'): 28 | frame = qframe.df 29 | assert len(frame) == len(rows) 30 | 31 | for ix, row in enumerate(rows): 32 | assert frame.iloc[ix][column] == row 33 | 34 | 35 | @pytest.mark.parametrize("operation, column, value, expected", [ 36 | ("<", 'bar', 2, 'bbb'), 37 | (">", 'bar', 2, 'aaa'), 38 | (">", 'foo', "'bbb'", 'ccc'), 39 | ("<=", 'baz', 6, 'bbb'), 40 | ("<=", 'baz', 5, 'bbb'), 41 | (">=", 'foo', "'bbc'", 'ccc'), 42 | (">=", 'foo', "'ccc'", 'ccc'), 43 | ("==", 'foo', "'ccc'", 'ccc'), 44 | ("!=", 'qux', "'qqq'", 'ccc'), 45 | ]) 46 | def test_filter_operations(basic_frame, operation, column, value, expected): 47 | frame = basic_frame.query({'where': [operation, column, value]}) 48 | assert_rows(frame, [expected]) 49 | 50 | 51 | def test_negation(basic_frame): 52 | frame = basic_frame.query({'where': ["!", ["==", "qux", "'qqq'"]]}) 53 | assert_rows(frame, ['ccc']) 54 | 55 | 56 | def test_and(basic_frame): 57 | frame = basic_frame.query({'where': ["&", ["==", "qux", "'qqq'"], [">", "baz", 6]]}) 58 | assert_rows(frame, ['aaa']) 59 | 60 | 61 | def test_and_with_only_one_clause(basic_frame): 62 | frame = basic_frame.query({'where': ["&", ["==", "foo", "'aaa'"]]}) 63 | assert_rows(frame, ['aaa']) 64 | 65 | frame = basic_frame.query({'where': ["&", ["==", "foo", "'abc'"]]}) 66 | assert_rows(frame, []) 67 | 68 | 69 | def test_or(basic_frame): 70 | frame = basic_frame.query({'where': ["|", ["==", "baz", 5], ["==", "baz", 7]]}) 71 | assert_rows(frame, ['bbb', 'aaa']) 72 | 73 | 74 | def test_or_with_only_one_clause(basic_frame): 75 | frame = basic_frame.query({'where': ["|", ["==", "foo", "'aaa'"]]}) 76 | assert_rows(frame, ['aaa']) 77 | 78 | frame = basic_frame.query({'where': ["|", ["==", "foo", "'abc'"]]}) 79 | assert_rows(frame, []) 80 | 81 | 82 | def test_col_in_list(basic_frame): 83 | frame = basic_frame.query({'where': ["in", "baz", [5, 8, -2]]}) 84 | assert_rows(frame, ['bbb']) 85 | 86 | 87 | def test_null_value(basic_frame): 88 | frame = basic_frame.query({'where': ["isnull", "bar"]}) 89 | assert_rows(frame, ['ccc']) 90 | 91 | 92 | @pytest.mark.skipif(True, reason='This should work I think, but it does not...') 93 | def test_string_in_col(basic_frame): 94 | frame = basic_frame.query({'where': ["contains", "foo", "'bb'"]}) 95 | assert_rows(frame, ['bbb']) 96 | 97 | 98 | def test_unknown_column_name(basic_frame): 99 | with pytest.raises(MalformedQueryException): 100 | basic_frame.query({'where': ["==", "unknown", 3]}) 101 | 102 | 103 | def test_invalid_column_name(basic_frame): 104 | with pytest.raises(MalformedQueryException): 105 | basic_frame.query({'where': ["==", "", 3]}) 106 | 107 | 108 | def test_empty_filter_returns_same_frame(basic_frame): 109 | assert basic_frame.query({'where': []}).df.equals(basic_frame.df) 110 | 111 | 112 | def test_empty_filter_clause_not_allowed(basic_frame): 113 | with pytest.raises(MalformedQueryException): 114 | basic_frame.query({'where': ["|", []]}) 115 | 116 | 117 | @pytest.mark.parametrize("operation", ["!", "isnull"]) 118 | def test_single_argument_operators_require_single_argument(basic_frame, operation): 119 | with pytest.raises(MalformedQueryException): 120 | basic_frame.query({'where': 
[operation, 'foo', 'bar']})
121 | 
122 | 
123 | @pytest.mark.parametrize("operation", ["<", ">", "<=", ">=", "==", "!=", "in"])
124 | def test_double_argument_operators_require_two_arguments(basic_frame, operation):
125 |     with pytest.raises(MalformedQueryException):
126 |         basic_frame.query({'where': [operation, 'foo']})
127 | 
128 |     with pytest.raises(MalformedQueryException):
129 |         basic_frame.query({'where': [operation, 'foo', 'bar', 'baz']})
130 | 
131 | 
132 | @pytest.mark.parametrize("operation", ["&", "|"])
133 | def test_and_or_requires_at_least_one_argument(basic_frame, operation):
134 |     with pytest.raises(MalformedQueryException):
135 |         basic_frame.query({'where': [operation]})
136 | 
137 | 
138 | @pytest.fixture
139 | def bitwise_frame():
140 |     data = """foo,bar,baz
141 | 1,1.5,abc
142 | 2,1.5,def
143 | 3,1.5,ghi
144 | 4,1.5,ijk
145 | 5,1.5,lmn"""
146 | 
147 |     return QFrame.from_csv(data)
148 | 
149 | 
150 | @pytest.mark.parametrize("filter, expected_rows", [
151 |     (1, [1, 3, 5]),
152 |     (2, [2, 3]),
153 |     (3, [3]),
154 |     (4, [4, 5]),
155 |     (5, [5]),
156 |     (6, []),
157 | ])
158 | def test_bitwise_all_bits_with_constant(filter, expected_rows, bitwise_frame):
159 |     result = bitwise_frame.query({'where': ["all_bits", "foo", filter]})
160 |     assert_rows(result, expected_rows)
161 | 
162 | 
163 | @pytest.mark.parametrize("filter, expected_rows", [
164 |     (1, [1, 3, 5]),
165 |     (2, [2, 3]),
166 |     (3, [1, 2, 3, 5]),
167 |     (4, [4, 5]),
168 |     (5, [1, 3, 4, 5]),
169 |     (6, [2, 3, 4, 5]),
170 |     (8, []),
171 | ])
172 | def test_bitwise_any_bits_with_constant(filter, expected_rows, bitwise_frame):
173 |     result = bitwise_frame.query({'where': ["any_bits", "foo", filter]})
174 |     assert_rows(result, expected_rows)
175 | 
176 | 
177 | def test_bitwise_invalid_arg(bitwise_frame):
178 |     with pytest.raises(MalformedQueryException):
179 |         bitwise_frame.query({'where': ["any_bits", "foo", 1.3]})
180 | 
181 | 
182 | def test_bitwise_invalid_column_type(bitwise_frame):
183 |     with pytest.raises(MalformedQueryException):
184 |         bitwise_frame.query({'where': ["any_bits", "baz", 1]})
185 | 
186 | 
187 | def test_bitwise_column_missing(bitwise_frame):
188 |     with pytest.raises(MalformedQueryException):
189 |         bitwise_frame.query({'where': ["any_bits", "dont_exist", 1]})
190 | 
191 | 
192 | def test_bitwise_invalid_filter_length(bitwise_frame):
193 |     with pytest.raises(MalformedQueryException):
194 |         bitwise_frame.query({'where': ["any_bits", "foo", 1, 2]})
195 | 
196 | 
197 | @pytest.fixture
198 | def string_frame():
199 |     data = """foo,bar
200 | 1,abcd
201 | 2,defg
202 | 3,ghij
203 | 4,gxyj"""
204 | 
205 |     return QFrame.from_csv(data)
206 | 
207 | 
208 | @pytest.mark.parametrize("operator, filter, expected_rows", [
209 |     ("like", "'a%'", [1]),
210 |     ("like", "'%g'", [2]),
211 |     ("like", "'%d%'", [1, 2]),
212 |     ("like", "'%cc%'", []),
213 |     ("like", "''", []),
214 |     ("like", "'%'", [1, 2, 3, 4]),
215 |     ("like", "'%%'", [1, 2, 3, 4]),
216 |     ("like", "'%D%'", []),
217 |     ("ilike", "'%D%'", [1, 2]),
218 |     ("like", "'%g[a-z]{2}j%'", [3, 4]),
219 |     ("like", "'%g[a-z]{3}j%'", []),
220 |     ("like", "'g[a-z]{2}j'", [3, 4]),
221 |     ("like", "'g[a-z]{2}'", []),
222 |     ("like", "'g[a-z]{2}%'", [3, 4]),
223 |     ("like", "'g[a-z]{3}'", [3, 4]),
224 | ])
225 | def test_like(operator, filter, expected_rows, string_frame):
226 |     result = string_frame.query({'where': [operator, "bar", filter]})
227 |     assert_rows(result, expected_rows)
228 | 
229 | 
230 | def test_like_missing_quotes_on_argument(string_frame):
231 |     with
pytest.raises(MalformedQueryException): 232 | string_frame.query({'where': ['like', "bar", "%abc%"]}) 233 | 234 | 235 | def test_like_invalid_argument_type(string_frame): 236 | with pytest.raises(MalformedQueryException): 237 | string_frame.query({'where': ['like', "bar", 12]}) 238 | 239 | 240 | def test_like_invalid_column_type(string_frame): 241 | with pytest.raises(MalformedQueryException): 242 | string_frame.query({'where': ['like', "foo", "'%a%'"]}) 243 | 244 | 245 | ############### Sub select ################## 246 | 247 | 248 | @pytest.mark.parametrize("data", [ 249 | """foo,bar 250 | 1,1 251 | 2,1 252 | 3,2""", # Numbers 253 | """foo,bar 254 | 1,aa 255 | 2,aa 256 | 3,bb""", # Strings 257 | """foo,bar 258 | 1, 259 | 2, 260 | 3,bb""", # null/None 261 | ]) 262 | def test_sub_select(data): 263 | frame = QFrame.from_csv(data) 264 | 265 | result = frame.query({'where': ['in', 'bar', {'where': ['==', 'foo', 2]}]}) 266 | 267 | assert_rows(result, [1, 2]) 268 | 269 | 270 | def test_sub_select_in_column_missing_in_sub_select(): 271 | frame = QFrame.from_csv("""foo,bar 272 | 1,aa""") 273 | 274 | with pytest.raises(MalformedQueryException): 275 | frame.query({'where': ['in', 'bar', {'select': ['foo'], 276 | 'where': ['==', 'foo', 2]}]}) 277 | 278 | 279 | ############### Projections ####################### 280 | 281 | 282 | def test_select_subset(basic_frame): 283 | frame = basic_frame.query({'select': ['foo', 'baz']}) 284 | assert list(frame.columns) == ['foo', 'baz'] 285 | 286 | 287 | def test_select_subset_invalid_column(basic_frame): 288 | with pytest.raises(MalformedQueryException): 289 | basic_frame.query({'select': ['foof', 'baz']}) 290 | 291 | 292 | def test_select_distinct_without_columns(basic_frame): 293 | # Should not have any effect since all rows are unique with respect to all columns 294 | frame = basic_frame.query({'distinct': []}) 295 | assert_rows(frame, ['bbb', 'aaa', 'ccc']) 296 | 297 | 298 | def test_select_distinct_with_columns(basic_frame): 299 | frame = basic_frame.query({'distinct': ['qux']}) 300 | assert_rows(frame, ['bbb', 'ccc']) 301 | 302 | 303 | ################ Aggregation ##################### 304 | 305 | # TODO: More tests and error handling 306 | 307 | def test_basic_sum_aggregation(basic_frame): 308 | expected = QFrame.from_csv(""" 309 | qux,baz 310 | www,9 311 | qqq,12""") 312 | 313 | frame = basic_frame.query({ 314 | 'select': ['qux', ['sum', 'baz']], 315 | 'group_by': ['qux'], 316 | 'order_by': ['baz']}) 317 | 318 | assert frame.to_csv() == expected.to_csv() 319 | 320 | 321 | def test_basic_count_aggregation(basic_frame): 322 | expected = QFrame.from_csv(""" 323 | qux,baz 324 | qqq,2 325 | www,1""") 326 | 327 | frame = basic_frame.query({ 328 | 'select': ['qux', ['count', 'baz']], 329 | 'group_by': ['qux']}) 330 | 331 | assert frame.to_csv() == expected.to_csv() 332 | 333 | 334 | def test_unknown_aggregation_function(basic_frame): 335 | with pytest.raises(MalformedQueryException): 336 | basic_frame.query({ 337 | 'select': ['qux', ['foo_bar', 'baz']], 338 | 'group_by': ['qux']}) 339 | 340 | 341 | def test_missing_aggregation_function(basic_frame): 342 | with pytest.raises(MalformedQueryException): 343 | basic_frame.query({ 344 | 'select': ['qux'], 345 | 'group_by': ['qux']}) 346 | 347 | 348 | def test_count_without_aggregation(basic_frame): 349 | expected = QFrame.from_csv(""" 350 | count 351 | 3""") 352 | 353 | frame = basic_frame.query({'select': [['count']]}) 354 | assert frame.to_csv() == expected.to_csv() 355 | 356 | 357 | def 
test_max_without_aggregation(basic_frame): 358 | expected = QFrame.from_csv(""" 359 | baz 360 | 9""") 361 | 362 | frame = basic_frame.query({'select': [['max', 'baz']]}) 363 | assert frame.to_csv() == expected.to_csv() 364 | 365 | 366 | ############### Ordering ################ 367 | 368 | 369 | def test_single_column_ascending_ordering(basic_frame): 370 | frame = basic_frame.query({'order_by': ['foo']}) 371 | assert_rows(frame, ['aaa', 'bbb', 'ccc']) 372 | 373 | 374 | def test_single_column_descending_ordering(basic_frame): 375 | frame = basic_frame.query({'order_by': ['-foo']}) 376 | assert_rows(frame, ['ccc', 'bbb', 'aaa']) 377 | 378 | 379 | def test_sort_on_unknown_column(basic_frame): 380 | with pytest.raises(MalformedQueryException): 381 | basic_frame.query({'order_by': ['foof']}) 382 | 383 | 384 | ############## Slicing ################## 385 | 386 | 387 | def test_offset_and_limit(basic_frame): 388 | frame = basic_frame.query({"offset": 1, "limit": 1}) 389 | assert_rows(frame, ['aaa']) 390 | assert frame.unsliced_df_len == 3 391 | 392 | 393 | ############## Unicode ################# 394 | 395 | 396 | def test_unicode_content_from_csv(): 397 | data = u"""foo,bar 398 | aaa,Iñtërnâtiônàližætiøn 399 | bbb,räksmörgås 400 | ccc,""" 401 | 402 | input_frame = QFrame.from_csv(data) 403 | frame = input_frame.query({'where': ["==", "bar", u"'räksmörgås'"]}) 404 | 405 | assert_rows(frame, ['bbb']) 406 | 407 | 408 | def test_unicode_content_from_dicts(): 409 | data = [{'foo': 'aaa', 'bar': u'Iñtërnâtiônàližætiøn'}, 410 | {'foo': 'bbb', 'bar': u'räksmörgås'.encode(encoding='utf-8')}] 411 | input_frame = QFrame.from_dicts(data) 412 | frame = input_frame.query({'where': ["==", "bar", u"'räksmörgås'"]}) 413 | 414 | assert_rows(frame, ['bbb']) 415 | 416 | 417 | @pytest.fixture 418 | def calculation_frame(): 419 | data = """ 420 | foo,bar 421 | 1,10 422 | 1,11 423 | 2,20 424 | 3,30 425 | 3,33""" 426 | 427 | return QFrame.from_csv(data) 428 | 429 | 430 | def test_column_aliasing(calculation_frame): 431 | frame = calculation_frame.query({"select": [["=", "baz", "foo"]]}) 432 | 433 | assert frame.to_dicts() == [ 434 | {"baz": 1}, 435 | {"baz": 1}, 436 | {"baz": 2}, 437 | {"baz": 3}, 438 | {"baz": 3} 439 | ] 440 | 441 | 442 | def test_constant_int_aliasing(calculation_frame): 443 | frame = calculation_frame.query({"select": [["=", "baz", 55]], 444 | "limit": 2}) 445 | 446 | assert frame.to_dicts() == [ 447 | {"baz": 55}, 448 | {"baz": 55}, 449 | ] 450 | 451 | 452 | def test_constant_string_aliasing(calculation_frame): 453 | frame = calculation_frame.query({"select": [["=", "baz", "'qux'"]], 454 | "limit": 2}) 455 | 456 | assert frame.to_dicts() == [ 457 | {"baz": "qux"}, 458 | {"baz": "qux"}, 459 | ] 460 | 461 | 462 | def test_alias_as_sum_of_two_other_columns(calculation_frame): 463 | frame = calculation_frame.query({"select": [["=", "baz", ["+", "bar", "foo"]]], 464 | "limit": 2}) 465 | 466 | assert frame.to_dicts() == [ 467 | {"baz": 11}, 468 | {"baz": 12}, 469 | ] 470 | 471 | 472 | def test_alias_as_nested_expression(calculation_frame): 473 | frame = calculation_frame.query({"select": [["=", "baz", ["+", ["*", "bar", 2], "foo"]]], 474 | "limit": 2}) 475 | 476 | assert frame.to_dicts() == [ 477 | {"baz": 21}, 478 | {"baz": 23}, 479 | ] 480 | 481 | 482 | def test_alias_with_single_argument_function(calculation_frame): 483 | frame = calculation_frame.query({"select": [["=", "baz", ["sqrt", ["+", 3, "foo"]]]], 484 | "limit": 1}) 485 | 486 | assert frame.to_dicts() == [{"baz": 2}] 487 | 488 | 489 |
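# A summary of the calculated-column grammar exercised above, inferred from
# these tests rather than from a formal spec: a select entry ["=", target, expr]
# aliases `target` to `expr`, where `expr` may be a column name, a numeric or
# quoted string constant, or a nested [function, arg, ...] list built from the
# same pieces. A hypothetical combined example (not part of the original suite):
#
#   frame = calculation_frame.query(
#       {"select": [["=", "baz", ["+", ["*", "foo", 10], "bar"]]], "limit": 1})
#   assert frame.to_dicts() == [{"baz": 20}]  # first row: 1 * 10 + 10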
@pytest.fixture 490 | def frame_with_zero(): 491 | data = """ 492 | foo,bar 493 | 1,0 494 | 1,11""" 495 | 496 | return QFrame.from_csv(data) 497 | 498 | 499 | def test_alias_with_division_by_zero(frame_with_zero): 500 | frame = frame_with_zero.query({"select": [["=", "baz", ["/", "foo", "bar"]]], 501 | "limit": 1}) 502 | 503 | assert frame.to_dicts() == [{"baz": float("inf")}] 504 | 505 | 506 | def test_invalid_alias_target_string_with_invalid_character(calculation_frame): 507 | with pytest.raises(MalformedQueryException): 508 | calculation_frame.query({"select": [["=", "ba/r", 1]]}) 509 | 510 | 511 | def test_invalid_alias_target_non_string(calculation_frame): 512 | with pytest.raises(MalformedQueryException): 513 | calculation_frame.query({"select": [["=", 23, 1]]}) 514 | 515 | 516 | def test_aliasing_does_not_overwrite_original_qframe(calculation_frame): 517 | frame = calculation_frame.query({"select": [["=", "baz", "foo"]]}) 518 | assert list(frame.columns.values) == ['baz'] 519 | assert 'baz' not in list(calculation_frame.df.columns.values) 520 | 521 | 522 | def test_cannot_mix_aliasing_and_aggregation_expressions(calculation_frame): 523 | with pytest.raises(MalformedQueryException): 524 | calculation_frame.query({"select": [["=", "bar", 1], ["max", "foo"]], 525 | "group_by": ["bar"]}) 526 | 527 | 528 | def test_aliasing_with_wrong_number_of_parameters_in_function(calculation_frame): 529 | with pytest.raises(MalformedQueryException): 530 | calculation_frame.query({"select": [["=", "baz", ["+", "bar", "foo", "foo"]]]}) 531 | 532 | 533 | def test_aliasing_with_unknown_function(calculation_frame): 534 | with pytest.raises(MalformedQueryException): 535 | calculation_frame.query({"select": [["=", "baz", ["?", "bar", "foo"]]]}) 536 | 537 | 538 | def test_aliasing_with_unknown_function_2(calculation_frame): 539 | with pytest.raises(MalformedQueryException): 540 | calculation_frame.query({"select": [["=", "baz", ["zin", "bar"]]]}) 541 | 542 | 543 | def test_aliasing_with_invalid_arity(calculation_frame): 544 | with pytest.raises(MalformedQueryException): 545 | calculation_frame.query({"select": [["=", "baz", ["zin", "bar"], "foobar"]]}) 546 | 547 | 548 | def test_multiple_aggregation_functions_without_group_by(calculation_frame): 549 | frame = calculation_frame.query({"select": [["max", "bar"], ["min", "foo"]]}) 550 | assert frame.to_dicts() == [{"bar": 33, "foo": 1}] 551 | 552 | 553 | def test_cannot_mix_aggregation_functions_and_columns_without_group_by(calculation_frame): 554 | with pytest.raises(MalformedQueryException): 555 | calculation_frame.query({"select": [["max", "bar"], "foo"]}) 556 | 557 | 558 | ################# Sub queries ################### 559 | 560 | 561 | @pytest.fixture 562 | def subselect_frame(): 563 | data = """ 564 | foo,bar 565 | 1,10 566 | 1,15 567 | 5,50""" 568 | 569 | return QFrame.from_csv(data) 570 | 571 | 572 | def test_alias_aggregation_from_sub_select(subselect_frame): 573 | frame = subselect_frame.query({"select": [["=", "foo_pct", 574 | ["*", 100, ["/", "foo", "bar"]]]], 575 | "from": 576 | {"select": ["foo", ["sum", "bar"]], 577 | "group_by": ["foo"]}}) 578 | 579 | assert frame.to_dicts() == [ 580 | {"foo_pct": 4.0}, 581 | {"foo_pct": 10.0} 582 | ] 583 | 584 | 585 | ################ Enums ######################## 586 | 587 | @pytest.fixture 588 | def enum_data(): 589 | return """ 590 | foo,bar 591 | ccc,10 592 | ccc,11 593 | ccc,12 594 | ccc,13 595 | ccc,14 596 | ccc,15 597 | ccc,16 598 | bbb,20 599 | aaa,25""" 600 | 601 | 602 | @pytest.fixture 603 | def 
enum_frame(enum_data): 604 | return QFrame.from_csv(enum_data, column_types={'foo': 'category'}) 605 | 606 | 607 | def test_enum_basic_sorting(enum_frame): 608 | assert enum_frame.query({'order_by': ['foo']}).to_dicts() == [ 609 | {'foo': 'aaa', 'bar': 25}, 610 | {'foo': 'bbb', 'bar': 20}, 611 | {'foo': 'ccc', 'bar': 10}, 612 | {'foo': 'ccc', 'bar': 11}, 613 | {'foo': 'ccc', 'bar': 12}, 614 | {'foo': 'ccc', 'bar': 13}, 615 | {'foo': 'ccc', 'bar': 14}, 616 | {'foo': 'ccc', 'bar': 15}, 617 | {'foo': 'ccc', 'bar': 16}, 618 | ] 619 | 620 | 621 | def test_enum_filter_by_equality(enum_frame): 622 | assert enum_frame.query({'where': ['==', 'foo', '"bbb"']}).to_dicts() == [ 623 | {'foo': 'bbb', 'bar': 20}, 624 | ] 625 | 626 | 627 | def test_enum_filter_by_order_comparison_not_possible(enum_frame): 628 | with pytest.raises(MalformedQueryException): 629 | enum_frame.query({'where': ['<', 'foo', '"bbb"']}) 630 | 631 | 632 | def test_enum_size(enum_frame, enum_data): 633 | # Space savings should be possible using categoricals 634 | # when multiple rows containing the same value exist. 635 | frame = QFrame.from_csv(enum_data) 636 | assert enum_frame.byte_size() < frame.byte_size() 637 | 638 | 639 | def test_enum_from_dicts(enum_frame): 640 | cat_frame = QFrame.from_dicts(enum_frame.to_dicts(), column_types={'foo': 'category'}) 641 | frame = QFrame.from_dicts(enum_frame.to_dicts()) 642 | 643 | assert cat_frame.byte_size() < frame.byte_size() 644 | 645 | 646 | ############# NaN ############### 647 | 648 | 649 | def test_like_ignores_nan_values(): 650 | f = QFrame.from_csv(""" 651 | foo,bar 652 | aaa,xyz 653 | bbb,""") 654 | 655 | assert f.query({'where': ['ilike', 'bar', '"ccc"']}).to_dicts() == [] 656 | 657 | 658 | def test_only_empty_string_is_nan(): 659 | f = QFrame.from_csv(""" 660 | foo,bar 661 | aaa,N/A 662 | aaa,n/a 663 | aaa,NA 664 | aaa,na 665 | aaa,nan 666 | aaa,NaN 667 | aaa,-NaN 668 | aaa,null 669 | aaa,NULL 670 | bbb,""") 671 | 672 | assert json.loads(f.query({'select': ['bar']}).to_json()) == [ 673 | {"bar": "N/A"}, 674 | {"bar": "n/a"}, 675 | {"bar": "NA"}, 676 | {"bar": "na"}, 677 | {"bar": "nan"}, 678 | {"bar": "NaN"}, 679 | {"bar": "-NaN"}, 680 | {"bar": "null"}, 681 | {"bar": "NULL"}, 682 | {"bar": None}, 683 | ] 684 | 685 | 686 | ################# Update ###################### 687 | 688 | 689 | def assert_column(column, frame, expected): 690 | assert [d[column] for d in frame.to_dicts()] == expected 691 | 692 | 693 | def test_basic_update(basic_frame): 694 | basic_frame.query({'update': [['bar', 2.0], ['baz', 0]], 695 | 'where': ['==', 'foo', '"bbb"']}) 696 | 697 | assert basic_frame.to_dicts()[0]['bar'] == 2.0 698 | assert basic_frame.to_dicts()[0]['baz'] == 0 699 | 700 | 701 | def test_basic_update_function_based_on_current_value_of_column(basic_frame): 702 | basic_frame.query({'update': [['+', 'bar', 2.0]], 703 | 'where': ['==', 'foo', '"bbb"']}) 704 | 705 | assert basic_frame.to_dicts()[0]['bar'] == 3.25 706 | 707 | 708 | def test_unknown_update_function(basic_frame): 709 | with pytest.raises(MalformedQueryException): 710 | basic_frame.query({'update': [['_', 'bar', 2.0]], 711 | 'where': ['==', 'foo', '"bbb"']}) 712 | 713 | 714 | def test_update_is_null(basic_frame): 715 | basic_frame.query({'update': [['baz', 19]], 716 | 'where': ['isnull', 'bar']}) 717 | 718 | assert_column('baz', basic_frame, [5, 7, 19]) 719 | 720 | 721 | def test_update_is_null_invalid_argument_number(basic_frame): 722 | with pytest.raises(MalformedQueryException): 723 | basic_frame.query({'update': [['baz',
19]], 724 | 'where': ['isnull', 9]}) 725 | 726 | 727 | def test_update_in(basic_frame): 728 | basic_frame.query({'update': [['baz', 19]], 729 | 'where': ['in', 'foo', ["'aaa'", "'bbb'"]]}) 730 | 731 | assert_column('baz', basic_frame, [19, 19, 9]) 732 | 733 | 734 | def test_update_in_invalid_arg_count(basic_frame): 735 | with pytest.raises(MalformedQueryException): 736 | basic_frame.query({'update': [['baz', 19]], 737 | 'where': ['in', 'foo', 'bar', ["'aaa'", "'bbb'"]]}) 738 | 739 | 740 | def test_update_in_unknown_column(basic_frame): 741 | with pytest.raises(MalformedQueryException): 742 | basic_frame.query({'update': [['baz', 19]], 743 | 'where': ['in', 'unknown', ["'aaa'", "'bbb'"]]}) 744 | 745 | 746 | def test_update_in_second_arg_not_a_list(basic_frame): 747 | with pytest.raises(MalformedQueryException): 748 | basic_frame.query({'update': [['baz', 19]], 749 | 'where': ['in', 'foo', 'boo']}) 750 | 751 | 752 | def test_unknown_clause_in_query(basic_frame): 753 | try: 754 | basic_frame.query({'foo': []}) 755 | assert False 756 | except MalformedQueryException as e: 757 | print str(e) 758 | assert 'foo' in str(e) 759 | 760 | 761 | ################### Performance #################### 762 | 763 | 764 | @pytest.fixture 765 | def large_frame(): 766 | d = 1000000 * [{'aaa': 123456789, 'bbb': 'abcdefghijklmnopqrvwxyz', 'ccc': 1.23456789}] 767 | return QFrame.from_dicts(d) 768 | 769 | 770 | @contextmanager 771 | def timeit(name): 772 | t0 = time.time() 773 | yield 774 | print('\n{name} duration: {duration} s'.format(name=name, duration=time.time()-t0)) 775 | 776 | 777 | @pytest.mark.benchmark 778 | def test_large_frame_csv(large_frame): 779 | with timeit('to_csv'): 780 | csv_string = large_frame.to_csv() 781 | 782 | with timeit('from_csv'): 783 | QFrame.from_csv(csv_string) 784 | 785 | # Results: 786 | # to_csv duration: 2.43983101845 s 787 | # from_csv duration: 0.532874107361 s 788 | 789 | 790 | @pytest.mark.benchmark 791 | def test_large_frame_json(large_frame): 792 | with timeit('to_json'): 793 | large_frame.to_json() 794 | 795 | # with timeit('from_json'): 796 | # QFrame.from_json(json_string) 797 | 798 | # to_json duration: 0.792788982391 s 799 | # from_json duration: 3.07192707062 s (this implementation no longer exists) 800 | 801 | 802 | @pytest.mark.benchmark 803 | @pytest.mark.skipif(True, reason="No implementation") 804 | def test_large_frame_msgpack(large_frame): 805 | # NOTE: This implementation does not exist but once did as an experiment. 806 | # This test is left as a reference and reminder. 807 | with timeit('to_msgpack'): 808 | msgpack_string = large_frame.to_msgpack() 809 | 810 | with timeit('from_msgpack'): 811 | QFrame.from_msgpack(msgpack_string) 812 | 813 | # These numbers explain why there is no msgpack implementation 814 | # to_msgpack duration: 7.02977800369 s 815 | # from_msgpack duration: 1.52387404442 s 816 | 817 | # It's not because msgpack is slow (it's fast), it's because the 818 | # code has to first create a list of python dicts and then serialize 819 | # that using msgpack rather than serializing the dataframe to msgpack 820 | # directly. 821 | 822 | # Not yet covered: 823 | # Disjunction and conjunction (see the sketch below) 824 | # Refactor tests to check the complete column, not just the row that is supposed to be affected 825 | # Mix self-referring updates and assignments in same update 826 | # Any way to merge the filter code for select and update (is the update version as performant as the where)?
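# A sketch of what the conjunction case could look like once covered, reusing
# the '&' operator that the where clause already supports for selects
# (hypothetical expected values, mirroring test_basic_update above):
#
#   basic_frame.query({'update': [['bar', 2.0]],
#                      'where': ['&', ['==', 'foo', '"bbb"'], ['<', 'baz', 6]]})
#   assert basic_frame.to_dicts()[0]['bar'] == 2.0  # other rows unchanged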
827 | 828 | 829 | def xtest_update_with_conjunction(basic_frame): 830 | basic_frame.query({'update': [['bar', 2.0]], 831 | 'where': ['==', 'foo', '"bbb"']}) 832 | 833 | assert basic_frame.to_dicts()[0]['bar'] == 3.25 834 | -------------------------------------------------------------------------------- /test/test_statistics.py: -------------------------------------------------------------------------------- 1 | from qcache.statistics import Statistics 2 | 3 | 4 | def test_ring_buffer_size(): 5 | s = Statistics(buffer_size=3) 6 | s.append('foo', 1) 7 | s.append('foo', 2) 8 | s.append('foo', 3) 9 | 10 | assert list(s.stats['foo']) == [1, 2, 3] 11 | 12 | s.append('foo', 4) 13 | assert list(s.stats['foo']) == [2, 3, 4] 14 | -------------------------------------------------------------------------------- /tls/ca-conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "signing": { 3 | "default": { 4 | "expiry": "2000000h", 5 | "usages": [ 6 | "signing", 7 | "key encipherment", 8 | "server auth", 9 | "client auth" 10 | ] 11 | } 12 | } 13 | } -------------------------------------------------------------------------------- /tls/ca-key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEowIBAAKCAQEA5lS6l+dBf1DfkY4RoUXnennEMkzTD+SdRnGMUt3LvnkPjHdz 3 | pb0YQFQ+xLUEEosq4B9S74l0GFM+Y6/vrg+lhpDu8VfWNv4Py7j8tPDA8sLzFqbu 4 | pi+uRLu5FQPd0+HER+vcDvFWrcEM7YNHBHZC+6WH7nnRewzKgQgaQSc9W5megV3h 5 | ToHvAGrcwMK78dQEWtG8+QaprREFEFE279q7K7pTychDMidA+LvW1nSOImwapBhv 6 | lH04PKJ26eWx6SbC3ssS6uMgjJJZVsJrRs4vUz3D86gDbm2/+uSBPBh7zumuKTSH 7 | f78pFIOSd5Ayj9YAnlyft4laGfOFYj0nWyH+SQIDAQABAoIBACdQOvEGhcVnVmGP 8 | sC6osdPTyMsVFclWqgmGA/943/fIzgCZuuGFHwiL2JLWmrVXj5/leNBt06T7QGDf 9 | fLdm8EfBoScMaQHF54hMMMXpeeV1pOwu3fj7lnEg4XxWxpwNouTruwnJ45OQqdY4 10 | W/zE+rXdERCBNmIcUswnR+PrAKwLf2tAH8g9DN0jqilO8KJixmyYM/stfpEC08Q2 11 | N8wLY8zcHMEX6+IIgG4Ok76fKhGQ/gJKaow8fVIAw35Qe0BpV30fbS+mhjHVuPnl 12 | YW1idBDPItCGvBJs3kVS+tTZB9AFmJbHZj9GwOo+0D6ZVPSeXUDFy+czVHI9wL2T 13 | JZtrcgECgYEA/bfGb/siIZ/yG6YND2iSQNO3ip3QLvRzRhzLZCfTdfOax4dPUaFh 14 | xf9ZDB7Qn9Ki7JL4iSpCxbjOUhjAOwjE5N+hdGlqwMWUSJ4eGNQM0Vb98VDLrgsq 15 | FkOxatvUCaW0nqzIQdYMLkef0TnYQw0M+j3hg+t+9dcJzA4zS7A2TakCgYEA6Gca 16 | DQgyyJy9nFPaYk9qTH2RR+3I//NPI3ETBzrAGWkCWsDJS8g814Ln/dERibBuVU86 17 | D+ougtgXk6agDOUhAuC/1YtRVadSCZbyJityMeydHR033OP/fkYpPjOBiSjyOFn1 18 | D0VYWQVgvt/Z0RLvbh1m0PvMPfXf/3cxj4lkT6ECgYAFNvjCJnQ+Iq50OQZ9sZWH 19 | 9ZIJLFMyE94mq8LWbScgfoBI55QOxnVe/2+SGzQIhOjKWf73usGilLjQ4SdaT0TU 20 | u2/zF5OVILp6f514vysARnxzsEhvbFVSHdQQsTH7fMdol36KM98OOHSldT3nquYA 21 | YrM25ek3HlNaOVR+ksGa+QKBgQDEUFNxLru3Oq/wneSbpvnkIy3V1Mc1bhIrnhi0 22 | wqwCyvFyN+fSXBMI+Ut+3Fw0MxUAeyxQxUEExgUkdFw+iE6aX7+sY0MRV4W1FAz4 23 | sTqFcZpGPagyr2XjBOFR6bBCbJQvhc28WJeIm0Jd+jnEonoeSjfP0ON2c3wEEGuN 24 | FEHoAQKBgBF7DEcTo0xeUan/FCLYfdg8Hg/VnEYJUYttg4nOG7dVj35wvCnRTZ+9 25 | mmKpAjnh8Nqk2gTJHBBtzZ3I4L2uoUZocJnOfPjduhRi2A9KIrU0gg2clKKE+xAp 26 | DeEvLUKeaf4hPFRdAgy+yS9+pYwZdObMtr8SWm3FgRNmt4LRrenC 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /tls/ca.csr: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE REQUEST----- 2 | MIIC2zCCAcMCAQAwaTELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWEx 3 | FjAUBgNVBAcTDVNhbiBGcmFuY2lzY28xHzAdBgNVBAoTFkludGVybmV0IFdpZGdl 4 | dHMsIEluYy4xDDAKBgNVBAsTA1dXVzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCC 5 | 
AQoCggEBAOZUupfnQX9Q35GOEaFF53p5xDJM0w/knUZxjFLdy755D4x3c6W9GEBU 6 | PsS1BBKLKuAfUu+JdBhTPmOv764PpYaQ7vFX1jb+D8u4/LTwwPLC8xam7qYvrkS7 7 | uRUD3dPhxEfr3A7xVq3BDO2DRwR2Qvulh+550XsMyoEIGkEnPVuZnoFd4U6B7wBq 8 | 3MDCu/HUBFrRvPkGqa0RBRBRNu/auyu6U8nIQzInQPi71tZ0jiJsGqQYb5R9ODyi 9 | dunlsekmwt7LEurjIIySWVbCa0bOL1M9w/OoA25tv/rkgTwYe87prik0h3+/KRSD 10 | kneQMo/WAJ5cn7eJWhnzhWI9J1sh/kkCAwEAAaAtMCsGCSqGSIb3DQEJDjEeMBww 11 | GgYDVR0RBBMwEYIJbG9jYWxob3N0hwR/AAABMA0GCSqGSIb3DQEBCwUAA4IBAQDE 12 | eszCFstYVQlcetz3nY/MzJke+I6v4KCo6oyAzzB2OeMXedI0QFLyVqBKVMKaO1yB 13 | 49HmUbrubJaWJlJKNh067KnndcFgZiU1pTZ6yTC4UIVWxGOJoF3SFkgAjl57CVrE 14 | KdhaJH/+YM5c65ck6IzLH7EymYhnC0n4xDT6nP6kiq0b6jxjxx6P1Fz6+iw16Y5R 15 | 8f1HFA8RniZK5bmN1OY/ivPowJZKdobKRqWyIF9oynsiawRAOG20Z+Qo1P51bYNc 16 | jx3nc70vsZ1JfbkSLvrImC/7nLMIuxMS076m3e7WU1p2IQXKPf4tPD8VFz0gsANn 17 | yViBz/BDo+SoElTFV6yG 18 | -----END CERTIFICATE REQUEST----- 19 | -------------------------------------------------------------------------------- /tls/ca.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDszCCApugAwIBAgIUOjMlWz73JcqtOoRMekH2Ff8sj6owDQYJKoZIhvcNAQEL 3 | BQAwaTELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWExFjAUBgNVBAcT 4 | DVNhbiBGcmFuY2lzY28xHzAdBgNVBAoTFkludGVybmV0IFdpZGdldHMsIEluYy4x 5 | DDAKBgNVBAsTA1dXVzAeFw0xOTAxMDUxOTM5MDBaFw0yNDAxMDQxOTM5MDBaMGkx 6 | CzAJBgNVBAYTAlVTMRMwEQYDVQQIEwpDYWxpZm9ybmlhMRYwFAYDVQQHEw1TYW4g 7 | RnJhbmNpc2NvMR8wHQYDVQQKExZJbnRlcm5ldCBXaWRnZXRzLCBJbmMuMQwwCgYD 8 | VQQLEwNXV1cwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDmVLqX50F/ 9 | UN+RjhGhRed6ecQyTNMP5J1GcYxS3cu+eQ+Md3OlvRhAVD7EtQQSiyrgH1LviXQY 10 | Uz5jr++uD6WGkO7xV9Y2/g/LuPy08MDywvMWpu6mL65Eu7kVA93T4cRH69wO8Vat 11 | wQztg0cEdkL7pYfuedF7DMqBCBpBJz1bmZ6BXeFOge8AatzAwrvx1ARa0bz5Bqmt 12 | EQUQUTbv2rsrulPJyEMyJ0D4u9bWdI4ibBqkGG+UfTg8onbp5bHpJsLeyxLq4yCM 13 | kllWwmtGzi9TPcPzqANubb/65IE8GHvO6a4pNId/vykUg5J3kDKP1gCeXJ+3iVoZ 14 | 84ViPSdbIf5JAgMBAAGjUzBRMA4GA1UdDwEB/wQEAwIBBjAPBgNVHRMBAf8EBTAD 15 | AQH/MB0GA1UdDgQWBBS8CeQioSIBM+9UI/Fu+d4YjH9CYjAPBgNVHREECDAGhwR/ 16 | AAABMA0GCSqGSIb3DQEBCwUAA4IBAQBfJygLqN/A/rwjoD5gsdbALG26G1/Ei3wV 17 | V4HliQdExQBKCpb02TS2EPKZn0CYEW3WgnEuQ8TZMubwH07OrKFpQqjGnGHNbO7E 18 | u7thzvK1Sj2Wyr+Gml3EDRJw//cFTi11/Mu9zxj9uZyDt3z96y9GIrEZt1uAaNnf 19 | +HoWd5VnXVbZtzDlPxzSU/943XpHz5nPOdRw4zHYZ2ftcmL5ihecONRHPrTze5F1 20 | hyCIWbaTJSe7D90uO4RoA6jiCtiweF01SBVjN6ELT5Deyohxi2e+ctBffefIa7IJ 21 | hQ8fXTjRY+SOcCdYo3d7PnsnhMITysJYmnF3EPjKs8UZo3niuC3A 22 | -----END CERTIFICATE----- 23 | -------------------------------------------------------------------------------- /tls/csr.json: -------------------------------------------------------------------------------- 1 | { 2 | "hosts": [ 3 | "localhost", 4 | "127.0.0.1" 5 | ], 6 | "key": { 7 | "algo": "rsa", 8 | "size": 2048 9 | }, 10 | "names": [ 11 | { 12 | "C": "US", 13 | "L": "San Francisco", 14 | "O": "Internet Widgets, Inc.", 15 | "OU": "WWW", 16 | "ST": "California" 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /tls/generate_test_certs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Small script to generate certs for testing of TLS 4 | # Uses cfssl for cert generation: https://github.com/cloudflare/cfssl 5 | # go get -u github.com/cloudflare/cfssl/cmd/cfssl 6 | # go get -u github.com/cloudflare/cfssl/cmd/cfssljson 7 | 8 | cfssl genkey -initca csr.json | cfssljson -bare ca 9 | cfssl gencert -ca ca.pem -ca-key ca-key.pem 
-config ca-conf.json csr.json | cfssljson -bare host 10 | cat host-key.pem >> host.pem 11 | -------------------------------------------------------------------------------- /tls/host-key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEowIBAAKCAQEArqbqq7q9RFEB8qP+sY41rJN5S8s3SJesrneCuo2fpbJQnvqZ 3 | +d8JRhb48PpNQg1xUNzAZi2yGNgjD8qvkq29T/bQmqxCvd7sddZt5tIPfL6nGNea 4 | GXDobiIXgk3OuDwF6LINbdpaTi50s3mxiZEpFWabr9106ZYqvoWmliWegYOKPKf/ 5 | In1Bc/DvoeG5nc2xOT3VjzAz1D4PjdoEn2N99MDwzEkTVs4ErfG+bpsJtaIgNeJf 6 | M0V9TEiQ49/njQygBomdQodw9wy6d1JaaEG6T0sxSscTPeissgScU66JsJdzBiyb 7 | fhjsJFXceMWx52z7BOnpaT61HrY5AVLFmCZsowIDAQABAoIBAQCdsI+JZh2lJQ8B 8 | KRi475F+s9h8morbvDZpf/ZEwcI3NPydzd5gQQR6iBtUWGqRDNoXDHmax/9ZdyyR 9 | AYvsf3nCTTKjtIy+KhyNeIHaOidJlkoAoAm+lrcFWTqop9/RcEBVjQ2a9d87X652 10 | rZWig7H4ZKCE9QquLKuDQeDbCDRI8iCSUwHw69Os0o0xjVjwIEUfrrTAqdCW68db 11 | JxU+2qbJp6QLnQRA2tg6EEXTRuu1YydDQpVk6LWSQl4aDJvt+NmUdLQ9641oq9+U 12 | vEJStGRI/7dUvkIXHWmSjCWYUPufVFMnpiGCnVdOx+pT+7kyX4qkFkd+NRZCCRv7 13 | pjaaPYnhAoGBAOJYImG6o2ZUf6dpC7Qaju0ht/zUwzpnHhxyUEFJMn01E7n98vBS 14 | gwQjacuyEpuiRbZ2eCzXIVYUvJXRjZ6r49ICUjV3wCDcFJGgotovG5ROLDJ4uePZ 15 | j6pExC6PWOYkYBbMDMDcHINMPUMrSCt1q+IWLYwBnw0s3rrcU081G7MVAoGBAMWI 16 | 9d/XeyP1cyRzfZjYXZlUiu656aenmTMFdo1eq+qqZc3nROJAZhkKsXtZDstMhqK5 17 | Zt8JRgJgyTkpbgi7928kZQ3lZTGYRde+E+28n9PiQ/DwhQNprhAdlkxHYLbyIOMS 18 | TIgiYVyFp/hPa2+TIiRj8R0hTxjVB+eeP3c3MG7XAoGAZ4jdIUsYV0Srp85bNiU3 19 | 36ye1p4UN3DXyzdXEqYC3FcdEOTi7Z8wowH38N8ht+NAdDg4vHojm5actQNH/p6t 20 | 9XBuXlrKnb1OA1cxZxBJU0t22Bd1II5yMvaFrq6PgaZ99a+c3dNRj5WeKE1yE78d 21 | wqtWZkaPlJ1DvMHvsbmfVAkCgYBaiA7H7CYPsOp4hkXFy0P6hfi2uJYRtIpOC+7t 22 | k7oO7tGBsMCtQP6J85CsC4DwQ75gzcAL8GAZruoKPKalciBQ28lEuSHLvRIlcoQH 23 | rS9DGKwyvMj5a3HPCQBdLUlyDz5rU3On4LnmgYseDqgalsBLFVLoGt/5bYkV6j0E 24 | ElMsOQKBgHpJ3GIHB0GBW3knMU7UQypMB3C7845YPRMvXexJuINSar0Uw50HgyYL 25 | gPx+lWV23pMO2dksb+BcrbdrlFAlrOZuPg7rzLKP2J/wEQiM2OqdjOXDZ+C1D6yn 26 | 0tPFFneXGUe31tgrMuSWrUoHU6zpurAfYZuE0TJ0+7ud3Nm0q++P 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /tls/host.csr: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE REQUEST----- 2 | MIIC2zCCAcMCAQAwaTELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWEx 3 | FjAUBgNVBAcTDVNhbiBGcmFuY2lzY28xHzAdBgNVBAoTFkludGVybmV0IFdpZGdl 4 | dHMsIEluYy4xDDAKBgNVBAsTA1dXVzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCC 5 | AQoCggEBAK6m6qu6vURRAfKj/rGONayTeUvLN0iXrK53grqNn6WyUJ76mfnfCUYW 6 | +PD6TUINcVDcwGYtshjYIw/Kr5KtvU/20JqsQr3e7HXWbebSD3y+pxjXmhlw6G4i 7 | F4JNzrg8BeiyDW3aWk4udLN5sYmRKRVmm6/ddOmWKr6FppYlnoGDijyn/yJ9QXPw 8 | 76HhuZ3NsTk91Y8wM9Q+D43aBJ9jffTA8MxJE1bOBK3xvm6bCbWiIDXiXzNFfUxI 9 | kOPf540MoAaJnUKHcPcMundSWmhBuk9LMUrHEz3orLIEnFOuibCXcwYsm34Y7CRV 10 | 3HjFseds+wTp6Wk+tR62OQFSxZgmbKMCAwEAAaAtMCsGCSqGSIb3DQEJDjEeMBww 11 | GgYDVR0RBBMwEYIJbG9jYWxob3N0hwR/AAABMA0GCSqGSIb3DQEBCwUAA4IBAQCS 12 | fUoC/KrOQDiYEk+z/RXGDzDDZIQu02d9hJVhu8+UIgCUInwfgi4L43/cIJWdrdVp 13 | NmJ+EK+0tHHvNYbaNHeQggYVz/b+geyb7rH8dv+6VFRzfvidHJk9l7I1wqKn4CMQ 14 | BrfQMfeqbFJrOoJjEoKkNB5SgW8SGjTl2DRFgmmYFeb/Y3YBWa5sf0/otmVNpk2O 15 | G7Rw7aXrJPNloX+4tPoIuHM6A3u1h3NKhCmkuSESBd3/VOQP3wBdQCEaVsNGSyQr 16 | +/Rx3HQmJiVEBo+tjaKI9mx5IzMm5khH7jdKBmuOIZVTzhFHZoE82Tl01HlXSN7J 17 | ZIRHwKYJ3E9NEWMgJ7SE 18 | -----END CERTIFICATE REQUEST----- 19 | -------------------------------------------------------------------------------- /tls/host.pem: 
-------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIID3DCCAsSgAwIBAgIUD5T+wDbRhV3OEwkguwxlfTXVTTMwDQYJKoZIhvcNAQEL 3 | BQAwaTELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWExFjAUBgNVBAcT 4 | DVNhbiBGcmFuY2lzY28xHzAdBgNVBAoTFkludGVybmV0IFdpZGdldHMsIEluYy4x 5 | DDAKBgNVBAsTA1dXVzAgFw0xOTAxMDUxOTM5MDBaGA8yMjQ3MDMwNTAzMzkwMFow 6 | aTELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWExFjAUBgNVBAcTDVNh 7 | biBGcmFuY2lzY28xHzAdBgNVBAoTFkludGVybmV0IFdpZGdldHMsIEluYy4xDDAK 8 | BgNVBAsTA1dXVzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAK6m6qu6 9 | vURRAfKj/rGONayTeUvLN0iXrK53grqNn6WyUJ76mfnfCUYW+PD6TUINcVDcwGYt 10 | shjYIw/Kr5KtvU/20JqsQr3e7HXWbebSD3y+pxjXmhlw6G4iF4JNzrg8BeiyDW3a 11 | Wk4udLN5sYmRKRVmm6/ddOmWKr6FppYlnoGDijyn/yJ9QXPw76HhuZ3NsTk91Y8w 12 | M9Q+D43aBJ9jffTA8MxJE1bOBK3xvm6bCbWiIDXiXzNFfUxIkOPf540MoAaJnUKH 13 | cPcMundSWmhBuk9LMUrHEz3orLIEnFOuibCXcwYsm34Y7CRV3HjFseds+wTp6Wk+ 14 | tR62OQFSxZgmbKMCAwEAAaN6MHgwDgYDVR0PAQH/BAQDAgWgMB0GA1UdJQQWMBQG 15 | CCsGAQUFBwMBBggrBgEFBQcDAjAMBgNVHRMBAf8EAjAAMB0GA1UdDgQWBBQIYEWF 16 | Sah+UG/boeaUl2LQs/qSLTAaBgNVHREEEzARgglsb2NhbGhvc3SHBH8AAAEwDQYJ 17 | KoZIhvcNAQELBQADggEBADgI4NmlGugR8n3qgjQtmKZI97vtFlcKjLn2cx2K7XOr 18 | LOHMRXpxfpQH7X5VJE3sppPPenCYg7x23S5Cs4Lwy6PTgBZQEZwr5sEFpVcuhEbW 19 | jrF7IK43kXZuZz0qInEFvC2MW6jS6crYn0g8401mrmD24Zkg211HeGOhgDIRa9pZ 20 | lAD4fEynoXUmmEF6wrhMWOpZBY7VAecTXko+j96IyfrVW+1kl7mstCmgcIUQHEFl 21 | 1XnxbEp1HwQLdkA32KgDP07pAIeDMhFQjSZ7Fcqr8y96KNbuL3tmS+EAQ+JbTCLG 22 | F88HXUhSFfop+++aPuNdo/QdkHED9zJ0j8oVueBWPMg= 23 | -----END CERTIFICATE----- 24 | -----BEGIN RSA PRIVATE KEY----- 25 | MIIEowIBAAKCAQEArqbqq7q9RFEB8qP+sY41rJN5S8s3SJesrneCuo2fpbJQnvqZ 26 | +d8JRhb48PpNQg1xUNzAZi2yGNgjD8qvkq29T/bQmqxCvd7sddZt5tIPfL6nGNea 27 | GXDobiIXgk3OuDwF6LINbdpaTi50s3mxiZEpFWabr9106ZYqvoWmliWegYOKPKf/ 28 | In1Bc/DvoeG5nc2xOT3VjzAz1D4PjdoEn2N99MDwzEkTVs4ErfG+bpsJtaIgNeJf 29 | M0V9TEiQ49/njQygBomdQodw9wy6d1JaaEG6T0sxSscTPeissgScU66JsJdzBiyb 30 | fhjsJFXceMWx52z7BOnpaT61HrY5AVLFmCZsowIDAQABAoIBAQCdsI+JZh2lJQ8B 31 | KRi475F+s9h8morbvDZpf/ZEwcI3NPydzd5gQQR6iBtUWGqRDNoXDHmax/9ZdyyR 32 | AYvsf3nCTTKjtIy+KhyNeIHaOidJlkoAoAm+lrcFWTqop9/RcEBVjQ2a9d87X652 33 | rZWig7H4ZKCE9QquLKuDQeDbCDRI8iCSUwHw69Os0o0xjVjwIEUfrrTAqdCW68db 34 | JxU+2qbJp6QLnQRA2tg6EEXTRuu1YydDQpVk6LWSQl4aDJvt+NmUdLQ9641oq9+U 35 | vEJStGRI/7dUvkIXHWmSjCWYUPufVFMnpiGCnVdOx+pT+7kyX4qkFkd+NRZCCRv7 36 | pjaaPYnhAoGBAOJYImG6o2ZUf6dpC7Qaju0ht/zUwzpnHhxyUEFJMn01E7n98vBS 37 | gwQjacuyEpuiRbZ2eCzXIVYUvJXRjZ6r49ICUjV3wCDcFJGgotovG5ROLDJ4uePZ 38 | j6pExC6PWOYkYBbMDMDcHINMPUMrSCt1q+IWLYwBnw0s3rrcU081G7MVAoGBAMWI 39 | 9d/XeyP1cyRzfZjYXZlUiu656aenmTMFdo1eq+qqZc3nROJAZhkKsXtZDstMhqK5 40 | Zt8JRgJgyTkpbgi7928kZQ3lZTGYRde+E+28n9PiQ/DwhQNprhAdlkxHYLbyIOMS 41 | TIgiYVyFp/hPa2+TIiRj8R0hTxjVB+eeP3c3MG7XAoGAZ4jdIUsYV0Srp85bNiU3 42 | 36ye1p4UN3DXyzdXEqYC3FcdEOTi7Z8wowH38N8ht+NAdDg4vHojm5actQNH/p6t 43 | 9XBuXlrKnb1OA1cxZxBJU0t22Bd1II5yMvaFrq6PgaZ99a+c3dNRj5WeKE1yE78d 44 | wqtWZkaPlJ1DvMHvsbmfVAkCgYBaiA7H7CYPsOp4hkXFy0P6hfi2uJYRtIpOC+7t 45 | k7oO7tGBsMCtQP6J85CsC4DwQ75gzcAL8GAZruoKPKalciBQ28lEuSHLvRIlcoQH 46 | rS9DGKwyvMj5a3HPCQBdLUlyDz5rU3On4LnmgYseDqgalsBLFVLoGt/5bYkV6j0E 47 | ElMsOQKBgHpJ3GIHB0GBW3knMU7UQypMB3C7845YPRMvXexJuINSar0Uw50HgyYL 48 | gPx+lWV23pMO2dksb+BcrbdrlFAlrOZuPg7rzLKP2J/wEQiM2OqdjOXDZ+C1D6yn 49 | 0tPFFneXGUe31tgrMuSWrUoHU6zpurAfYZuE0TJ0+7ud3Nm0q++P 50 | -----END RSA PRIVATE KEY----- 51 | -------------------------------------------------------------------------------- /util/__init__.py: 
-------------------------------------------------------------------------------- 1 | __author__ = 'tobias' 2 | -------------------------------------------------------------------------------- /util/memory_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Rough script used to measure insert and query performance + memory usage. 3 | 4 | For memory usage estimates, ps_mem 5 | (http://github.com/pixelb/scripts/commits/master/scripts/ps_mem.py) was used. 6 | 7 | Results 8 | 9 | Sizes: 1000, 5000, 10000, 20000, 50000, 100000, 200000, 400000 rows 10 | Cache size: 1 Gb 11 | Insert only, 1 Gb cache configured => 1.2 Gb used 12 | Insert followed by 0 - 5 queries against some of the latest 40 inserted datasets 13 | * 2.2 - 2.5 Gb used 14 | * Query response time 7 - 55 ms observed 15 | * Cache eviction 1 - 15 ms, datasets in cache 60 - 100, dropped 1 - 11 at a time 16 | 17 | Sizes: 1000, 5000, 10000, 20000, 50000, 100000, 200000, 400000 rows 18 | Cache size: 1 Gb 19 | gc.collect() after every query 20 | Insert followed by 0 - 5 queries against some of the latest 40 inserted datasets 21 | * 1.2 - 1.3 Gb used 22 | * Query response time 22 - 65 ms observed 23 | * Cache eviction 1 - 15 ms, datasets in cache 60 - 100, dropped 1 - 11 at a time 24 | 25 | Sizes: 1000, 5000, 10000, 20000, 50000, 100000, 200000, 400000 rows 26 | Cache size: 1 Gb 27 | gc.collect() after every 10th insert 28 | Insert followed by 0 - 5 queries against some of the latest 40 inserted datasets 29 | * 1.2 - 1.3 Gb used 30 | * Insert times 600 ms - 850 ms observed 31 | * Query response time 7 - 70 ms observed 32 | * Insert times 90 ms - 1150 ms observed 33 | * Cache eviction 1 - 15 ms, datasets in cache 60 - 100, dropped 1 - 13 at a time 34 | 35 | Sizes: 1000, 5000, 10000, 15000, 20000, 30000, 40000, 50000 rows 36 | Cache size: 1 Gb 37 | gc.collect() after every 10th insert 38 | Insert followed by 0 - 5 queries against some of the latest 40 inserted datasets 39 | * 1.2 - 1.3 Gb used 40 | * Query response time 7 - 70 ms observed 41 | * Insert times 600 ms - 850 ms observed 42 | * Cache eviction 1 - 15 ms, datasets in cache 400 - 430, dropped 1 - 13 at a time 43 | Performance is quite similar to the other runs with larger datasets.
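The script below assumes a QCache instance listening on http://localhost:9401
(the URL used in main()) and runs an endless insert/query loop; stop it with
Ctrl-C once enough samples have been collected.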
44 | """ 45 | 46 | from StringIO import StringIO 47 | import csv 48 | import json 49 | import random 50 | import string 51 | import requests 52 | import time 53 | 54 | example_data_row = { 55 | 'text1': '123abc123', 'text2': 'asdfghjkl', 'some_text': 'aaaaaaaaaaaaaaa', 'a_status': 'b', 56 | 'some_number': 1234567, 'a_float': 1234.1234, 57 | 'a_class': 'qwertyuuer', 'some_label': '1234yzx', 'another_label': '1234yzx', 58 | 'classifier': 'long_classifier', 'another_class': '1', 'float1': 98765432.123, 59 | 'float2': 12345568.9876, 'description': 'a/b/c'} 60 | 61 | 62 | SELECTION = ['aaaaaaaaaaaaaaaaaaa', 63 | 'bbbbbbbbbbbbbbbbbbb', 64 | 'ccccccccccccccccccc', 65 | 'ddddddddddddddddddd', 66 | 'eeeeeeeeeeeeeeeeeee', 67 | 'fffffffffffffffffff', 68 | 'ggggggggggggggggggg', 69 | 'hhhhhhhhhhhhhhhhhhh', 70 | 'iiiiiiiiiiiiiiiiiii', 71 | 'jjjjjjjjjjjjjjjjjjj'] 72 | 73 | SOME_NUMBER = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 22, 33, 44, 55, 66, 77, 88, 99, 100, 74 | 51, 52, 53, 54, 455, 56, 57, 58, 59, 50, 511, 522, 533, 544, 555, 566, 577, 588, 599, 5100] 75 | 76 | def example_data(length): 77 | out = StringIO() 78 | writer = csv.DictWriter(out, example_data_row.keys()) 79 | writer.writeheader() 80 | for i in range(length): 81 | example_data_row['text1'] = random.choice(SELECTION) 82 | example_data_row['classifier'] = random.choice(SELECTION) 83 | example_data_row['some_number'] = random.choice(SOME_NUMBER) 84 | writer.writerow(example_data_row) 85 | 86 | return out.getvalue() 87 | 88 | 89 | def main(): 90 | print "Building datasets" 91 | datasets = [example_data(l) for l in (1000, 5000, 10000, 20000, 50000, 100000, 200000, 400000)] 92 | 93 | latest_datasets = [] 94 | while True: 95 | ds = random.choice(datasets) 96 | key = ''.join(random.choice(string.ascii_uppercase) for _ in range(6)) 97 | t0 = time.time() 98 | response = requests.post("http://localhost:9401/qcache/dataset/{key}".format(key=key), 99 | headers={'Content-type': 'text/csv'}, data=ds) 100 | print "Posted {key}={size}, response={response}, duration={duration}".format( 101 | key=key, size=len(ds), response=response.status_code, duration=time.time()-t0) 102 | 103 | # Keep the last 40 inserted 104 | latest_datasets.append(key) 105 | latest_datasets = latest_datasets[-40:] 106 | 107 | for _ in range(random.randint(0, 5)): 108 | query = dict(select=['text1', 'text2', 'a_status', 'some_number'], 109 | distinct=['text1', 'text2', 'a_status', 'some_number'], 110 | where=['==', 'classifier', "'{}'".format(random.choice(SELECTION))], 111 | limit=50) 112 | params = {'q': json.dumps(query)} 113 | 114 | ds_key = random.choice(latest_datasets) 115 | 116 | t0 = time.time() 117 | response = requests.get("http://localhost:9401/qcache/dataset/{key}".format(key=ds_key), 118 | params=params, headers={'Accept': 'application/json'}) 119 | 120 | if response.status_code == 200: 121 | print "Success length: {length}, duration: {duration}".format( 122 | status=response.status_code, length=len(json.loads(response.content)), 123 | duration=time.time()-t0) 124 | else: 125 | print "Response status: {status}, content: {content}, duration: {duration}".format( 126 | status=response.status_code, content=response.content, duration=time.time()-t0) 127 | 128 | time.sleep(0.5) 129 | 130 | if __name__ == '__main__': 131 | main() --------------------------------------------------------------------------------