├── .github └── workflows │ └── ci.yaml ├── .gitignore ├── .gitmodules ├── DEV_NOTES.md ├── LICENSE ├── README.md ├── pyproject.toml ├── questdb_query ├── __init__.py ├── asynchronous.py ├── endpoint.py ├── errors.py ├── pandas_util.py ├── stats.py ├── synchronous.py └── tool.py ├── test └── tests ├── mock_server.py ├── tests.py └── trips.csv /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Python CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | python-version: ['3.9', '3.10', '3.11', '3.12'] 15 | os: [ubuntu-latest, macos-latest, windows-latest] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | with: 20 | submodules: true 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Set up Java 11 28 | uses: actions/setup-java@v3 29 | with: 30 | java-version: '11' 31 | distribution: 'temurin' 32 | 33 | - name: Install pipx 34 | run: python -m pip install pipx 35 | 36 | - name: Ensure pipx uses the right Python 37 | run: python -m pipx ensurepath 38 | 39 | - name: Install Poetry with pipx 40 | run: pipx install poetry==1.8.2 41 | 42 | - name: Configure Poetry 43 | run: poetry config virtualenvs.create false 44 | 45 | - name: Install dependencies 46 | run: poetry install --no-root 47 | 48 | - name: Run tests 49 | run: poetry run python -m unittest discover tests -v 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | .idea/ 161 | 162 | # poetry 163 | .python-version 164 | poetry.lock -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "c-questdb-client"] 2 | path = c-questdb-client 3 | url = https://github.com/questdb/c-questdb-client 4 | -------------------------------------------------------------------------------- /DEV_NOTES.md: -------------------------------------------------------------------------------- 1 | # Developer's Notes 2 | 3 | ## Cloning and Running Tests 4 | 5 | ```shell 6 | git clone https://github.com/questdb/py-questdb-query.git 7 | cd py-questdb-query 8 | git submodule update --init 9 | poetry install 10 | ./test 11 | ``` 12 | 13 | The tests will automatically download and start a QuestDB instance, but you 14 | also need to have a Java 11 runtime installed. 
15 | 16 | ## Updating the dependencies 17 | 18 | Tweak the `pyproject.toml` file (if needed) and then run: 19 | 20 | ``` 21 | poetry update 22 | ``` 23 | 24 | ## Running a single test 25 | 26 | ```shell 27 | poetry run python -m unittest tests.tests.TestModule.test_basic_aut 28 | ``` 29 | 30 | ## Cutting a release 31 | 32 | ```shell 33 | poetry export --output requirements.txt 34 | ``` 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # py-questdb-query 2 | This library allows you to perform fast queries over HTTP(S)/CSV for QuestDB, a high-performance time-series database. 3 | 4 | Query results are obtained as either Pandas dataframes or dicts of numpy arrays. 
5 | 
6 | ## Installation
7 | 
8 | The library can be installed using the following command:
9 | 
10 | ```shell
11 | python3 -m pip install -U git+https://github.com/questdb/py-questdb-query.git#questdb_query
12 | ```
13 | 
14 | To uninstall the library, you can use the command:
15 | 
16 | ```shell
17 | python3 -m pip uninstall questdb_query
18 | ```
19 | 
20 | ## Basic Usage, querying into Pandas
21 | 
22 | Once installed, you can use the library to query a QuestDB database. Here's an example that demonstrates how to query
23 | CPU utilization data using the library against a database running on `localhost` on the default HTTP port (9000).
24 | 
25 | ```python
26 | from questdb_query import pandas_query
27 | 
28 | df = pandas_query('select * from cpu limit 1000')
29 | ```
30 | 
31 | This allows you, for example, to pre-aggregate results:
32 | 
33 | ```python
34 | >>> df = df[['region', 'usage_user', 'usage_nice']].groupby('region').mean()
35 | >>> df
36 |                 usage_user  usage_nice
37 | region
38 | ap-northeast-1    8.163766    6.492334
39 | ap-southeast-1    6.511215    7.341863
40 | ap-southeast-2    6.788770    6.257839
41 | eu-central-1      7.392642    6.416479
42 | eu-west-1         7.213417    7.185956
43 | sa-east-1         7.143568    5.925026
44 | us-east-1         7.620643    7.243553
45 | us-west-1         6.286770    6.531977
46 | us-west-2         6.228692    6.439672
47 | ```
48 | 
49 | You can then switch over to numpy with a simple and fast conversion:
50 | 
51 | ```python
52 | >>> from questdb_query import pandas_to_numpy
53 | >>> np_arrs = pandas_to_numpy(df)
54 | >>> np_arrs
55 | {'usage_user': array([8.16376556, 6.51121543, 6.78876964, 7.3926419 , 7.21341716,
56 |        7.14356839, 7.62064304, 6.28677006, 6.22869169]), 'usage_nice': array([6.49233392, 7.34186348, 6.25783903, 6.41647863, 7.18595643,
57 |        5.92502642, 7.24355328, 6.53197733, 6.43967247]), 'region': array(['ap-northeast-1', 'ap-southeast-1', 'ap-southeast-2',
58 |        'eu-central-1', 'eu-west-1', 'sa-east-1', 'us-east-1', 'us-west-1',
59 |        'us-west-2'], dtype=object)}
60 | ```
61 | 
62 | ## Querying a remote database
63 | 
64 | If your database is running on a remote host, specify an endpoint:
65 | 
66 | ```python
67 | from questdb_query import numpy_query, Endpoint
68 | 
69 | endpoint = Endpoint(host='your.hostname.com', port=22453, https=True, username='user', password='pass')
70 | 
71 | np_arrs = numpy_query('select * from cpu limit 10', endpoint)
72 | ```
73 | 
74 | Note how the example above enables HTTPS and specifies a username and password for authentication.
75 | 
76 | The port is optional and defaults to 9000 for HTTP and 443 for HTTPS.
77 | 
78 | Alternatively, if the server is set up with token-based authentication, you can use the `token` parameter:
79 | 
80 | ```python
81 | endpoint = Endpoint(host='your.hostname.com', https=True, token='your_token')
82 | ```
83 | 
84 | ## Chunks: Query Parallelism
85 | 
86 | You can sometimes improve performance by splitting up a large query into smaller ones, running them in parallel,
87 | and joining the results together. This is especially useful if you have multiple CPUs available.
88 | 
89 | The `numpy_query` function can do this automatically for you if you specify the `chunks` parameter.
90 | 
91 | The example below splits the query into 6 parallel chunks:
92 | 
93 | ```python
94 | from questdb_query import numpy_query
95 | 
96 | np_arrs = numpy_query('select * from cpu', chunks=6)
97 | ```
98 | 
99 | The speed-up of splitting up a query into smaller ones is highly query-dependent and we recommend you experiment and
100 | benchmark, for example as sketched below.
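
A quick way to compare chunk counts is to loop over a few values and read each result's `query_stats` (a sketch; it assumes the `cpu` table from the earlier examples and a local endpoint):

```python
from questdb_query import pandas_query

# Hypothetical comparison: re-run the same query with different chunk counts.
for chunks in (1, 2, 4, 8):
    df = pandas_query('select * from cpu', chunks=chunks)
    print(f'{chunks} chunk(s): {df.query_stats.throughput_mlps:.3f} million lines/s')
```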
Mostly due to Python library limitations, not all parts of the query can be parallelized, so whilst you may
101 | see great benefits in going from 1 chunk (the default) to 8, the improvement going from 8 to 16 might be marginal.
102 | 
103 | _Benchmarking is covered in more detail later in this README._
104 | 
105 | > :warning: Setting `chunks > 1` parallelizes queries. If the table(s) queried contain fast-moving data, the
106 | > results may be inconsistent as each chunk's query would be started at slightly different times.
107 | >
108 | > To avoid consistency issues, formulate the query so that it only touches data that is not changing.
109 | > You can do this, for example, by specifying a `timestamp` range in the `WHERE` clause.
110 | 
111 | ## Querying into Numpy
112 | 
113 | You can also query directly into a dictionary of Numpy arrays.
114 | 
115 | Notice that Numpy's datatypes are more limited than those of Pandas, specifically in the
116 | handling of null values.
117 | 
118 | This is a simple shorthand for querying into Pandas and then converting to Numpy:
119 | 
120 | ```python
121 | def numpy_query(query: str, endpoint: Endpoint = None,
122 |         chunks: int = 1, timeout: int = None) -> dict[str, np.array]:
123 |     df = pandas_query(query, endpoint, chunks, timeout)
124 |     return pandas_to_numpy(df)
125 | ```
126 | 
127 | To use it, pass the query string to the `numpy_query` function, along with the
128 | same optional parameters as the `pandas_query` function.
129 | 
130 | ```python
131 | from questdb_query import numpy_query
132 | 
133 | np_arrs = numpy_query('''
134 |     select
135 |         timestamp, hostname, datacenter, usage_user, usage_nice
136 |     from
137 |         cpu
138 |     limit 10''')
139 | ```
140 | 
141 | The `np_arrs` object is a Python `dict` which holds a numpy array per column, keyed by column name:
142 | ```python
143 | >>> np_arrs
144 | {'timestamp': array(['2016-01-01T00:00:00.000000000', '2016-01-01T00:00:10.000000000',
145 |        '2016-01-01T00:00:20.000000000', '2016-01-01T00:00:30.000000000',
146 |        '2016-01-01T00:00:40.000000000', '2016-01-01T00:00:50.000000000',
147 |        '2016-01-01T00:01:00.000000000', '2016-01-01T00:01:10.000000000',
148 |        '2016-01-01T00:01:20.000000000', '2016-01-01T00:01:30.000000000'],
149 |       dtype='datetime64[ns]'), 'hostname': array(['host_0', 'host_1', 'host_2', 'host_3', 'host_4', 'host_5',
150 |        'host_6', 'host_7', 'host_8', 'host_9'], dtype=object), 'datacenter': array(['ap-southeast-2b', 'eu-west-1b', 'us-west-1b', 'us-west-2c',
151 |        'us-west-2b', 'eu-west-1b', 'eu-west-1b', 'us-west-1a',
152 |        'ap-southeast-2a', 'us-east-1a'], dtype=object), 'usage_user': array([1.39169048, 0.33846369, 0.        , 1.81511203, 0.84273104,
153 |        0.        , 0.        , 0.28085548, 0.        , 1.37192634]), 'usage_nice': array([0.30603088, 1.21496673, 0.        , 0.16688796, 0.        ,
154 |        2.77319521, 0.40332488, 1.81585253, 1.92844804, 2.12841919])}
155 | ```
156 | 
157 | If we wanted to calculate a (rather nonsensical) weighted average of `usage_user` and `usage_nice`, we can
158 | do this by accessing the `numpy` columns:
159 | 
160 | ```python
161 | >>> np_arrs['usage_user'].dot(np_arrs['usage_nice'].T)
162 | 4.5700692045031985
163 | ```
164 | 
165 | ## Benchmarking
166 | 
167 | ### From code
168 | 
169 | Each query result also carries a `Stats` object with a performance summary, which you can print:
170 | 171 | ```python 172 | >>> from questdb_query import pandas_query 173 | >>> df = pandas_query('select * from cpu', chunks=8) 174 | >>> print(df.query_stats) 175 | Duration: 2.631s 176 | Millions of lines: 5.000 177 | Millions of lines/s: 1.901 178 | MiB: 1332.144 179 | MiB/s: 506.381 180 | ``` 181 | 182 | You can also extract individual fields: 183 | 184 | ```python 185 | >>> df.query_stats 186 | Stats(duration_s=2.630711865, line_count=5000000, byte_count=1396853875, throughput_mbs=506.3814407360216, throughput_mlps=1.900626239810569) 187 | >>> df.query_stats.throughput_mlps 188 | 1.900626239810569 189 | ``` 190 | 191 | ### From the command line 192 | 193 | To get the best performance it may be useful to try queries with different hardware setups, chunk counts etc. 194 | 195 | You can run the benchmarking tool from the command line: 196 | 197 | ```bash 198 | $ python3 -m questdb_query.tool --chunks 8 "select * from cpu" 199 | ``` 200 | ``` 201 | hostname region datacenter rack os arch team service service_version service_environment usage_user usage_system usage_idle usage_nice usage_iowait usage_irq usage_softirq usage_steal usage_guest usage_guest_nice timestamp 202 | 0 host_0 ap-southeast-2 ap-southeast-2b 96 Ubuntu16.10 x86 CHI 11 0 test 1.391690 0.000000 2.644812 0.306031 1.194629 0.000000 0.000000 0.726996 0.000000 0.000000 2016-01-01 00:00:00 203 | 1 host_1 eu-west-1 eu-west-1b 52 Ubuntu16.04LTS x64 NYC 7 0 production 0.338464 1.951409 2.455378 1.214967 2.037935 0.000000 1.136997 1.022753 1.711183 0.000000 2016-01-01 00:00:10 204 | 2 host_2 us-west-1 us-west-1b 69 Ubuntu16.04LTS x64 LON 8 1 production 0.000000 2.800873 2.296324 0.000000 1.754139 1.531160 0.662572 0.000000 0.472402 0.312164 2016-01-01 00:00:20 205 | 3 host_3 us-west-2 us-west-2c 8 Ubuntu16.04LTS x86 LON 11 0 test 1.815112 4.412385 2.056344 0.166888 3.507148 3.276577 0.000000 0.000000 0.000000 1.496152 2016-01-01 00:00:30 206 | 4 host_4 us-west-2 us-west-2b 83 Ubuntu16.04LTS x64 NYC 6 0 test 0.842731 3.141248 2.199520 0.000000 2.943054 5.032342 0.391105 1.375450 0.000000 1.236811 2016-01-01 00:00:40 207 | ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
208 | 624995 host_3995 ap-southeast-2 ap-southeast-2a 30 Ubuntu16.04LTS x86 CHI 19 1 staging 33.238309 82.647341 17.272531 52.707720 71.718564 45.605728 100.000000 22.907723 78.130846 15.652954 2017-08-01 16:52:30 209 | 624996 host_3996 us-west-2 us-west-2a 67 Ubuntu15.10 x64 CHI 9 0 production 33.344070 81.922739 16.653731 52.107537 71.844945 45.880606 99.835977 23.045458 76.468930 17.091646 2017-08-01 16:52:40 210 | 624997 host_3997 us-west-2 us-west-2b 63 Ubuntu15.10 x86 SF 8 0 production 32.932095 80.662915 14.708377 53.354277 72.265215 44.803275 99.013038 20.375169 78.043473 17.870002 2017-08-01 16:52:50 211 | 624998 host_3998 eu-west-1 eu-west-1b 53 Ubuntu16.04LTS x86 CHI 11 1 staging 31.199818 80.994859 15.051577 51.923123 74.169828 46.453950 99.107213 21.004499 78.341154 18.880808 2017-08-01 16:53:00 212 | 624999 host_3999 us-east-1 us-east-1c 87 Ubuntu16.10 x64 SF 8 1 production 30.310735 81.727637 15.413537 51.417897 74.973555 44.882255 98.821672 19.055040 78.094993 19.263652 2017-08-01 16:53:10 213 | 214 | [5000000 rows x 21 columns] 215 | 216 | Duration: 2.547s 217 | Millions of lines: 5.000 218 | Millions of lines/s: 1.963 219 | MiB: 1332.144 220 | MiB/s: 522.962 221 | ``` 222 | 223 | These are the complete command line arguments: 224 | 225 | ```bash 226 | $ python3 -m questdb_query.tool --help 227 | ``` 228 | ``` 229 | usage: tool.py [-h] [--host HOST] [--port PORT] [--https] [--username USERNAME] [--password PASSWORD] [--chunks CHUNKS] query 230 | 231 | positional arguments: 232 | query 233 | 234 | optional arguments: 235 | -h, --help show this help message and exit 236 | --host HOST 237 | --port PORT 238 | --https 239 | --username USERNAME 240 | --password PASSWORD 241 | --chunks CHUNKS 242 | ``` 243 | 244 | 245 | ## Async operation 246 | 247 | The `numpy_query` and `pandas_query` functions are actually wrappers around `async` variants. 248 | 249 | If your application is already using `async`, then call those directly as it allows other parts of your application to 250 | perform work in parallel during the data download. 251 | 252 | The functions take identical arguments as their synchronous counterparts. 
253 | 
254 | ```python
255 | import asyncio
256 | from questdb_query import Endpoint
257 | from questdb_query.asynchronous import numpy_query
258 | 
259 | 
260 | async def main():
261 |     endpoint = Endpoint(host='your.hostname.com', https=True, username='user', password='pass')
262 |     np_arrs = await numpy_query('select * from cpu limit 10', endpoint)
263 |     print(np_arrs)
264 | 
265 | 
266 | if __name__ == '__main__':
267 |     asyncio.run(main())
268 | 
269 | ```
270 | 
-------------------------------------------------------------------------------- /pyproject.toml: --------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "questdb-query"
3 | version = "0.1.0"
4 | description = "Fast query over HTTP(S)/CSV for QuestDB"
5 | readme = "README.md"
6 | packages = [{include = "questdb_query"}]
7 | authors = ["Adam Cimarosti "]
8 | license = "Apache License 2.0"
9 | repository = "https://github.com/questdb/py-questdb-query/"
10 | 
11 | [tool.poetry.dependencies]
12 | python = "^3.9"
13 | numpy = "^1.26.4"
14 | pandas = "^2.2.2"
15 | pyarrow = "^15.0.2"
16 | aiohttp = {extras = ["speedups"], version = "^3.8.4"}
17 | 
18 | [build-system]
19 | requires = ["poetry-core"]
20 | build-backend = "poetry.core.masonry.api"
-------------------------------------------------------------------------------- /questdb_query/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | Query QuestDB over HTTP into Pandas or Numpy arrays.
3 | 
4 | The primary implementation is in the `asynchronous` module, with a synchronous wrapper in the `synchronous` module.
5 | 
6 | """
7 | 
8 | __version__ = '0.1.0'
9 | 
10 | from .endpoint import Endpoint
11 | from .errors import QueryError
12 | from .synchronous import pandas_query, numpy_query
13 | from .pandas_util import pandas_to_numpy
14 | 
-------------------------------------------------------------------------------- /questdb_query/asynchronous.py: --------------------------------------------------------------------------------
1 | """
2 | Async functions to query QuestDB over HTTP(S) via CSV into Pandas or Numpy.
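
A sketch of typical use (assumes a reachable QuestDB server with a `cpu` table):

    import asyncio
    from questdb_query.asynchronous import pandas_query

    async def main():
        df = await pandas_query('select * from cpu limit 10')
        print(df)

    asyncio.run(main())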
3 | """
4 | 
5 | __all__ = ['pandas_query', 'numpy_query']
6 | 
7 | import asyncio
8 | import time
9 | from concurrent.futures import ThreadPoolExecutor
10 | from io import BytesIO
11 | 
12 | import aiohttp
13 | import numpy as np
14 | import pandas as pd
15 | 
16 | from .endpoint import Endpoint
17 | from .errors import QueryError
18 | from .pandas_util import pandas_to_numpy
19 | from .stats import Stats
20 | 
21 | 
22 | def _new_session(endpoint, timeout: int = None):
23 |     auth = None
24 |     if endpoint.username:
25 |         auth = aiohttp.BasicAuth(endpoint.username, endpoint.password)
26 |     # `ClientTimeout` instances are always truthy, so an `or` fallback would never fire; apply the 300s default explicitly.
27 |     timeout = aiohttp.ClientTimeout(total=timeout if timeout is not None else 300)
28 |     return aiohttp.ClientSession(
29 |         auth=auth,
30 |         read_bufsize=4 * 1024 * 1024,
31 |         timeout=timeout)
32 | 
33 | 
34 | def _auth_headers(endpoint: Endpoint) -> dict[str, str]:
35 |     if endpoint.token:
36 |         return {'Authorization': f'Bearer {endpoint.token}'}
37 |     return None
38 | 
39 | 
40 | async def _pre_query(
41 |         session: aiohttp.ClientSession,
42 |         endpoint: Endpoint,
43 |         query: str
44 | ) -> tuple[list[tuple[str, tuple[str, object]]], int]:
45 |     url = f'{endpoint.url}/exec'
46 |     params = [('query', query), ('count', 'true'), ('limit', '0')]
47 |     dtypes_map = {
48 |         'STRING': ('STRING', 'string'),
49 |         'VARCHAR': ('VARCHAR', 'string'),
50 |         'SYMBOL': ('SYMBOL', 'string'),
51 |         'SHORT': ('SHORT', 'int16'),
52 |         'BYTE': ('BYTE', 'int8'),
53 |         'BOOLEAN': ('BOOLEAN', 'bool'),
54 |         'INT': ('INT', 'Int32'),
55 |         'LONG': ('LONG', 'Int64'),
56 |         'DOUBLE': ('DOUBLE', 'float64'),
57 |         'FLOAT': ('FLOAT', 'float32'),
58 |         'CHAR': ('CHAR', 'string'),
59 |         'TIMESTAMP': ('TIMESTAMP', None),
60 |         'IPV4': ('IPV4', 'string'),
61 |         # GEOHASH(n) columns are handled dynamically in `get_dtype` below.
62 |         'DATE': ('DATE', None),
63 |         'UUID': ('UUID', 'string'),
64 |         'BINARY': ('BINARY', 'string'),
65 |         'LONG256': ('LONG256', 'string'),
66 |     }
67 | 
68 |     def get_dtype(col):
69 |         ty = col['type'].upper()
70 |         if ty.startswith('GEOHASH'):
71 |             return (ty, 'string')
72 |         return dtypes_map[ty]
73 | 
74 |     async with session.get(
75 |             url=url,
76 |             params=params,
77 |             headers=_auth_headers(endpoint)) as resp:
78 |         result = await resp.json()
79 |         if resp.status != 200:
80 |             raise QueryError.from_json(result)
81 |         columns = [
82 |             (col['name'], get_dtype(col))
83 |             for col in result['columns']]
84 |         count = result['count']
85 |         return columns, count
86 | 
87 | 
88 | async def _query_pandas(
89 |         session: aiohttp.ClientSession,
90 |         executor: ThreadPoolExecutor,
91 |         endpoint: Endpoint,
92 |         query: str,
93 |         result_schema: list[tuple[str, tuple[str, object]]],
94 |         limit_range: tuple[int, int]) -> pd.DataFrame:
95 |     url = f'{endpoint.url}/exp'
96 |     params = [
97 |         ('query', query),
98 |         ('limit', f'{limit_range[0]},{limit_range[1]}')]
99 |     async with session.get(
100 |             url=url,
101 |             params=params,
102 |             headers=_auth_headers(endpoint)) as resp:
103 |         if resp.status != 200:
104 |             raise QueryError.from_json(await resp.json())
105 |         buf = await resp.content.read()
106 |         download_bytes = len(buf)
107 |     buf_reader = BytesIO(buf)
108 |     dtypes = {
109 |         col[0]: col[1][1]
110 |         for col in result_schema
111 |         if col[1][1] is not None}
112 | 
113 |     def _read_csv():
114 |         df = pd.read_csv(buf_reader, dtype=dtypes, engine='pyarrow')
115 |         # Patch up the column types.
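        # The CSV payload carries no type metadata, so TIMESTAMP and DATE
        # columns are re-parsed into `datetime64[ns]` below, using the schema
        # obtained from the earlier `/exec` pre-query.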
116 |         for col_schema in result_schema:
117 |             col_name = col_schema[0]
118 |             col_type = col_schema[1][0]
119 |             try:
120 |                 if col_type in ('TIMESTAMP', 'DATE'):
121 |                     series = df[col_name]
122 |                     # if the series is empty (or full of nulls) its csv-read
123 |                     # default dtype (float64) is not one which we can
124 |                     # convert `.to_datetime`,
125 |                     if series.empty or series.isnull().all():
126 |                         # so to work around this we first convert the series
127 |                         # to Int64 (nullable).
128 |                         series = series.astype('Int64')
129 |                         series = pd.to_datetime(series, unit='ns')
130 |                     else:
131 |                         # Drop the UTC timezone during conversion.
132 |                         # This allows `.to_numpy()` on the series to
133 |                         # yield a `datetime64` dtype column.
134 |                         series = pd.to_datetime(series).dt.tz_convert(None)
135 |                     df[col_name] = series
136 |             except Exception as e:
137 |                 raise ValueError(
138 |                     f'Failed to convert column {col_name} '
139 |                     f'to type {col_type}: {e}')
140 |         return df
141 | 
142 |     loop = asyncio.get_running_loop()
143 |     df = await loop.run_in_executor(executor, _read_csv)
144 |     return df, download_bytes
145 | 
146 | 
147 | async def pandas_query(
148 |         query: str,
149 |         endpoint: Endpoint = None,
150 |         chunks: int = 1,
151 |         timeout: int = None) -> pd.DataFrame:
152 |     """
153 |     Query QuestDB via CSV to a Pandas DataFrame.
154 | 
155 |     :param timeout: The timeout in seconds for the query, defaults to None (300 seconds).
156 |     """
157 |     endpoint = endpoint or Endpoint()
158 |     start_ts = time.perf_counter_ns()
159 |     with ThreadPoolExecutor(max_workers=chunks) as executor:
160 |         async with _new_session(endpoint, timeout) as session:
161 |             result_schema, row_count = await _pre_query(session, endpoint, query)
162 |             chunks = max(min(chunks, row_count), 1)
163 |             rows_per_spawn = row_count // chunks
164 |             limit_ranges = [
165 |                 (
166 |                     i * rows_per_spawn,
167 |                     ((i + 1) * rows_per_spawn) if i < chunks - 1 else row_count
168 |                 )
169 |                 for i in range(chunks)]
170 |             tasks = [
171 |                 asyncio.ensure_future(_query_pandas(
172 |                     session, executor, endpoint, query, result_schema, limit_range))
173 |                 for limit_range in limit_ranges]
174 |             results = await asyncio.gather(*tasks)
175 |             sub_dataframes = [result[0] for result in results]
176 |             df = pd.concat(sub_dataframes)
177 |             if chunks > 1:
178 |                 df.reset_index(drop=True, inplace=True)
179 |     end_ts = time.perf_counter_ns()
180 |     total_downloaded = sum(result[1] for result in results)
181 |     df.query_stats = Stats(end_ts - start_ts, row_count, total_downloaded)
182 |     return df
183 | 
184 | 
185 | async def numpy_query(
186 |         query: str,
187 |         endpoint: Endpoint = None,
188 |         chunks: int = 1,
189 |         timeout: int = None
190 | ) -> dict[str, np.array]:
191 |     """
192 |     Query and obtain the result as a dict of columns.
193 |     Each column is a numpy array.
194 | 
195 |     :param timeout: The timeout in seconds for the query, defaults to None (300 seconds).
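
    A sketch of typical use from within a coroutine (assumes a `cpu` table
    on the default local endpoint):

        np_arrs = await numpy_query('select usage_user from cpu limit 10')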
196 | """ 197 | df = await pandas_query(query, endpoint, chunks, timeout) 198 | return pandas_to_numpy(df) 199 | -------------------------------------------------------------------------------- /questdb_query/endpoint.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | class Endpoint: 4 | """ 5 | HTTP connection parameters into QuestDB 6 | """ 7 | def __init__( 8 | self, 9 | host='127.0.0.1', 10 | port=None, 11 | https=False, 12 | username=None, 13 | password=None, 14 | token=None): 15 | self.host = host 16 | self.port = port or (443 if https else 9000) 17 | self.https = https 18 | self.username = username 19 | self.password = password 20 | self.token = token 21 | if ((self.username or self.password) and \ 22 | not (self.username and self.password)): 23 | raise ValueError('Must provide both username and password or neither') 24 | if self.token and self.username: 25 | raise ValueError('Cannot use token with username and password') 26 | if token and not re.match(r'^[A-Za-z0-9-._~+/]+=*$', token): 27 | # https://datatracker.ietf.org/doc/html/rfc6750#section-2.1 28 | raise ValueError("Invalid characters in token") 29 | 30 | @property 31 | def url(self): 32 | protocol = 'https' if self.https else 'http' 33 | return f'{protocol}://{self.host}:{self.port}' 34 | -------------------------------------------------------------------------------- /questdb_query/errors.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | 4 | class QueryError(Exception): 5 | def __init__(self, message: str, query: str, position: Optional[int] = None): 6 | super().__init__(message) 7 | self.query = query 8 | self.position = position 9 | 10 | @classmethod 11 | def from_json(cls, json: dict): 12 | message = json.get('error') 13 | if not message: 14 | message = json.get('message') 15 | return cls( 16 | message=message, 17 | query=json.get('query'), 18 | position=json.get('position')) 19 | 20 | -------------------------------------------------------------------------------- /questdb_query/pandas_util.py: -------------------------------------------------------------------------------- 1 | __all__ = ['pandas_to_numpy'] 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from .stats import StatsDict 7 | 8 | 9 | def pandas_to_numpy(df: pd.DataFrame) -> dict[str, np.array]: 10 | """ 11 | Convert a pandas dataframe into a dict containing numpy arrays, keyed by column name. 12 | 13 | If the index is named, then convert that too. 14 | """ 15 | # Calling `.to_numpy()` for each column is quite efficient and generally avoids copies. 16 | # This is because Pandas internally already usually stores columns as numpy. 17 | np_arrs = {col_name: df[col_name].to_numpy() for col_name in df} 18 | 19 | # If the index is named, then convert that too. 20 | if df.index.name: 21 | np_arrs[df.index.name] = df.index.to_numpy() 22 | 23 | # Carry across stats, if these are present. 
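    # (`query_stats` is attached dynamically by `pandas_query`; DataFrames
    # from other sources won't have it, hence the `hasattr` check.)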
24 |     if hasattr(df, 'query_stats'):
25 |         np_arrs = StatsDict(np_arrs, df.query_stats)
26 | 
27 |     return np_arrs
28 | 
-------------------------------------------------------------------------------- /questdb_query/stats.py: --------------------------------------------------------------------------------
1 | __all__ = ['Stats', 'StatsDict']
2 | 
3 | NS_IN_S = 1e9
4 | 
5 | STATS_TEMPLATE = '''Duration: {duration_s:.3f}s
6 | Millions of lines: {line_count_millions:.3f}
7 | Millions of lines/s: {throughput_mlps:.3f}
8 | MiB: {byte_count_mib:.3f}
9 | MiB/s: {throughput_mbs:.3f}'''
10 | 
11 | 
12 | class Stats:
13 |     def __init__(self, duration_ns: int, line_count: int, byte_count: int):
14 |         self.duration_ns = duration_ns
15 |         self.line_count = line_count
16 |         self.byte_count = byte_count
17 | 
18 |     @property
19 |     def duration_s(self) -> float:
20 |         """
21 |         How long the query took in seconds.
22 |         """
23 |         return self.duration_ns / NS_IN_S
24 | 
25 |     @property
26 |     def throughput_mbs(self) -> float:
27 |         """
28 |         How many MiB/s were downloaded and parsed.
29 |         """
30 |         return self.byte_count / self.duration_ns * NS_IN_S / 1024 / 1024
31 | 
32 |     @property
33 |     def throughput_mlps(self) -> float:
34 |         """
35 |         How many millions of lines per second were parsed.
36 |         """
37 |         return self.line_count / self.duration_ns * NS_IN_S / 1e6
38 | 
39 |     def __repr__(self) -> str:
40 |         return (f'Stats(duration_s={self.duration_s}, '
41 |                 f'line_count={self.line_count}, '
42 |                 f'byte_count={self.byte_count}, '
43 |                 f'throughput_mbs={self.throughput_mbs}, '
44 |                 f'throughput_mlps={self.throughput_mlps})')
45 | 
46 |     def __str__(self):
47 |         return STATS_TEMPLATE.format(
48 |             duration_s=self.duration_s,
49 |             line_count_millions=self.line_count / 1e6,
50 |             throughput_mbs=self.throughput_mbs,
51 |             byte_count_mib=self.byte_count / 1024 / 1024,
52 |             throughput_mlps=self.throughput_mlps)
53 | 
54 | 
55 | class StatsDict(dict):
56 |     """A dict with an additional .query_stats attribute."""
57 | 
58 |     def __init__(self, other: dict, query_stats: Stats):
59 |         super().__init__(other)
60 |         self.query_stats = query_stats
61 | 
-------------------------------------------------------------------------------- /questdb_query/synchronous.py: --------------------------------------------------------------------------------
1 | """
2 | A sync shim around the `asynchronous` module.
3 | """
4 | 
5 | __all__ = ['pandas_query', 'numpy_query']
6 | 
7 | import asyncio
8 | 
9 | import numpy as np
10 | import pandas as pd
11 | 
12 | from . import asynchronous as a
13 | from .endpoint import Endpoint
14 | from .pandas_util import pandas_to_numpy
15 | 
16 | 
17 | def pandas_query(
18 |         query: str,
19 |         endpoint: Endpoint = None,
20 |         chunks: int = 1,
21 |         timeout: int = None
22 | ) -> pd.DataFrame:
23 |     """
24 |     Query QuestDB via CSV to a Pandas DataFrame.
25 | 
26 |     :param timeout: The timeout in seconds for the query, defaults to None (300 seconds).
27 |     """
28 |     try:
29 |         asyncio.get_running_loop()
30 |     except RuntimeError:
31 |         return asyncio.run(a.pandas_query(query, endpoint, chunks, timeout))
32 |     # A running event loop can't be re-entered via `run_until_complete`, so
33 |     # point the caller at the async API instead of failing confusingly.
34 |     raise RuntimeError(
35 |         'pandas_query() called from a running event loop; use questdb_query.asynchronous instead')
36 | 
37 | 
38 | def numpy_query(
39 |         query: str,
40 |         endpoint: Endpoint = None,
41 |         chunks: int = 1,
42 |         timeout: int = None
43 | ) -> dict[str, np.array]:
44 |     """
45 |     Query and obtain the result as a dict of columns.
46 |     Each column is a numpy array.
47 | 48 | :param timeout: The timeout in seconds for the query, defaults to None (300 seconds). 49 | """ 50 | df = pandas_query(query, endpoint, chunks, timeout) 51 | return pandas_to_numpy(df) 52 | -------------------------------------------------------------------------------- /questdb_query/tool.py: -------------------------------------------------------------------------------- 1 | """ 2 | Benchmarking tool 3 | 4 | From the command line, run as:: 5 | 6 | python3 -m questdb_query.tool --help 7 | 8 | """ 9 | 10 | from .endpoint import Endpoint 11 | from .synchronous import pandas_query 12 | 13 | 14 | def _parse_args(): 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--host', type=str, default='localhost') 18 | parser.add_argument('--port', type=int) 19 | parser.add_argument('--https', action='store_true') 20 | parser.add_argument('--username', type=str) 21 | parser.add_argument('--password', type=str) 22 | parser.add_argument('--token', type=str) 23 | parser.add_argument('--chunks', type=int, default=1) 24 | parser.add_argument('query', type=str) 25 | return parser.parse_args() 26 | 27 | 28 | def main(args): 29 | endpoint = Endpoint( 30 | host=args.host, 31 | port=args.port, 32 | https=args.https, 33 | username=args.username, 34 | password=args.password, 35 | token=args.token) 36 | df = pandas_query(args.query, endpoint, args.chunks) 37 | print(df) 38 | print() 39 | print(df.query_stats) 40 | 41 | 42 | if __name__ == "__main__": 43 | args = _parse_args() 44 | main(args) 45 | -------------------------------------------------------------------------------- /test: -------------------------------------------------------------------------------- 1 | poetry run python -m unittest discover tests -v -------------------------------------------------------------------------------- /tests/mock_server.py: -------------------------------------------------------------------------------- 1 | import http.server as hs 2 | import threading 3 | import time 4 | 5 | 6 | class HttpServer: 7 | def __init__(self): 8 | self.requests = [] 9 | self.responses = [] 10 | self.headers = [] 11 | self._ready_event = None 12 | self._stop_event = None 13 | self._http_server = None 14 | self._http_server_thread = None 15 | 16 | def _serve(self): 17 | self._http_server.serve_forever() 18 | self._stop_event.set() 19 | 20 | def __enter__(self): 21 | headers = self.headers 22 | requests = self.requests 23 | responses = self.responses 24 | 25 | class Handler(hs.BaseHTTPRequestHandler): 26 | def do_GET(self): 27 | try: 28 | headers.append({ 29 | key: value 30 | for key, value in self.headers.items()}) 31 | try: 32 | wait_ms, code, content_type, body = responses.pop(0) 33 | except IndexError: 34 | wait_ms, code, content_type, body = 0, 200, None, None 35 | time.sleep(wait_ms / 1000) 36 | self.send_response(code) 37 | if content_type: 38 | self.send_header('Content-Type', content_type) 39 | if body: 40 | self.send_header('Content-Length', len(body)) 41 | self.end_headers() 42 | if body: 43 | self.wfile.write(body) 44 | except BrokenPipeError: 45 | pass # Client disconnected early, no biggie. 
46 | 47 | self._stop_event = threading.Event() 48 | self._http_server = hs.HTTPServer( 49 | ('', 0), 50 | Handler, 51 | bind_and_activate=True) 52 | self._http_server_thread = threading.Thread(target=self._serve) 53 | self._http_server_thread.start() 54 | return self 55 | 56 | def __exit__(self, _ex_type, _ex_value, _ex_tb): 57 | self._http_server.shutdown() 58 | self._http_server.server_close() 59 | self._stop_event.set() 60 | 61 | @property 62 | def port(self): 63 | return self._http_server.server_port 64 | -------------------------------------------------------------------------------- /tests/tests.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | sys.dont_write_bytecode = True 5 | 6 | import os 7 | import unittest 8 | from pathlib import Path 9 | import io 10 | import http.client 11 | 12 | import questdb_query.asynchronous as qdbq_a 13 | import questdb_query.synchronous as qdbq_s 14 | from questdb_query import Endpoint 15 | import pandas as pd 16 | from pandas.testing import assert_frame_equal 17 | 18 | try: 19 | # When running a single test. 20 | from .mock_server import HttpServer 21 | except ImportError: 22 | # When discovered by unittest. 23 | from mock_server import HttpServer 24 | 25 | # Import the code we can use to download and run a test QuestDB instance 26 | sys.path.append(str( 27 | Path(__file__).resolve().parent.parent / 28 | 'c-questdb-client' / 'system_test')) 29 | from fixture import \ 30 | QuestDbFixture, install_questdb, install_questdb_from_repo, AUTH, retry 31 | 32 | 33 | QUESTDB_VERSION = '8.1.4' 34 | QUESTDB_INSTALL_PATH = None 35 | 36 | 37 | def may_install_questdb(): 38 | global QUESTDB_INSTALL_PATH 39 | if QUESTDB_INSTALL_PATH: 40 | return 41 | 42 | install_path = None 43 | if os.environ.get('QDB_REPO_PATH'): 44 | repo = Path(os.environ['QDB_REPO_PATH']) 45 | install_path = install_questdb_from_repo(repo) 46 | else: 47 | url = ('https://github.com/questdb/questdb/releases/download/' + 48 | QUESTDB_VERSION + 49 | '/questdb-' + 50 | QUESTDB_VERSION + 51 | '-no-jre-bin.tar.gz') 52 | install_path = install_questdb(QUESTDB_VERSION, url) 53 | QUESTDB_INSTALL_PATH = install_path 54 | 55 | 56 | def upload_csv(qdb, table, csv_path): 57 | with open(csv_path, 'rb') as file: 58 | file_data = file.read() 59 | 60 | boundary = "2cdcb4a05801c5ab05f174836624949d" 61 | body = io.BytesIO() 62 | body.write(f'--{boundary}\r\n'.encode('utf-8')) 63 | body.write(f'Content-Disposition: form-data; name="data"; filename="{table}"\r\n'.encode('utf-8')) 64 | body.write(b'Content-Type: text/csv\r\n\r\n') 65 | body.write(file_data) 66 | body.write(f'\r\n--{boundary}--\r\n'.encode('utf-8')) 67 | 68 | # Get the byte data from BytesIO 69 | body_bytes = body.getvalue() 70 | 71 | # Prepare headers 72 | headers = { 73 | 'Content-Type': f'multipart/form-data; boundary={boundary}', 74 | 'Content-Length': str(len(body_bytes)) 75 | } 76 | 77 | url = f'/imp?name={table}' 78 | 79 | # Send the HTTP POST request 80 | try: 81 | conn = http.client.HTTPConnection(qdb.host, qdb.http_server_port) 82 | conn.request('POST', url, body_bytes, headers) 83 | response = conn.getresponse() 84 | return response.read().decode() 85 | finally: 86 | conn.close() 87 | 88 | 89 | def load_all_types_table(qdb): 90 | qdb.http_sql_query(''' 91 | CREATE TABLE almost_all_types ( 92 | id int, 93 | active boolean, 94 | ip_address ipv4, 95 | age byte, 96 | temperature short, 97 | grade char, 98 | account_balance float, 99 | currency_symbol symbol, 100 | 
description string, 101 | comment varchar, 102 | record_date date, 103 | event_timestamp timestamp, 104 | revenue double, 105 | user_uuid uuid, 106 | long_number long, 107 | crypto_hash long256 108 | ) timestamp (event_timestamp) PARTITION BY DAY WAL; 109 | ''') 110 | qdb.http_sql_query(''' 111 | INSERT INTO almost_all_types ( 112 | id, 113 | active, 114 | ip_address, 115 | age, 116 | temperature, 117 | grade, 118 | account_balance, 119 | currency_symbol, 120 | description, 121 | comment, 122 | record_date, 123 | event_timestamp, 124 | revenue, 125 | user_uuid, 126 | long_number, 127 | crypto_hash 128 | ) VALUES 129 | -- id active ip_address age temp gra acc_bal curr description comment record_date event_timestamp revenue user_uuid long_number crypto_hash 130 | (1, true, '192.168.1.1', 25, 72, 'A', 1000.5, 'USD', 'Test record 1', 'pink', '2023-01-01T00:00:00.000Z', '2023-01-01T00:00:00.000000Z', 200.00, '123e4567-e89b-12d3-a456-426614174000', 123456789012345, '0x7fffffffffffffffffffffffffffffff'), 131 | (2, false, NULL, 30, 68, 'B', 1500.25, 'EUR', NULL, 'lightgoldenrodyellow', NULL, '2023-01-02T00:00:00.000000Z', 300.00, '123e4567-e89b-12d3-a456-426614174001', 987654321098765, NULL), 132 | (3, NULL, '10.0.0.1', 35, -40, 'C', NULL, 'JPY', 'Test record 3', NULL, '2023-01-03T00:00:00.000Z', '2023-01-03T00:00:00.000000Z', NULL, '123e4567-e89b-12d3-a456-426614174002', NULL, '0x1fffffffffffffffffffffffffffffff'); 133 | ''') 134 | 135 | def load_trips_table(qdb): 136 | qdb.http_sql_query(''' 137 | CREATE TABLE 'trips' ( 138 | cab_type SYMBOL capacity 256 CACHE, 139 | vendor_id SYMBOL capacity 256 CACHE, 140 | pickup_datetime TIMESTAMP, 141 | dropoff_datetime TIMESTAMP, 142 | rate_code_id SYMBOL capacity 256 CACHE, 143 | pickup_latitude DOUBLE, 144 | pickup_longitude DOUBLE, 145 | dropoff_latitude DOUBLE, 146 | dropoff_longitude DOUBLE, 147 | passenger_count INT, 148 | trip_distance DOUBLE, 149 | fare_amount DOUBLE, 150 | extra DOUBLE, 151 | mta_tax DOUBLE, 152 | tip_amount DOUBLE, 153 | tolls_amount DOUBLE, 154 | ehail_fee DOUBLE, 155 | improvement_surcharge DOUBLE, 156 | congestion_surcharge DOUBLE, 157 | total_amount DOUBLE, 158 | payment_type SYMBOL capacity 256 CACHE, 159 | trip_type SYMBOL capacity 256 CACHE, 160 | pickup_location_id INT, 161 | dropoff_location_id INT 162 | ) timestamp (pickup_datetime) PARTITION BY MONTH WAL; 163 | ''') 164 | 165 | trips_csv = Path(__file__).resolve().parent / 'trips.csv' 166 | upload_csv(qdb, 'trips', trips_csv) 167 | 168 | def check_table(): 169 | try: 170 | resp = qdb.http_sql_query('SELECT count() FROM trips') 171 | if not resp.get('dataset'): 172 | return False 173 | if resp['dataset'][0][0] == 10000: 174 | return True 175 | return False 176 | except: 177 | return None 178 | 179 | # Wait until the apply job is done. 
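    # (The 'trips' table is WAL-enabled: freshly imported rows only become
    # visible to queries once the WAL apply job has run, hence the retry.)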
180 | return retry(check_table, timeout_sec=10) 181 | 182 | 183 | class TestModule(unittest.IsolatedAsyncioTestCase): 184 | @classmethod 185 | def setUpClass(cls): 186 | cls.qdb = None 187 | may_install_questdb() 188 | 189 | cls.qdb = QuestDbFixture( 190 | QUESTDB_INSTALL_PATH, auth=False, wrap_tls=True, http=True) 191 | cls.qdb.start() 192 | 193 | load_all_types_table(cls.qdb) 194 | load_trips_table(cls.qdb) 195 | 196 | @classmethod 197 | def tearDownClass(cls): 198 | if cls.qdb: 199 | cls.qdb.stop() 200 | 201 | def _get_endpoint(self): 202 | return Endpoint(self.qdb.host, self.qdb.http_server_port) 203 | 204 | def s_numpy_query(self, query, *, chunks=1): 205 | endpoint = self._get_endpoint() 206 | return qdbq_s.numpy_query(query, endpoint=endpoint, chunks=chunks) 207 | 208 | async def a_numpy_query(self, query, *, chunks=1): 209 | endpoint = self._get_endpoint() 210 | return await qdbq_a.numpy_query(query, endpoint=endpoint, chunks=chunks) 211 | 212 | def s_pandas_query(self, query, *, chunks=1): 213 | endpoint = self._get_endpoint() 214 | return qdbq_s.pandas_query(query, endpoint=endpoint, chunks=chunks) 215 | 216 | async def a_pandas_query(self, query, *, chunks=1): 217 | endpoint = self._get_endpoint() 218 | return await qdbq_a.pandas_query(query, endpoint=endpoint, chunks=chunks) 219 | 220 | def test_count_pandas(self): 221 | act = self.s_pandas_query('SELECT count() FROM trips') 222 | exp = pd.DataFrame({'count': pd.Series([10000], dtype='Int64')}) 223 | assert_frame_equal(act, exp, check_column_type=True) 224 | 225 | def test_count_numpy(self): 226 | act = self.s_numpy_query('SELECT count() FROM trips') 227 | exp = {'count': np.array([10000], dtype='int64')} 228 | self.assertEqual(act, exp) 229 | 230 | def test_head_pandas(self): 231 | act = self.s_pandas_query('SELECT * FROM trips LIMIT 5') 232 | exp = pd.DataFrame({ 233 | 'cab_type': pd.Series([ 234 | 'yellow', 'yellow', 'green', 'yellow', 'yellow'], 235 | dtype='string'), 236 | 'vendor_id': pd.Series([ 237 | 'VTS', 'VTS', 'VTS', 'CMT', 'VTS'], 238 | dtype='string'), 239 | 'pickup_datetime': pd.Series(pd.to_datetime([ 240 | '2016-01-01T00:00:00.000000', 241 | '2016-01-01T00:00:00.000000', 242 | '2016-01-01T00:00:01.000000', 243 | '2016-01-01T00:00:01.000000', 244 | '2016-01-01T00:00:02.000000']), 245 | dtype='datetime64[ns]'), 246 | 'dropoff_datetime': pd.Series(pd.to_datetime([ 247 | '2016-01-01T00:26:45.000000', 248 | '2016-01-01T00:18:30.000000', 249 | '2016-01-01T00:02:10.000000', 250 | '2016-01-01T00:11:55.000000', 251 | '2016-01-01T00:11:08.000000']), 252 | dtype='datetime64[ns]'), 253 | 'rate_code_id': pd.Series([ 254 | 'Standard rate', 255 | 'Standard rate', 256 | 'Standard rate', 257 | 'Standard rate', 258 | 'Standard rate'], 259 | dtype='string'), 260 | 'pickup_latitude': pd.Series([ 261 | -73.9940567, -73.9801178, -73.92303467, -73.97942352, -73.99834442], 262 | dtype='float'), 263 | 'pickup_longitude': pd.Series([ 264 | 40.71998978, 40.74304962, 40.70674515, 40.74461365, 40.72389603], 265 | dtype='float'), 266 | 'dropoff_latitude': pd.Series([ 267 | 40.78987122, 40.76314163, 40.70864487, 40.7539444, 40.68840027], 268 | dtype='float'), 269 | 'dropoff_longitude': pd.Series([ 270 | -73.966362, -73.9134903, -73.92714691, -73.99203491, -73.995849610000], 271 | dtype='float'), 272 | 'passenger_count': pd.Series([ 273 | 2, 2, 1, 1, 1], 274 | dtype='Int32'), 275 | 'trip_distance': pd.Series([ 276 | 7.45, 5.52, 0.34, 1.2, 3.21], 277 | dtype='float'), 278 | 'fare_amount': pd.Series([ 279 | 26.0, 19.0, 3.5, 9.0, 11.5], 280 | 
                dtype='float'),
            'extra': pd.Series([
                0.5, 0.5, 0.5, 0.5, 0.5],
                dtype='float'),
            'mta_tax': pd.Series([
                0.5, 0.5, 0.5, 0.5, 0.5],
                dtype='float'),
            'tip_amount': pd.Series([
                0.0, 0.0, 0.0, 0.0, 0.0],
                dtype='float'),
            'tolls_amount': pd.Series([
                0.0, 0.0, 0.0, 0.0, 0.0],
                dtype='float'),
            'ehail_fee': pd.Series([
                0.0, 0.0, 0.0, 0.0, 0.0],
                dtype='float'),
            'improvement_surcharge': pd.Series([
                0.3, 0.3, 0.3, 0.3, 0.3],
                dtype='float'),
            'congestion_surcharge': pd.Series([
                0.0, 0.0, 0.0, 0.0, 0.0],
                dtype='float'),
            'total_amount': pd.Series([
                27.3, 20.3, 4.8, 10.3, 12.8],
                dtype='float'),
            'payment_type': pd.Series([
                'Cash', 'Cash', 'Cash', 'Cash', 'Cash'],
                dtype='string'),
            'trip_type': pd.Series([
                'na', 'na', 'na', 'na', 'na'],
                dtype='string'),
            'pickup_location_id': pd.Series([
                0, 0, 0, 0, 0],
                dtype='Int32'),
            'dropoff_location_id': pd.Series([
                0, 0, 0, 0, 0],
                dtype='Int32')})
        assert_frame_equal(act, exp, check_column_type=True)

    def test_head_numpy(self):
        act = self.s_numpy_query('SELECT * FROM trips LIMIT 5')
        exp = {
            'cab_type': np.array([
                'yellow', 'yellow', 'green', 'yellow', 'yellow'],
                dtype='object'),
            'vendor_id': np.array([
                'VTS', 'VTS', 'VTS', 'CMT', 'VTS'],
                dtype='object'),
            'pickup_datetime': np.array([
                '2016-01-01T00:00:00.000000',
                '2016-01-01T00:00:00.000000',
                '2016-01-01T00:00:01.000000',
                '2016-01-01T00:00:01.000000',
                '2016-01-01T00:00:02.000000'],
                dtype='datetime64[ns]'),
            'dropoff_datetime': np.array([
                '2016-01-01T00:26:45.000000',
                '2016-01-01T00:18:30.000000',
                '2016-01-01T00:02:10.000000',
                '2016-01-01T00:11:55.000000',
                '2016-01-01T00:11:08.000000'],
                dtype='datetime64[ns]'),
            'rate_code_id': np.array([
                'Standard rate',
                'Standard rate',
                'Standard rate',
                'Standard rate',
                'Standard rate'],
                dtype='object'),
            'pickup_latitude': np.array([
                -73.9940567, -73.9801178, -73.92303467, -73.97942352, -73.99834442],
                dtype='float'),
            'pickup_longitude': np.array([
                40.71998978, 40.74304962, 40.70674515, 40.74461365, 40.72389603],
                dtype='float'),
            'dropoff_latitude': np.array([
                40.78987122, 40.76314163, 40.70864487, 40.7539444, 40.68840027],
                dtype='float'),
            'dropoff_longitude': np.array([
                -73.966362, -73.9134903, -73.92714691, -73.99203491, -73.99584961],
                dtype='float'),
            'passenger_count': np.array([
                2, 2, 1, 1, 1],
                dtype='int32'),
            'trip_distance': np.array([
                7.45, 5.52, 0.34, 1.2, 3.21],
                dtype='float'),
            'fare_amount': np.array([
                26.0, 19.0, 3.5, 9.0, 11.5],
                dtype='float'),
            'extra': np.array([
                0.5, 0.5, 0.5, 0.5, 0.5],
                dtype='float'),
            'mta_tax': np.array([
                0.5, 0.5, 0.5, 0.5, 0.5],
                dtype='float'),
            'tip_amount': np.array([
                0.0, 0.0, 0.0, 0.0, 0.0],
                dtype='float'),
            'tolls_amount': np.array([
                0.0, 0.0, 0.0, 0.0, 0.0],
                dtype='float'),
            'ehail_fee': np.array([
                0.0, 0.0, 0.0, 0.0, 0.0],
                dtype='float'),
            'improvement_surcharge': np.array([
                0.3, 0.3, 0.3, 0.3, 0.3],
                dtype='float'),
            'congestion_surcharge': np.array([
                0.0, 0.0, 0.0, 0.0, 0.0],
                dtype='float'),
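            # Unlike the pandas expectations above, numpy results use plain
            # dtypes: SYMBOL/VARCHAR columns arrive as object arrays of str,
            # and INT columns are non-nullable int32 rather than Int32.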
            'total_amount': np.array([
                27.3, 20.3, 4.8, 10.3, 12.8],
                dtype='float'),
            'payment_type': np.array([
                'Cash', 'Cash', 'Cash', 'Cash', 'Cash'],
                dtype='object'),
            'trip_type': np.array([
                'na', 'na', 'na', 'na', 'na'],
                dtype='object'),
            'pickup_location_id': np.array([
                0, 0, 0, 0, 0],
                dtype='int32'),
            'dropoff_location_id': np.array([
                0, 0, 0, 0, 0],
                dtype='int32')}
        self.assertEqual(act.keys(), exp.keys())
        for k in act:
            np.testing.assert_array_equal(act[k], exp[k])
            self.assertEqual(act[k].dtype, exp[k].dtype)

    def _test_chunked_pandas(self, limit=None):
        qry = 'SELECT * FROM trips'
        if limit is not None:
            qry += f' LIMIT {limit}'
        orig = self.s_pandas_query(qry, chunks=1)
        chunkings = [1, 2, 3, 7, 10, 11, 20, 100, 117]
        others = [self.s_pandas_query(qry, chunks=c) for c in chunkings]
        for other in others:
            assert_frame_equal(orig, other, check_column_type=True)

    def test_chunked_pandas_10(self):
        self._test_chunked_pandas(10)

    def test_chunked_pandas_133(self):
        self._test_chunked_pandas(133)

    def test_chunked_pandas(self):
        self._test_chunked_pandas()

    def test_almost_all_types(self):
        act = self.s_pandas_query('SELECT * FROM almost_all_types')
        schema = {
            name: str(val)
            for name, val
            in act.dtypes.to_dict().items()}
        exp_schema = {
            'id': 'Int32',
            'active': 'bool',
            'ip_address': 'string',
            'age': 'int8',
            'temperature': 'int16',
            'grade': 'string',
            'account_balance': 'float32',
            'currency_symbol': 'string',
            'description': 'string',
            'comment': 'string',
            'record_date': 'datetime64[ns]',
            'event_timestamp': 'datetime64[ns]',
            'revenue': 'float64',
            'user_uuid': 'string',
            'long_number': 'Int64',
            'crypto_hash': 'string',
        }
        self.assertEqual(exp_schema.keys(), schema.keys())
        for key in exp_schema:
            self.assertEqual((key, exp_schema[key]), (key, schema[key]))

        exp_df = pd.DataFrame({
            'id': pd.Series([1, 2, 3], dtype='Int32'),
            # QuestDB booleans are non-nullable: the NULL inserted in row 3
            # reads back as false (the None here likewise coerces to False).
            'active': pd.Series([True, False, None], dtype='bool'),
            'ip_address': pd.Series(['192.168.1.1', None, '10.0.0.1'], dtype='string'),
            'age': pd.Series([25, 30, 35], dtype='int8'),
            'temperature': pd.Series([72, 68, -40], dtype='int16'),
            'grade': pd.Series(['A', 'B', 'C'], dtype='string'),
            'account_balance': pd.Series([1000.5, 1500.25, None], dtype='float32'),
            'currency_symbol': pd.Series(['USD', 'EUR', 'JPY'], dtype='string'),
            'description': pd.Series(['Test record 1', None, 'Test record 3'], dtype='string'),
            'comment': pd.Series(['pink', 'lightgoldenrodyellow', None], dtype='string'),
            'record_date': pd.Series(['2023-01-01T00:00:00.000', None, '2023-01-03T00:00:00.000'], dtype='datetime64[ns]'),
            'event_timestamp': pd.Series(['2023-01-01T00:00:00.000000', '2023-01-02T00:00:00.000000', '2023-01-03T00:00:00.000000'], dtype='datetime64[ns]'),
            'revenue': pd.Series([200.00, 300.00, None], dtype='float64'),
            'user_uuid': pd.Series(['123e4567-e89b-12d3-a456-426614174000', '123e4567-e89b-12d3-a456-426614174001', '123e4567-e89b-12d3-a456-426614174002'], dtype='string'),
            'long_number': pd.Series([123456789012345, 987654321098765, None], dtype='Int64'),
            'crypto_hash': pd.Series(['0x7fffffffffffffffffffffffffffffff', None,
                '0x1fffffffffffffffffffffffffffffff'], dtype='string'),
        })
        assert_frame_equal(act, exp_df, check_column_type=True)

    def test_almost_all_types_0_rows(self):
        act = self.s_pandas_query('SELECT * FROM almost_all_types WHERE id = 0')
        schema = {
            name: str(val)
            for name, val
            in act.dtypes.to_dict().items()}
        exp_schema = {
            'id': 'Int32',
            'active': 'bool',
            'ip_address': 'string',
            'age': 'int8',
            'temperature': 'int16',
            'grade': 'string',
            'account_balance': 'float32',
            'currency_symbol': 'string',
            'description': 'string',
            'comment': 'string',
            'record_date': 'datetime64[ns]',
            'event_timestamp': 'datetime64[ns]',
            'revenue': 'float64',
            'user_uuid': 'string',
            'long_number': 'Int64',
            'crypto_hash': 'string',
        }
        self.assertEqual(exp_schema.keys(), schema.keys())
        for key in exp_schema:
            self.assertEqual((key, exp_schema[key]), (key, schema[key]))

        exp_df = pd.DataFrame({
            'id': pd.Series([], dtype='Int32'),
            'active': pd.Series([], dtype='bool'),
            'ip_address': pd.Series([], dtype='string'),
            'age': pd.Series([], dtype='int8'),
            'temperature': pd.Series([], dtype='int16'),
            'grade': pd.Series([], dtype='string'),
            'account_balance': pd.Series([], dtype='float32'),
            'currency_symbol': pd.Series([], dtype='string'),
            'description': pd.Series([], dtype='string'),
            'comment': pd.Series([], dtype='string'),
            'record_date': pd.Series([], dtype='datetime64[ns]'),
            'event_timestamp': pd.Series([], dtype='datetime64[ns]'),
            'revenue': pd.Series([], dtype='float64'),
            'user_uuid': pd.Series([], dtype='string'),
            'long_number': pd.Series([], dtype='Int64'),
            'crypto_hash': pd.Series([], dtype='string'),
        })
        assert_frame_equal(act, exp_df, check_column_type=True)

    async def test_async_pandas(self):
        act = await self.a_pandas_query('SELECT count() FROM trips')
        exp = pd.DataFrame({'count': pd.Series([10000], dtype='Int64')})
        assert_frame_equal(act, exp, check_column_type=True)

    async def test_async_numpy(self):
        act = await self.a_numpy_query('SELECT count() FROM trips')
        exp = {'count': np.array([10000], dtype='int64')}
        self.assertEqual(act, exp)

    def test_basic_auth_live(self):
        # Exercises basic auth against the live QuestDB fixture; the
        # mock-server test below asserts the exact Authorization header.
        endpoint = Endpoint(self.qdb.host, self.qdb.http_server_port, auth=AUTH)
        act = qdbq_s.pandas_query('SELECT count() FROM trips', endpoint=endpoint)
        exp = pd.DataFrame({'count': pd.Series([10000], dtype='Int64')})
        assert_frame_equal(act, exp, check_column_type=True)

    def _do_auth_test(self, exp_auth_header, username=None, password=None, token=None):
        with HttpServer() as server:
            server.responses.append((
                0,
                200,
                'application/json',
                (
                    b'{"columns": [{"name": "count", "type": "LONG"}], ' +
                    b'"count": 1, "dataset": [[10000]], "query": "SELECT count() ' +
                    b'FROM trips", "timestamp": -1}'
                )))
            server.responses.append((
                0,
                200,
                'text/csv',
                b'"count"\r\n10000\r\n'
            ))

            endpoint = Endpoint(
                'localhost',
                server.port,
                username=username,
                password=password,
                token=token)
            act = qdbq_s.pandas_query('SELECT count() FROM trips', endpoint=endpoint)
            exp = pd.DataFrame({'count': pd.Series([10000], dtype='Int64')})
            assert_frame_equal(act, exp, check_column_type=True)
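            # The query triggers two HTTP requests (one answered with JSON,
            # one with CSV); both should carry the same Authorization header,
            # which the mock server records for inspection.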
            auth0 = server.headers[0]['Authorization']
            auth1 = server.headers[1]['Authorization']
            self.assertEqual(auth0, auth1)
            self.assertEqual(auth0, exp_auth_header)

    def test_basic_auth(self):
        self._do_auth_test(
            'Basic YWRtaW46cXVlc3Q=',
            username='admin',
            password='quest')

    def test_token_auth(self):
        self._do_auth_test(
            'Bearer 1234567890',
            token='1234567890')

    def test_timeout(self):
        import asyncio  # for asyncio.TimeoutError, raised by the sync API on timeout

        with HttpServer() as server:
            server.responses.append((
                2000,  # 2 seconds
                200,
                'application/json',
                (
                    b'{"columns": [{"name": "count", "type": "LONG"}], ' +
                    b'"count": 1, "dataset": [[10000]], "query": "SELECT count() ' +
                    b'FROM trips", "timestamp": -1}'
                )))
            server.responses.append((
                2000,  # 2 seconds
                200,
                'text/csv',
                b'"count"\r\n10000\r\n'
            ))

            endpoint = Endpoint('localhost', server.port)
            with self.assertRaises(asyncio.TimeoutError):
                qdbq_s.pandas_query(
                    'SELECT count() FROM trips',
                    endpoint=endpoint,
                    timeout=1)


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------