├── pydruid ├── __init__.py ├── utils │ ├── __init__.py │ ├── query_utils.py │ ├── aggregators.py │ ├── having.py │ ├── dimensions.py │ ├── postaggregator.py │ └── filters.py ├── db │ ├── exceptions.py │ ├── __init__.py │ ├── sqlalchemy.py │ └── api.py ├── console.py └── async_client.py ├── requirements.in ├── pypi_push.sh ├── docs ├── figures │ ├── twitter_graph.png │ └── avg_tweet_length.png ├── source │ ├── index.rst │ └── conf.py └── Makefile ├── requirements-dev.in ├── MANIFEST.in ├── .flake8 ├── .gitignore ├── gen_changelog.sh ├── .travis.yml ├── setup.cfg ├── tox.ini ├── LICENSE ├── RELEASE.md ├── .pre-commit-config.yaml ├── requirements.txt ├── tests ├── utils │ ├── test_query_utils.py │ ├── test_having.py │ ├── test_aggregators.py │ ├── test_dimensions.py │ └── test_filters.py ├── db │ ├── test_bearer_auth.py │ ├── test_rows_from_chunks.py │ ├── test_connection.py │ ├── test_druid_dialect.py │ └── test_cursor.py ├── test_async_client.py ├── test_client.py └── test_query.py ├── setup.py ├── requirements-dev.txt └── README.md /pydruid/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pydruid/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | -e .[async,cli,pandas,sqlalchemy] 2 | -------------------------------------------------------------------------------- /pypi_push.sh: -------------------------------------------------------------------------------- 1 | python setup.py sdist 2 | echo "RUN: twine upload dist/pydruid-{VERSION}.tar.gz" 3 | -------------------------------------------------------------------------------- /docs/figures/twitter_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/druid-io/pydruid/HEAD/docs/figures/twitter_graph.png -------------------------------------------------------------------------------- /docs/figures/avg_tweet_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/druid-io/pydruid/HEAD/docs/figures/avg_tweet_length.png -------------------------------------------------------------------------------- /requirements-dev.in: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | flake8-mypy 4 | ipdb 5 | isort 6 | pip-tools 7 | pre-commit 8 | pycurl 9 | pytest 10 | tox 11 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | recursive-include docs *.txt 4 | recursive-include pydruid *.py 5 | global-exclude *.py[co] 6 | include LICENSE 7 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | application-import-names = pydruid 3 | exclude = 4 | docs 5 | env 6 | tests 7 | .eggs 8 | build 9 | import-order-style = google 10 | max-line-length = 90 11 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_store 3 | .idea/ 4 | build/ 5 | dist/ 6 | pydruid.egg-info/ 7 | __pycache__ 8 | .pytest_cache/ 9 | .cache 10 | .eggs 11 | *.egg/ 12 | \#*# 13 | .#* 14 | *~ 15 | .tox 16 | env* 17 | venv 18 | Pipfile* 19 | -------------------------------------------------------------------------------- /gen_changelog.sh: -------------------------------------------------------------------------------- 1 | # requires github-changes, run 2 | # `npm install -g github-changes` 3 | # requires $GITHUB_TOKEN to be set 4 | 5 | # usage: ./gen_changelog.sh 0.20.0 0.20.1 6 | # will overwrite the local CHANGELOG.md; merge the result into the existing changelog manually 7 | github-changes -o druid-io -r pydruid --token $GITHUB_TOKEN --between-tags $1...$2 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | matrix: 3 | include: 4 | - env: TOXENV=black 5 | - env: TOXENV=flake8 6 | - env: TOXENV=isort 7 | - env: TOXENV=py36 8 | python: 3.6 9 | - env: TOXENV=py37 10 | python: 3.7 11 | - env: TOXENV=py38 12 | python: 3.8 13 | before_install: 14 | - sudo apt-get update 15 | - sudo apt-get install libgnutls28-dev 16 | install: 17 | - pip install tox 18 | script: 19 | - tox 20 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [build_sphinx] 2 | source-dir = docs/source 3 | build-dir = docs/build 4 | all_files = 1 5 | 6 | [upload_sphinx] 7 | upload-dir = docs/build/html 8 | 9 | [bdist_wheel] 10 | universal=1 11 | 12 | [isort] 13 | combine_as_imports = true 14 | include_trailing_comma = true 15 | line_length = 88 16 | known_first_party = pydruid 17 | known_third_party = pandas,prompt_toolkit,pygments,pytest,requests,setuptools,sqlalchemy,tabulate,tornado 18 | multi_line_output = 3 19 | order_by_type = false 20 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [testenv] 2 | setenv = 3 | PYTHONPATH = {toxinidir} 4 | commands = 5 | pytest {posargs} 6 | deps = 7 | -rrequirements.txt 8 | -rrequirements-dev.txt 9 | parallel_show_output = true 10 | usedevelop = true 11 | 12 | [testenv:black] 13 | commands = 14 | black --check pydruid setup.py tests 15 | 16 | [testenv:flake8] 17 | commands = 18 | flake8 pydruid setup.py 19 | 20 | [testenv:isort] 21 | commands = 22 | isort --check-only --recursive setup.py pydruid tests 23 | 24 | [tox] 25 | envlist = 26 | black 27 | flake8 28 | isort 29 | skipsdist = true 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2013 Metamarkets Group Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # How to craft a PyDruid release and ship to Pypi 2 | 3 | Prep 4 | * `git remote add druid-io git@github.com:druid-io/pydruid.git` 5 | 6 | New minor release: 7 | * branch off of master to minor `git checkout -b 0.X` 8 | * pick cherries if any 9 | 10 | New micro release: 11 | * checkout existing minor release branch `git checkout 0.X` 12 | * pick cherries 13 | 14 | Finally: 15 | * run tests 16 | * update version in `setup.py` to `0.X.N` 17 | * commit with commit message `0.X.N` 18 | * `git tag 0.X.N` 19 | * Push release to repo `git push druid-io 0.X 0.X.N` 20 | * Push to pypi `./pypi_push.sh` 21 | 22 | Post changelog 23 | * `./gen_changelog.sh 0.0.0...0.X.N` 24 | -------------------------------------------------------------------------------- /pydruid/db/exceptions.py: -------------------------------------------------------------------------------- 1 | class Error(Exception): 2 | pass 3 | 4 | 5 | class Warning(Exception): 6 | pass 7 | 8 | 9 | class InterfaceError(Error): 10 | pass 11 | 12 | 13 | class CompileError(Error): 14 | pass 15 | 16 | 17 | class DatabaseError(Error): 18 | pass 19 | 20 | 21 | class InternalError(DatabaseError): 22 | pass 23 | 24 | 25 | class OperationalError(DatabaseError): 26 | pass 27 | 28 | 29 | class ProgrammingError(DatabaseError): 30 | pass 31 | 32 | 33 | class IntegrityError(DatabaseError): 34 | pass 35 | 36 | 37 | class DataError(DatabaseError): 38 | pass 39 | 40 | 41 | class NotSupportedError(CompileError): 42 | pass 43 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 19.10b0 4 | hooks: 5 | - id: black 6 | additional_dependencies: ['click<8.1'] 7 | language_version: python3 8 | 9 | - repo: https://github.com/asottile/seed-isort-config 10 | rev: v2.1.1 11 | hooks: 12 | - id: seed-isort-config 13 | 14 | - repo: https://github.com/pre-commit/pre-commit-hooks 15 | rev: v3.0.1 16 | hooks: 17 | - id: trailing-whitespace 18 | - id: end-of-file-fixer 19 | - id: check-docstring-first 20 | - id: check-json 21 | - id: check-added-large-files 22 | - id: check-yaml 23 | - id: debug-statements 24 | 25 | - repo: https://gitlab.com/pycqa/flake8 26 | rev: 3.8.2 27 | hooks: 28 | - id: flake8 29 | -------------------------------------------------------------------------------- /pydruid/db/__init__.py: -------------------------------------------------------------------------------- 1 | from pydruid.db.api import connect 2 | from pydruid.db.exceptions import ( 3 | DatabaseError, 4 | DataError, 5 | Error, 6 | IntegrityError, 7 | InterfaceError, 8 | InternalError, 9 | NotSupportedError, 10 | OperationalError, 11 | ProgrammingError, 12 | Warning, 13 | ) 14 | 15 | __all__ = [ 16 | "connect", 17 | "apilevel", 18 | "threadsafety", 19 | "paramstyle", 20 | "DataError", 21 | "DatabaseError", 22 | "Error", 23 | "IntegrityError", 24 | "InterfaceError", 25 | "InternalError", 26 | "NotSupportedError", 27 | "OperationalError", 28 | "ProgrammingError", 29 | "Warning", 30 | ] 31 | 32 | 33 | apilevel = "2.0" 34 | # Threads may share the module and connections 35 | threadsafety = 2 36 | paramstyle = "pyformat" 
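A minimal usage sketch for the DB-API module above, assuming a Druid broker is reachable on localhost:8082 (the host, port, path and example SQL are placeholder assumptions, not part of the package):

from pydruid.db import connect

# connect() takes host/port/path/scheme keyword arguments, as seen in pydruid/console.py;
# the values here describe a hypothetical local broker.
conn = connect(host="localhost", port=8082, path="/druid/v2/sql/", scheme="http")
curs = conn.cursor()
curs.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES")  # same query the console issues
for row in curs:  # rows come back as named tuples
    print(row.TABLE_NAME)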
37 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile requirements.in 6 | # 7 | certifi==2020.4.5.1 # via requests 8 | chardet==3.0.4 # via requests 9 | idna==2.9 # via requests 10 | numpy==1.18.5 # via pandas 11 | pandas==1.0.4 # via pydruid 12 | prompt-toolkit==3.0.5 # via pydruid 13 | pygments==2.6.1 # via pydruid 14 | python-dateutil==2.8.1 # via pandas 15 | pytz==2020.1 # via pandas 16 | requests==2.23.0 # via pydruid 17 | six==1.15.0 # via python-dateutil 18 | sqlalchemy==1.3.17 # via pydruid 19 | tabulate==0.8.7 # via pydruid 20 | tornado==6.0.4 # via pydruid 21 | urllib3==1.25.9 # via requests 22 | wcwidth==0.2.3 # via prompt-toolkit 23 | -------------------------------------------------------------------------------- /tests/utils/test_query_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import os 4 | 5 | from pydruid.utils import query_utils 6 | 7 | 8 | def open_file(file_path): 9 | return open(file_path, "w", newline="", encoding="utf-8") 10 | 11 | 12 | def line_ending(): 13 | return os.linesep 14 | 15 | 16 | class TestUnicodeWriter: 17 | def test_writerow(self, tmpdir): 18 | file_path = tmpdir.join("out.tsv") 19 | f = open_file(str(file_path)) 20 | w = query_utils.UnicodeWriter(f) 21 | w.writerow(["value1", "㬓"]) 22 | f.close() 23 | assert file_path.read() == "value1\t㬓" + line_ending() 24 | 25 | def test_writerows(self, tmpdir): 26 | file_path = tmpdir.join("out.tsv") 27 | f = open_file(str(file_path)) 28 | w = query_utils.UnicodeWriter(f) 29 | w.writerows([["header1", "header2"], ["value1", "㬓"]]) 30 | f.close() 31 | assert ( 32 | file_path.read() 33 | == "header1\theader2" + line_ending() + "value1\t㬓" + line_ending() 34 | ) 35 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. PyDruid documentation master file, created by 2 | sphinx-quickstart on Mon Mar 3 16:38:17 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pydruid's documentation 7 | =================================== 8 | 9 | pydruid exposes a simple API to create, execute, and analyze `Druid `_ queries. pydruid can parse query results into `Pandas `_ DataFrame objects for subsequent data analysis, which offers a tight integration between `Druid `_, the `SciPy `_ stack (for scientific computing) and `scikit-learn `_ (for machine learning). Additionally, pydruid can export query results into TSV or JSON for further processing with your favorite tool, e.g., R, Julia, Matlab, or Excel. 10 | 11 | Below is a reference for the PyDruid class, describing the functions to use for querying and exporting, complete with examples. For additional examples, see the `pydruid README `_. 12 | 13 | PyDruid Reference 14 | ================= 15 | 16 | .. toctree:: 17 | :maxdepth: 2 18 | 19 | .. automodule:: pydruid 20 | 21 | .. 
autoclass:: client.PyDruid 22 | :members: 23 | 24 | Indices and tables 25 | ================== 26 | 27 | * :ref:`genindex` 28 | * :ref:`modindex` 29 | * :ref:`search` 30 | 31 | -------------------------------------------------------------------------------- /pydruid/utils/query_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2013 Metamarkets Group Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import codecs 17 | import csv 18 | 19 | # A special CSV writer which will write rows to TSV file "f", which is encoded in utf-8. 20 | # this is necessary because the values in druid are not all ASCII. 21 | 22 | 23 | class UnicodeWriter(object): 24 | 25 | # delimiter="\t" 26 | def __init__(self, f, dialect="excel-tab", encoding="utf-8", **kwds): 27 | self.stream = f 28 | self.writer = csv.writer(self.stream, dialect=dialect, **kwds) 29 | self.encoder = codecs.getincrementalencoder(encoding)() 30 | 31 | @staticmethod 32 | def __encode(data): 33 | return str(data) if isinstance(data, str) else data 34 | 35 | def writerow(self, row): 36 | row = [self.__encode(s) for s in row] 37 | self.writer.writerow(row) 38 | 39 | def writerows(self, rows): 40 | for row in rows: 41 | self.writerow(row) 42 | -------------------------------------------------------------------------------- /tests/db/test_bearer_auth.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from pydruid.db.api import BearerAuth 4 | import pytest 5 | import requests 6 | import unittest 7 | 8 | 9 | class TestBearerAuth: 10 | 11 | # BearerAuth object is initialized with a token 12 | def test_initialized_with_token(self): 13 | token = "my_token" 14 | auth = BearerAuth(token) 15 | assert auth.token == token 16 | 17 | # The token is None 18 | def test_token_is_none(self): 19 | token = None 20 | auth = BearerAuth(token) 21 | assert auth.token is None 22 | 23 | # The __call__ method adds an Authorization header with the token to the request object 24 | def test_adds_authorization_header(self): 25 | token = "my_token" 26 | auth = BearerAuth(token) 27 | request = requests.Request() 28 | modified_request = auth(request) 29 | assert modified_request.headers["Authorization"] == f"Bearer {token}" 30 | 31 | # The __call__ method returns the modified request object 32 | def test_returns_modified_request_object(self): 33 | token = "my_token" 34 | auth = BearerAuth(token) 35 | request = requests.Request() 36 | modified_request = auth(request) 37 | assert modified_request is request 38 | 39 | # The token is an empty string 40 | def test_token_is_empty_string(self): 41 | token = "" 42 | auth = BearerAuth(token) 43 | request = requests.Request() 44 | modified_request = auth(request) 45 | assert modified_request.headers["Authorization"] == "Bearer " 46 | -------------------------------------------------------------------------------- /tests/db/test_rows_from_chunks.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | 5 | from pydruid.db.api import rows_from_chunks 6 | 7 | 8 | class RowsFromChunksTestSuite(unittest.TestCase): 9 | def test_rows_from_chunks_empty(self): 10 | chunks = [] 11 | expected = [] 12 | result = list(rows_from_chunks(chunks)) 13 | self.assertEqual(result, expected) 14 | 15 | def test_rows_from_chunks_single_chunk(self): 16 | chunks = ['[{"name": "alice"}, {"name": "bob"}, {"name": "charlie"}]'] 17 | expected = [{"name": "alice"}, {"name": "bob"}, {"name": "charlie"}] 18 | result = list(rows_from_chunks(chunks)) 19 | self.assertEqual(result, expected) 20 | 21 | def test_rows_from_chunks_multiple_chunks(self): 22 | chunks = ['[{"name": "alice"}, {"name": "b', 'ob"}, {"name": "charlie"}]'] 23 | expected = [{"name": "alice"}, {"name": "bob"}, {"name": "charlie"}] 24 | result = list(rows_from_chunks(chunks)) 25 | self.assertEqual(result, expected) 26 | 27 | def test_rows_from_chunks_bracket_in_string(self): 28 | chunks = ['[{"name": "ali{ce"}, {"name": "bob"}]'] 29 | expected = [{"name": "ali{ce"}, {"name": "bob"}] 30 | result = list(rows_from_chunks(chunks)) 31 | self.assertEqual(result, expected) 32 | 33 | def test_rows_from_chunks_quote_in_string(self): 34 | chunks = [r'[{"name": "ali\"ce"}, {"name": "bob"}]'] 35 | expected = [{"name": 'ali"ce'}, {"name": "bob"}] 36 | result = list(rows_from_chunks(chunks)) 37 | self.assertEqual(result, expected) 38 | 39 | 40 | if __name__ == "__main__": 41 | unittest.main() 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | from setuptools import find_packages, setup 4 | 5 | install_requires = ["requests"] 6 | 7 | extras_require = { 8 | "pandas": ["pandas"], 9 | "async": ["tornado"], 10 | "sqlalchemy": ["sqlalchemy"], 11 | "cli": ["pygments", "prompt_toolkit>=2.0.0", "tabulate"], 12 | } 13 | 14 | with io.open("README.md", encoding="utf-8") as f: 15 | long_description = f.read() 16 | 17 | setup( 18 | name="pydruid", 19 | version="0.6.9", 20 | author="Druid Developers", 21 | author_email="druid-development@googlegroups.com", 22 | packages=find_packages(), 23 | url="https://druid.apache.org", 24 | project_urls={ 25 | "Bug Tracker": "https://github.com/druid-io/pydruid/issues", 26 | "Documentation": "https://pythonhosted.org/pydruid/", 27 | "Source Code": "https://github.com/druid-io/pydruid", 28 | }, 29 | license="Apache License, Version 2.0", 30 | description="A Python connector for Druid.", 31 | long_description=long_description, 32 | long_description_content_type="text/markdown", 33 | install_requires=install_requires, 34 | extras_require=extras_require, 35 | tests_require=["pytest"], 36 | entry_points={ 37 | "console_scripts": ["pydruid = pydruid.console:main"], 38 | "sqlalchemy.dialects": [ 39 | "druid = pydruid.db.sqlalchemy:DruidHTTPDialect", 40 | "druid.http = pydruid.db.sqlalchemy:DruidHTTPDialect", 41 | "druid.https = pydruid.db.sqlalchemy:DruidHTTPSDialect", 42 | ], 43 | }, 44 | include_package_data=True, 45 | classifiers=[ 46 | "License :: OSI Approved :: Apache Software License", 47 | "Programming Language :: Python", 48 | "Programming Language :: Python :: 3", 49 | "Programming Language :: Python :: 3.6", 50 | "Programming Language :: Python :: 3.7", 51 | "Programming Language :: Python :: 3.8", 52 | ], 53 | ) 54 | 
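The `sqlalchemy.dialects` entry points registered above are what make `druid://` URLs resolvable by SQLAlchemy. A rough sketch, assuming pydruid was installed with the `sqlalchemy` extra and a broker runs on localhost:8082 (the URL and query are illustrative assumptions):

from sqlalchemy import create_engine

# "druid" maps to DruidHTTPDialect via the entry_points block above;
# the broker URL is an assumption for a local cluster.
engine = create_engine("druid://localhost:8082/druid/v2/sql/")
with engine.connect() as conn:
    for row in conn.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES"):
        print(row)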
-------------------------------------------------------------------------------- /tests/db/test_connection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pytest 4 | import unittest 5 | 6 | from pydruid.db.api import Connection, Cursor 7 | from pydruid.db.exceptions import Error 8 | 9 | # Generated by CodiumAI 10 | class TestConnection: 11 | 12 | # Returns a new Cursor object using the connection. 13 | 14 | def test_returns_new_cursor_object(self): 15 | conn = Connection( 16 | host="localhost", port=8082, user="admin", password="password" 17 | ) 18 | cursor = conn.cursor() 19 | assert isinstance(cursor, Cursor) 20 | 21 | # Raises an exception if the connection is closed. 22 | def test_raises_exception_if_connection_closed(self): 23 | conn = Connection( 24 | host="localhost", port=8082, user="admin", password="password" 25 | ) 26 | conn.close() 27 | with pytest.raises(Error): 28 | conn.cursor() 29 | 30 | # Appends the new cursor to the list of cursors. 31 | def test_appends_new_cursor(self): 32 | conn = Connection( 33 | host="localhost", port=8082, user="admin", password="password" 34 | ) 35 | cursor = conn.cursor() 36 | assert cursor in conn.cursors 37 | 38 | # Returns the new cursor. 39 | def test_returns_new_cursor(self): 40 | conn = Connection( 41 | host="localhost", port=8082, user="admin", password="password" 42 | ) 43 | cursor = conn.cursor() 44 | assert isinstance(cursor, Cursor) 45 | 46 | # The new cursor has the same url, user, password, context, header, ssl_verify_cert, ssl_client_cert, proxies, and jwt as the connection. 47 | def test_new_cursor_has_same_attributes_as_connection(self): 48 | conn = Connection( 49 | host="localhost", port=8082, user="admin", password="password" 50 | ) 51 | cursor = conn.cursor() 52 | assert cursor.url == conn.url 53 | assert cursor.user == conn.user 54 | assert cursor.password == conn.password 55 | assert cursor.context == conn.context 56 | assert cursor.header == conn.header 57 | assert cursor.ssl_verify_cert == conn.ssl_verify_cert 58 | assert cursor.ssl_client_cert == conn.ssl_client_cert 59 | assert cursor.proxies == conn.proxies 60 | assert cursor.jwt == conn.jwt 61 | 62 | 63 | if __name__ == "__main__": 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile requirements-dev.in 6 | # 7 | appdirs==1.4.4 # via black, virtualenv 8 | appnope==0.1.0 # via ipython 9 | aspy.yaml==1.3.0 # via pre-commit 10 | attrs==19.3.0 # via black, flake8-mypy, pytest 11 | backcall==0.1.0 # via ipython 12 | black==19.10b0 # via -r requirements-dev.in 13 | cfgv==3.1.0 # via pre-commit 14 | click==7.1.2 # via black, pip-tools 15 | decorator==4.4.2 # via ipython, traitlets 16 | distlib==0.3.0 # via virtualenv 17 | filelock==3.0.12 # via tox, virtualenv 18 | flake8-mypy==17.8.0 # via -r requirements-dev.in 19 | flake8==3.8.2 # via -r requirements-dev.in, flake8-mypy 20 | identify==1.4.19 # via pre-commit 21 | importlib-metadata==1.6.1 # via flake8, pluggy, pre-commit, pytest, virtualenv 22 | ipdb==0.12 # via -r requirements-dev.in 23 | ipython-genutils==0.2.0 # via traitlets 24 | ipython==7.15.0 # via ipdb 25 | isort==4.3.21 # via -r requirements-dev.in 26 | jedi==0.17.0 # via ipython 27 | mccabe==0.6.1 # via flake8 28 | 
more-itertools==8.3.0 # via pytest 29 | mypy-extensions==0.4.3 # via mypy 30 | mypy==0.780 # via flake8-mypy 31 | nodeenv==1.4.0 # via pre-commit 32 | packaging==20.4 # via pytest 33 | parso==0.7.0 # via jedi 34 | pathspec==0.8.0 # via black 35 | pexpect==4.8.0 # via ipython 36 | pickleshare==0.7.5 # via ipython 37 | pip-tools==5.2.0 # via -r requirements-dev.in 38 | pluggy==0.13.1 # via pytest, tox 39 | pre-commit==1.17.0 # via -r requirements-dev.in 40 | prompt-toolkit==3.0.5 # via ipython 41 | ptyprocess==0.6.0 # via pexpect 42 | py==1.8.1 # via pytest, tox 43 | pycodestyle==2.6.0 # via flake8 44 | pycurl==7.43.0.5 # via -r requirements-dev.in 45 | pyflakes==2.2.0 # via flake8 46 | pygments==2.6.1 # via ipython 47 | pyparsing==2.4.7 # via packaging 48 | pytest==5.4.3 # via -r requirements-dev.in 49 | pyyaml==5.3.1 # via aspy.yaml, pre-commit 50 | regex==2020.5.14 # via black 51 | six==1.15.0 # via packaging, pip-tools, pre-commit, tox, traitlets, virtualenv 52 | toml==0.10.1 # via black, pre-commit, tox 53 | tox==3.11.1 # via -r requirements-dev.in 54 | traitlets==4.3.3 # via ipython 55 | typed-ast==1.4.1 # via black, mypy 56 | typing-extensions==3.7.4.2 # via mypy 57 | virtualenv==20.0.21 # via pre-commit, tox 58 | wcwidth==0.2.3 # via prompt-toolkit, pytest 59 | zipp==3.1.0 # via importlib-metadata 60 | 61 | # The following packages are considered to be unsafe in a requirements file: 62 | # pip 63 | # setuptools 64 | -------------------------------------------------------------------------------- /pydruid/utils/aggregators.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2013 Metamarkets Group Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from .filters import Filter 17 | 18 | 19 | def thetasketch(raw_column, isinputthetasketch=False, size=16384): 20 | return { 21 | "type": "thetaSketch", 22 | "fieldName": raw_column, 23 | "isInputThetaSketch": isinputthetasketch, 24 | "size": size, 25 | } 26 | 27 | 28 | def quantilesDoublesSketch(raw_column, k=128): 29 | return {"type": "quantilesDoublesSketch", "fieldName": raw_column, "k": k} 30 | 31 | 32 | def min(raw_metric): 33 | """ 34 | .. note:: Deprecated use `longMin`, `doubleMin' instead 35 | """ 36 | return {"type": "min", "fieldName": raw_metric} 37 | 38 | 39 | def max(raw_metric): 40 | """ 41 | .. 
note:: Deprecated use `longMax`, `doubleMax' instead 42 | """ 43 | return {"type": "max", "fieldName": raw_metric} 44 | 45 | 46 | def longsum(raw_metric): 47 | return {"type": "longSum", "fieldName": raw_metric} 48 | 49 | 50 | def longmin(raw_metric): 51 | return {"type": "longMin", "fieldName": raw_metric} 52 | 53 | 54 | def longmax(raw_metric): 55 | return {"type": "longMax", "fieldName": raw_metric} 56 | 57 | 58 | def doublesum(raw_metric): 59 | return {"type": "doubleSum", "fieldName": raw_metric} 60 | 61 | 62 | def doublemin(raw_metric): 63 | return {"type": "doubleMin", "fieldName": raw_metric} 64 | 65 | 66 | def doublemax(raw_metric): 67 | return {"type": "doubleMax", "fieldName": raw_metric} 68 | 69 | 70 | def count(raw_metric): 71 | return {"type": "count", "fieldName": raw_metric} 72 | 73 | 74 | def hyperunique(raw_metric): 75 | return {"type": "hyperUnique", "fieldName": raw_metric} 76 | 77 | 78 | def cardinality(raw_column, by_row=False): 79 | if type(raw_column) is not list: 80 | raw_column = [raw_column] 81 | return {"type": "cardinality", "fieldNames": raw_column, "byRow": by_row} 82 | 83 | 84 | def filtered(filter, agg): 85 | return { 86 | "type": "filtered", 87 | "filter": Filter.build_filter(filter), 88 | "aggregator": agg, 89 | } 90 | 91 | 92 | def javascript(columns_list, fn_aggregate, fn_combine, fn_reset): 93 | return { 94 | "type": "javascript", 95 | "fieldNames": columns_list, 96 | "fnAggregate": fn_aggregate, 97 | "fnCombine": fn_combine, 98 | "fnReset": fn_reset, 99 | } 100 | 101 | 102 | def stringfirst(raw_metric): 103 | return {"type": "stringFirst", "fieldName": raw_metric} 104 | 105 | 106 | def stringlast(raw_metric): 107 | return {"type": "stringLast", "fieldName": raw_metric} 108 | 109 | 110 | def build_aggregators(agg_input): 111 | return [_build_aggregator(name, kwargs) for (name, kwargs) in agg_input.items()] 112 | 113 | 114 | def _build_aggregator(name, kwargs): 115 | if kwargs["type"] == "filtered": 116 | kwargs["aggregator"] = _build_aggregator(name, kwargs["aggregator"]) 117 | else: 118 | kwargs.update({"name": name}) 119 | 120 | return kwargs 121 | -------------------------------------------------------------------------------- /pydruid/utils/having.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2013 Metamarkets Group Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | try: 17 | import simplejson as json 18 | except ImportError: 19 | import json 20 | 21 | 22 | class Having: 23 | def __init__(self, **args): 24 | 25 | if args["type"] in ("equalTo", "lessThan", "greaterThan"): 26 | self.having = { 27 | "having": { 28 | "type": args["type"], 29 | "aggregation": args["aggregation"], 30 | "value": args["value"], 31 | } 32 | } 33 | 34 | elif args["type"] == "and": 35 | self.having = { 36 | "having": {"type": "and", "havingSpecs": args["havingSpecs"]} 37 | } 38 | 39 | elif args["type"] == "or": 40 | self.having = {"having": {"type": "or", "havingSpecs": args["havingSpecs"]}} 41 | 42 | elif args["type"] == "not": 43 | self.having = {"having": {"type": "not", "havingSpec": args["havingSpec"]}} 44 | 45 | elif args["type"] == "filter": 46 | self.having = {"having": {"type": args["type"], "filter": args["filter"]}} 47 | 48 | elif args["type"] == "dimSelector": 49 | self.having = { 50 | "having": { 51 | "type": args["type"], 52 | "dimension": args["dimension"], 53 | "value": args["value"], 54 | } 55 | } 56 | 57 | else: 58 | raise NotImplementedError( 59 | "Having type: {0} does not exist".format(args["type"]) 60 | ) 61 | 62 | def show(self): 63 | print(json.dumps(self.having, indent=4)) 64 | 65 | def _combine(self, typ, x): 66 | # collapse nested and/ors 67 | if self.having["having"]["type"] == typ: 68 | havingSpecs = self.having["having"]["havingSpecs"] + [x.having["having"]] 69 | return Having(type=typ, havingSpecs=havingSpecs) 70 | elif x.having["having"]["type"] == typ: 71 | havingSpecs = [self.having["having"]] + x.having["having"]["havingSpecs"] 72 | return Having(type=typ, havingSpecs=havingSpecs) 73 | else: 74 | return Having( 75 | type=typ, havingSpecs=[self.having["having"], x.having["having"]] 76 | ) 77 | 78 | def __and__(self, x): 79 | return self._combine("and", x) 80 | 81 | def __or__(self, x): 82 | return self._combine("or", x) 83 | 84 | def __invert__(self): 85 | return Having(type="not", havingSpec=self.having["having"]) 86 | 87 | @staticmethod 88 | def build_having(having_obj): 89 | return having_obj.having["having"] 90 | 91 | 92 | class Aggregation: 93 | def __init__(self, agg): 94 | self.aggregation = agg 95 | 96 | def __eq__(self, other): 97 | return Having(type="equalTo", aggregation=self.aggregation, value=other) 98 | 99 | def __lt__(self, other): 100 | return Having(type="lessThan", aggregation=self.aggregation, value=other) 101 | 102 | def __gt__(self, other): 103 | return Having(type="greaterThan", aggregation=self.aggregation, value=other) 104 | 105 | 106 | class Dimension: 107 | def __init__(self, dim): 108 | self.dimension = dim 109 | 110 | def __eq__(self, other): 111 | return Having(type="dimSelector", dimension=self.dimension, value=other) 112 | -------------------------------------------------------------------------------- /tests/utils/test_having.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import pytest 4 | 5 | from pydruid.utils.filters import Filter 6 | from pydruid.utils.having import Aggregation, Dimension, Having 7 | 8 | 9 | class TestHaving: 10 | def test_equalTo_having(self): 11 | h1 = Having(type="equalTo", aggregation="sum", value=1) 12 | actual = Having.build_having(h1) 13 | expected = {"type": "equalTo", "aggregation": "sum", "value": 1} 14 | assert actual == expected 15 | 16 | def test_equalTo_having_Aggregation(self): 17 | h1 = Aggregation("sum") == 1 18 | actual = Having.build_having(h1) 19 | expected = {"type": "equalTo", 
"aggregation": "sum", "value": 1} 20 | assert actual == expected 21 | 22 | def test_lessThan_having(self): 23 | h1 = Having(type="lessThan", aggregation="sum", value=1) 24 | actual = Having.build_having(h1) 25 | expected = {"type": "lessThan", "aggregation": "sum", "value": 1} 26 | assert actual == expected 27 | 28 | def test_lessThan_having_Aggregation(self): 29 | h1 = Aggregation("sum") < 1 30 | actual = Having.build_having(h1) 31 | expected = {"type": "lessThan", "aggregation": "sum", "value": 1} 32 | assert actual == expected 33 | 34 | def test_greaterThan_having(self): 35 | h1 = Having(type="greaterThan", aggregation="sum", value=1) 36 | actual = Having.build_having(h1) 37 | expected = {"type": "greaterThan", "aggregation": "sum", "value": 1} 38 | assert actual == expected 39 | 40 | def test_greaterThan_having_Aggregation(self): 41 | h1 = Aggregation("sum") > 1 42 | actual = Having.build_having(h1) 43 | expected = {"type": "greaterThan", "aggregation": "sum", "value": 1} 44 | assert actual == expected 45 | 46 | def test_and_having(self): 47 | h1 = Aggregation("sum1") > 1 48 | h2 = Aggregation("sum2") > 2 49 | actual = Having.build_having(h1 & h2) 50 | expected = { 51 | "type": "and", 52 | "havingSpecs": [ 53 | {"type": "greaterThan", "aggregation": "sum1", "value": 1}, 54 | {"type": "greaterThan", "aggregation": "sum2", "value": 2}, 55 | ], 56 | } 57 | assert actual == expected 58 | 59 | def test_or_having(self): 60 | h1 = Aggregation("sum1") > 1 61 | h2 = Aggregation("sum2") > 2 62 | actual = Having.build_having(h1 | h2) 63 | expected = { 64 | "type": "or", 65 | "havingSpecs": [ 66 | {"type": "greaterThan", "aggregation": "sum1", "value": 1}, 67 | {"type": "greaterThan", "aggregation": "sum2", "value": 2}, 68 | ], 69 | } 70 | assert actual == expected 71 | 72 | def test_not_having(self): 73 | h1 = Aggregation("sum") > 1 74 | actual = Having.build_having(~h1) 75 | expected = { 76 | "type": "not", 77 | "havingSpec": {"type": "greaterThan", "aggregation": "sum", "value": 1}, 78 | } 79 | assert actual == expected 80 | 81 | def test_dimSelector_having(self): 82 | h1 = Having(type="dimSelector", dimension="foo", value="bar") 83 | actual = Having.build_having(h1) 84 | expected = {"type": "dimSelector", "dimension": "foo", "value": "bar"} 85 | assert actual == expected 86 | 87 | def test_dimension(self): 88 | h1 = Dimension("foo") == "bar" 89 | actual = Having.build_having(h1) 90 | expected = {"type": "dimSelector", "dimension": "foo", "value": "bar"} 91 | assert actual == expected 92 | 93 | def test_query_filter_having(self): 94 | f1 = Filter(type="selector", dimension="foo", value="bar") 95 | query_filter = Filter.build_filter(f1) 96 | h1 = Having(type="filter", filter=query_filter) 97 | actual = Having.build_having(h1) 98 | expected = { 99 | "type": "filter", 100 | "filter": {"type": "selector", "dimension": "foo", "value": "bar"}, 101 | } 102 | assert actual == expected 103 | 104 | def test_not_exists_having_type(self): 105 | with pytest.raises(NotImplementedError): 106 | Having(type="notExists") 107 | -------------------------------------------------------------------------------- /pydruid/console.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | from urllib import parse 5 | 6 | from prompt_toolkit import prompt 7 | from prompt_toolkit.completion.word_completer import WordCompleter 8 | from prompt_toolkit.history import FileHistory 9 | from pygments.lexers import SqlLexer 10 | from pygments.style import 
Style 11 | from pygments.styles.default import DefaultStyle 12 | from pygments.token import Token 13 | from tabulate import tabulate 14 | 15 | from pydruid.db.api import connect 16 | 17 | keywords = [ 18 | "EXPLAIN PLAN FOR", 19 | "WITH", 20 | "SELECT", 21 | "ALL", 22 | "DISTINCT", 23 | "FROM", 24 | "WHERE", 25 | "GROUP BY", 26 | "HAVING", 27 | "ORDER BY", 28 | "ASC", 29 | "DESC", 30 | "LIMIT", 31 | "FILTER", 32 | "UNION ALL", 33 | ] 34 | 35 | aggregate_functions = [ 36 | "COUNT", 37 | "SUM", 38 | "MIN", 39 | "MAX", 40 | "AVG", 41 | "APPROX_COUNT_DISTINCT", 42 | "APPROX_QUANTILE", 43 | ] 44 | 45 | numeric_functions = [ 46 | "ABS", 47 | "CEIL", 48 | "EXP", 49 | "FLOOR", 50 | "LN", 51 | "LOG10", 52 | "POWER", 53 | "SQRT", 54 | "TRUNCATE", 55 | "TRUNC", 56 | "MOD", 57 | ] 58 | 59 | string_functions = [ 60 | "LENGTH", 61 | "CHAR_LENGTH", 62 | "CHARACTER_LENGTH", 63 | "STRLEN", 64 | "LOOKUP", 65 | "LOWER", 66 | "REGEXP_EXTRACT", 67 | "REPLACE", 68 | "STRPOS", 69 | "SUBSTRING", 70 | "SUBSTR", 71 | "TRIM", 72 | "BTRIM", 73 | "RTRIM", 74 | "LTRIM", 75 | "UPPER", 76 | ] 77 | 78 | time_functions = [ 79 | "CURRENT_TIMESTAMP", 80 | "CURRENT_DATE", 81 | "DATE_TRUNC", 82 | "TIME_FLOOR", 83 | "TIME_SHIFT", 84 | "TIME_EXTRACT", 85 | "TIME_PARSE", 86 | "TIME_FORMAT", 87 | "MILLIS_TO_TIMESTAMP", 88 | "TIMESTAMP_TO_MILLIS", 89 | "EXTRACT", 90 | "FLOOR", 91 | "CEIL", 92 | "TIMESTAMPADD", 93 | ] 94 | 95 | other_functions = ["CAST", "CASE", "WHEN", "THEN", "END", "NULLIF", "COALESCE"] 96 | 97 | 98 | replacements = { 99 | "^SHOW SCHEMAS": "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA", 100 | "^SHOW TABLES": "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES", 101 | "^DESC (?P[^;\\s]*)": r""" 102 | SELECT COLUMN_NAME, 103 | ORDINAL_POSITION, 104 | COLUMN_DEFAULT, 105 | IS_NULLABLE, 106 | DATA_TYPE 107 | FROM INFORMATION_SCHEMA.COLUMNS 108 | WHERE TABLE_NAME='\1' 109 | """.strip(), 110 | } 111 | 112 | 113 | class DocumentStyle(Style): 114 | styles = { 115 | Token.Menu.Completions.Completion.Current: "bg:#00aaaa #000000", 116 | Token.Menu.Completions.Completion: "bg:#008888 #ffffff", 117 | Token.Menu.Completions.ProgressButton: "bg:#003333", 118 | Token.Menu.Completions.ProgressBar: "bg:#00aaaa", 119 | } 120 | styles.update(DefaultStyle.styles) 121 | 122 | 123 | def get_connection_kwargs(url): 124 | parts = parse.urlparse(url) 125 | if ":" in parts.netloc: 126 | host, port = parts.netloc.split(":", 1) 127 | port = int(port) 128 | else: 129 | host = parts.netloc 130 | port = 8082 131 | 132 | return {"host": host, "port": port, "path": parts.path, "scheme": parts.scheme} 133 | 134 | 135 | def get_tables(connection): 136 | cursor = connection.cursor() 137 | return [ 138 | row.TABLE_NAME 139 | for row in cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES") 140 | ] 141 | 142 | 143 | def get_autocomplete(connection): 144 | return ( 145 | keywords 146 | + aggregate_functions 147 | + numeric_functions 148 | + string_functions 149 | + time_functions 150 | + other_functions 151 | + get_tables(connection) 152 | ) 153 | 154 | 155 | def main(): 156 | history = FileHistory(os.path.expanduser("~/.pydruid_history")) 157 | 158 | try: 159 | url = sys.argv[1] 160 | except IndexError: 161 | url = "http://localhost:8082/druid/v2/sql/" 162 | kwargs = get_connection_kwargs(url) 163 | connection = connect(**kwargs) 164 | cursor = connection.cursor() 165 | 166 | words = get_autocomplete(connection) 167 | sql_completer = WordCompleter(words, ignore_case=True) 168 | 169 | while True: 170 | try: 171 | query = prompt( 172 | 
"> ", 173 | lexer=SqlLexer, 174 | completer=sql_completer, 175 | style=DocumentStyle, 176 | history=history, 177 | ) 178 | except (EOFError, KeyboardInterrupt): 179 | break # Control-D pressed. 180 | 181 | # run query 182 | query = query.strip("; ") 183 | if query: 184 | # shortcuts 185 | if query.lower() in ("bye", "exit", "quit"): 186 | break 187 | for pattern, repl in replacements.items(): 188 | query = re.sub(pattern, repl, query) 189 | 190 | try: 191 | result = cursor.execute(query) 192 | except Exception as e: 193 | print(e) 194 | continue 195 | 196 | headers = [t[0] for t in cursor.description or []] 197 | print(tabulate(result, headers=headers)) 198 | 199 | print("GoodBye!") 200 | 201 | 202 | if __name__ == "__main__": 203 | main() 204 | -------------------------------------------------------------------------------- /tests/db/test_druid_dialect.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | import warnings 5 | from unittest.mock import patch 6 | 7 | from sqlalchemy import exc, types 8 | 9 | from pydruid.db.sqlalchemy import DruidDialect 10 | 11 | 12 | def anonymous_object(**kwargs): 13 | return type("Object", (), kwargs) 14 | 15 | 16 | class DruidDialectTestSuite(unittest.TestCase): 17 | dialect = DruidDialect() 18 | 19 | @patch("pydruid.db.api.Connection") 20 | def test_get_columns_type_mappings(self, connection_mock): 21 | # fmt: off 22 | connection_mock.execute.return_value = [ 23 | anonymous_object(COLUMN_NAME="BigInteger1", JDBC_TYPE=-6, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 24 | anonymous_object(COLUMN_NAME="BigInteger2", JDBC_TYPE=-5, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 25 | anonymous_object(COLUMN_NAME="BigInteger3", JDBC_TYPE=4, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 26 | anonymous_object(COLUMN_NAME="BigInteger4", JDBC_TYPE=5, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 27 | anonymous_object(COLUMN_NAME="String1", JDBC_TYPE=1, IS_NULLABLE="NO", COLUMN_DEFAULT="default_string"), 28 | anonymous_object(COLUMN_NAME="String2", JDBC_TYPE=12, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 29 | anonymous_object(COLUMN_NAME="Float1", JDBC_TYPE=3, IS_NULLABLE="NO", COLUMN_DEFAULT=1.23), 30 | anonymous_object(COLUMN_NAME="Float2", JDBC_TYPE=6, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 31 | anonymous_object(COLUMN_NAME="Float3", JDBC_TYPE=7, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 32 | anonymous_object(COLUMN_NAME="Float4", JDBC_TYPE=8, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 33 | anonymous_object(COLUMN_NAME="Boolean", JDBC_TYPE=16, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 34 | anonymous_object(COLUMN_NAME="DATE", JDBC_TYPE=91, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 35 | anonymous_object(COLUMN_NAME="TIMESTAMP", JDBC_TYPE=93, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 36 | anonymous_object(COLUMN_NAME="BLOB", JDBC_TYPE=1111, IS_NULLABLE="YES", COLUMN_DEFAULT=""), 37 | ] 38 | # fmt: on 39 | result = self.dialect.get_columns(connection_mock, "table_name") 40 | 41 | # fmt: off 42 | expected = [ 43 | {"name": "BigInteger1", "type": types.BigInteger, "nullable": True, "default": None}, 44 | {"name": "BigInteger2", "type": types.BigInteger, "nullable": True, "default": None}, 45 | {"name": "BigInteger3", "type": types.BigInteger, "nullable": True, "default": None}, 46 | {"name": "BigInteger4", "type": types.BigInteger, "nullable": True, "default": None}, 47 | {"name": "String1", "type": types.String, "nullable": False, "default": "default_string"}, 48 | {"name": "String2", "type": types.String, "nullable": True, 
"default": None}, 49 | {"name": "Float1", "type": types.Float, "nullable": False, "default": "1.23"}, 50 | {"name": "Float2", "type": types.Float, "nullable": True, "default": None}, 51 | {"name": "Float3", "type": types.Float, "nullable": True, "default": None}, 52 | {"name": "Float4", "type": types.Float, "nullable": True, "default": None}, 53 | {"name": "Boolean", "type": types.Boolean, "nullable": True, "default": None}, 54 | {"name": "DATE", "type": types.DATE, "nullable": True, "default": None}, 55 | {"name": "TIMESTAMP", "type": types.TIMESTAMP, "nullable": True, "default": None}, 56 | {"name": "BLOB", "type": types.BLOB, "nullable": True, "default": None}, 57 | ] 58 | # fmt: on 59 | 60 | self.assertListEqual(expected, result) 61 | 62 | @patch("pydruid.db.api.Connection") 63 | def test_get_columns_type_mappings_with_unknown_type(self, connection_mock): 64 | connection_mock.execute.return_value = [ 65 | anonymous_object( 66 | COLUMN_NAME="UnknownType", 67 | JDBC_TYPE=-42, 68 | IS_NULLABLE="YES", 69 | COLUMN_DEFAULT="", 70 | ), 71 | ] 72 | 73 | with warnings.catch_warnings(): 74 | # avoid any noise due to our expected warn logs 75 | warnings.simplefilter("ignore", category=exc.SAWarning) 76 | result = self.dialect.get_columns(connection_mock, "table_name") 77 | 78 | expected = { 79 | "name": "UnknownType", 80 | "type": types.NullType, 81 | "nullable": True, 82 | "default": None, 83 | } 84 | self.assertEqual(1, len(result)) 85 | self.assertEqual(expected, result[0]) 86 | 87 | @patch("pydruid.db.api.Connection") 88 | def test_do_ping_success(self, connection_mock): 89 | connection_mock.execute.return_value = [1] 90 | 91 | result = self.dialect.do_ping(connection_mock) 92 | 93 | # asserts the ping executes with a raw string 94 | connection_mock.execute.assert_called_once_with("SELECT 1") 95 | self.assertTrue(result) 96 | 97 | @patch("pydruid.db.api.Connection") 98 | def test_do_ping_with_exception(self, connection_mock): 99 | connection_mock.execute.side_effect = Exception("Something's wrong :(") 100 | 101 | result = self.dialect.do_ping(connection_mock) 102 | 103 | self.assertFalse(result) 104 | 105 | 106 | if __name__ == "__main__": 107 | unittest.main() 108 | -------------------------------------------------------------------------------- /tests/test_async_client.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # 3 | # Copyright 2016 Metamarkets Group Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | from unittest.mock import Mock 19 | 20 | import pytest 21 | import tornado 22 | import tornado.ioloop 23 | import tornado.web 24 | from tornado.httpclient import HTTPError 25 | from tornado.testing import AsyncHTTPTestCase 26 | 27 | from pydruid.async_client import AsyncPyDruid 28 | from pydruid.utils.aggregators import doublesum 29 | from pydruid.utils.filters import Dimension 30 | 31 | 32 | class FailureHandler(tornado.web.RequestHandler): 33 | def post(self): 34 | raise HTTPError(500, "Druid error", response="Druid error") 35 | 36 | 37 | class SuccessHandler(tornado.web.RequestHandler): 38 | def post(self): 39 | self.write( 40 | """ 41 | [ { 42 | "timestamp" : "2015-12-30T14:14:49.000Z", 43 | "result" : [ { 44 | "dimension" : "aaaa", 45 | "metric" : 100 46 | } ] 47 | } ] 48 | """ 49 | ) 50 | 51 | 52 | class TestAsyncPyDruid(AsyncHTTPTestCase): 53 | def get_app(self): 54 | return tornado.web.Application( 55 | [ 56 | (r"/druid/v2/fail_request", FailureHandler), 57 | (r"/druid/v2/return_results", SuccessHandler), 58 | ] 59 | ) 60 | 61 | @tornado.testing.gen_test 62 | def test_druid_returns_error(self): 63 | # given 64 | client = AsyncPyDruid( 65 | "http://localhost:%s" % (self.get_http_port(),), "druid/v2/fail_request" 66 | ) 67 | 68 | # when / then 69 | with pytest.raises(IOError): 70 | yield client.topn( 71 | datasource="testdatasource", 72 | granularity="all", 73 | intervals="2015-12-29/pt1h", 74 | aggregations={"count": doublesum("count")}, 75 | dimension="user_name", 76 | metric="count", 77 | filter=Dimension("user_lang") == "en", 78 | threshold=1, 79 | context={"timeout": 1000}, 80 | ) 81 | 82 | @tornado.testing.gen_test 83 | def test_druid_returns_results(self): 84 | # given 85 | client = AsyncPyDruid( 86 | "http://localhost:%s" % (self.get_http_port(),), "druid/v2/return_results" 87 | ) 88 | 89 | # when 90 | top = yield client.topn( 91 | datasource="testdatasource", 92 | granularity="all", 93 | intervals="2015-12-29/pt1h", 94 | aggregations={"count": doublesum("count")}, 95 | dimension="user_name", 96 | metric="count", 97 | filter=Dimension("user_lang") == "en", 98 | threshold=1, 99 | context={"timeout": 1000}, 100 | ) 101 | 102 | # then 103 | self.assertIsNotNone(top) 104 | self.assertEqual(len(top.result), 1) 105 | self.assertEqual(len(top.result[0]["result"]), 1) 106 | 107 | @tornado.testing.gen_test 108 | def test_client_allows_to_export_last_query(self): 109 | # given 110 | client = AsyncPyDruid( 111 | "http://localhost:%s" % (self.get_http_port(),), "druid/v2/return_results" 112 | ) 113 | yield client.topn( 114 | datasource="testdatasource", 115 | granularity="all", 116 | intervals="2015-12-29/pt1h", 117 | aggregations={"count": doublesum("count")}, 118 | dimension="user_name", 119 | metric="count", 120 | filter=Dimension("user_lang") == "en", 121 | threshold=1, 122 | context={"timeout": 1000}, 123 | ) 124 | 125 | # when / then 126 | # assert that last_query.export_tsv method was called (it should throw an exception, given empty path) 127 | with pytest.raises(TypeError): 128 | client.export_tsv(None) 129 | 130 | @tornado.testing.gen_test 131 | def test_client_allows_passing_default_parameters(self): 132 | # given 133 | client = AsyncPyDruid( 134 | "http://localhost:%s" % (self.get_http_port(),), 135 | "druid/v2/return_results", 136 | defaults=dict(request_timeout=120), 137 | ) 138 | top = yield client.topn( 139 | datasource="testdatasource", 140 | granularity="all", 141 | intervals="2015-12-29/pt1h", 142 | aggregations={"count": doublesum("count")}, 143 | 
dimension="user_name", 144 | metric="count", 145 | filter=Dimension("user_lang") == "en", 146 | threshold=1, 147 | context={"timeout": 1000}, 148 | ) 149 | 150 | # then 151 | self.assertIsNotNone(top) 152 | self.assertEqual(len(top.result), 1) 153 | self.assertEqual(len(top.result[0]["result"]), 1) 154 | 155 | @tornado.testing.gen_test 156 | def test_client_allows_passing_http_client(self): 157 | # given 158 | client = AsyncPyDruid( 159 | "http://localhost:%s" % (self.get_http_port(),), 160 | "druid/v2/return_results", 161 | http_client="tornado.curl_httpclient.CurlAsyncHTTPClient", 162 | ) 163 | top = yield client.topn( 164 | datasource="testdatasource", 165 | granularity="all", 166 | intervals="2015-12-29/pt1h", 167 | aggregations={"count": doublesum("count")}, 168 | dimension="user_name", 169 | metric="count", 170 | filter=Dimension("user_lang") == "en", 171 | threshold=1, 172 | context={"timeout": 1000}, 173 | ) 174 | 175 | # then 176 | self.assertIsNotNone(top) 177 | self.assertEqual(len(top.result), 1) 178 | self.assertEqual(len(top.result[0]["result"]), 1) 179 | -------------------------------------------------------------------------------- /pydruid/utils/dimensions.py: -------------------------------------------------------------------------------- 1 | def build_dimension(dim): 2 | if isinstance(dim, DimensionSpec): 3 | dim = dim.build() 4 | 5 | return dim 6 | 7 | 8 | class DimensionSpec(object): 9 | def __init__( 10 | self, dimension, output_name, extraction_function=None, filter_spec=None 11 | ): 12 | self._dimension = dimension 13 | self._output_name = output_name 14 | self._extraction_function = extraction_function 15 | self._filter_spec = filter_spec 16 | 17 | def build(self): 18 | dimension_spec = { 19 | "type": "default", 20 | "dimension": self._dimension, 21 | "outputName": self._output_name, 22 | } 23 | 24 | if self._extraction_function is not None: 25 | dimension_spec["type"] = "extraction" 26 | dimension_spec["extractionFn"] = self._extraction_function.build() 27 | 28 | if self._filter_spec is not None: 29 | dimension_spec = self._filter_spec.build(dimension_spec) 30 | 31 | return dimension_spec 32 | 33 | 34 | class FilteredSpec(object): 35 | 36 | filter_type = None 37 | 38 | def build(self, delegate): 39 | dimension_spec = {"type": self.filter_type, "delegate": delegate} 40 | return dimension_spec 41 | 42 | 43 | class ListFilteredSpec(FilteredSpec): 44 | 45 | filter_type = "listFiltered" 46 | 47 | def __init__(self, values, is_whitelist=True): 48 | self._values = values 49 | self._is_whitelist = is_whitelist 50 | 51 | def build(self, dimension_spec): 52 | filtered_dimension_spec = super(ListFilteredSpec, self).build(dimension_spec) 53 | filtered_dimension_spec["values"] = self._values 54 | 55 | if not self._is_whitelist: 56 | filtered_dimension_spec["isWhitelist"] = False 57 | 58 | return filtered_dimension_spec 59 | 60 | 61 | class RegexFilteredSpec(FilteredSpec): 62 | 63 | filter_type = "regexFiltered" 64 | 65 | def __init__(self, pattern): 66 | self._pattern = pattern 67 | 68 | def build(self, dimension_spec): 69 | filtered_dimension_spec = super(RegexFilteredSpec, self).build(dimension_spec) 70 | filtered_dimension_spec["pattern"] = self._pattern 71 | 72 | return filtered_dimension_spec 73 | 74 | 75 | class ExtractionFunction(object): 76 | 77 | extraction_type = None 78 | 79 | def build(self): 80 | return {"type": self.extraction_type} 81 | 82 | 83 | class BaseRegexExtraction(ExtractionFunction): 84 | def __init__(self, expr): 85 | super(BaseRegexExtraction, 
self).__init__() 86 | self._expr = expr 87 | 88 | def build(self): 89 | extractor = super(BaseRegexExtraction, self).build() 90 | extractor["expr"] = self._expr 91 | 92 | return extractor 93 | 94 | 95 | class RegexExtraction(BaseRegexExtraction): 96 | 97 | extraction_type = "regex" 98 | 99 | 100 | class PartialExtraction(BaseRegexExtraction): 101 | 102 | extraction_type = "partial" 103 | 104 | 105 | class JavascriptExtraction(ExtractionFunction): 106 | 107 | extraction_type = "javascript" 108 | 109 | def __init__(self, func, injective=False): 110 | super(JavascriptExtraction, self).__init__() 111 | self._func = func 112 | self._injective = injective 113 | 114 | def build(self): 115 | extractor = super(JavascriptExtraction, self).build() 116 | extractor["function"] = self._func 117 | extractor["injective"] = self._injective 118 | 119 | return extractor 120 | 121 | 122 | class TimeFormatExtraction(ExtractionFunction): 123 | 124 | extraction_type = "timeFormat" 125 | 126 | def __init__(self, format, locale=None, time_zone=None): 127 | super(TimeFormatExtraction, self).__init__() 128 | self._format = format 129 | self._locale = locale 130 | self._time_zone = time_zone 131 | 132 | def build(self): 133 | extractor = super(TimeFormatExtraction, self).build() 134 | extractor["format"] = self._format 135 | if self._locale: 136 | extractor["locale"] = self._locale 137 | if self._time_zone: 138 | extractor["timeZone"] = self._time_zone 139 | 140 | return extractor 141 | 142 | 143 | class LookupExtraction(ExtractionFunction): 144 | 145 | extraction_type = "lookup" 146 | lookup_type = None 147 | 148 | def __init__( 149 | self, retain_missing_values=False, replace_missing_values=None, injective=False 150 | ): 151 | super(LookupExtraction, self).__init__() 152 | self._retain_missing_values = retain_missing_values 153 | self._replace_missing_values = replace_missing_values 154 | self._injective = injective 155 | 156 | def build(self): 157 | extractor = super(LookupExtraction, self).build() 158 | extractor["lookup"] = self.build_lookup() 159 | extractor["retainMissingValue"] = self._retain_missing_values 160 | extractor["replaceMissingValueWith"] = self._replace_missing_values 161 | extractor["injective"] = self._injective 162 | 163 | return extractor 164 | 165 | def build_lookup(self): 166 | return {"type": self.lookup_type} 167 | 168 | 169 | class MapLookupExtraction(LookupExtraction): 170 | 171 | lookup_type = "map" 172 | 173 | def __init__(self, mapping, **kwargs): 174 | super(MapLookupExtraction, self).__init__(**kwargs) 175 | self._mapping = mapping 176 | 177 | def build_lookup(self): 178 | lookup = super(MapLookupExtraction, self).build_lookup() 179 | lookup["map"] = self._mapping 180 | 181 | return lookup 182 | 183 | 184 | class NamespaceLookupExtraction(LookupExtraction): 185 | 186 | lookup_type = "namespace" 187 | 188 | def __init__(self, namespace, **kwargs): 189 | super(NamespaceLookupExtraction, self).__init__(**kwargs) 190 | self._namespace = namespace 191 | 192 | def build_lookup(self): 193 | lookup = super(NamespaceLookupExtraction, self).build_lookup() 194 | lookup["namespace"] = self._namespace 195 | 196 | return lookup 197 | 198 | 199 | class RegisteredLookupExtraction(LookupExtraction): 200 | 201 | extraction_type = "registeredLookup" 202 | 203 | def __init__(self, reglookup, **kwargs): 204 | super(RegisteredLookupExtraction, self).__init__(**kwargs) 205 | self._lookup = reglookup 206 | 207 | def build_lookup(self): 208 | return self._lookup 209 | 
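A short sketch of how the dimension-spec and extraction helpers above compose; the dimension and output names are illustrative only:

from pydruid.utils.dimensions import DimensionSpec, RegexExtraction

# Expose the first path segment of a hypothetical "page" dimension under a new name;
# build() returns the JSON-ready dict that pydruid embeds in a query.
spec = DimensionSpec(
    "page", "page_root", extraction_function=RegexExtraction(r"^/([^/]*)")
)
print(spec.build())
# {'type': 'extraction', 'dimension': 'page', 'outputName': 'page_root',
#  'extractionFn': {'type': 'regex', 'expr': '^/([^/]*)'}}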
-------------------------------------------------------------------------------- /pydruid/async_client.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016 Metamarkets Group Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import json 17 | 18 | from pydruid.client import BaseDruidClient 19 | 20 | try: 21 | from tornado import gen 22 | from tornado.httpclient import AsyncHTTPClient, HTTPError 23 | except ImportError: 24 | print("Warning: unable to import Tornado. The asynchronous client will not work.") 25 | 26 | 27 | class AsyncPyDruid(BaseDruidClient): 28 | """ 29 | Asynchronous PyDruid client which mirrors functionality of the synchronous 30 | PyDruid, but it executes queries 31 | asynchronously (using an asynchronous http client from Tornado framework). 32 | 33 | Returns Query objects that can be used for exporting query results into 34 | TSV files or pandas.DataFrame objects 35 | for subsequent analysis. 36 | 37 | :param str url: URL of Broker node in the Druid cluster 38 | :param str endpoint: Endpoint that Broker listens for queries on 39 | :param dict defaults: (optional) Dict of parameters for the Async HTTP Client subclass 40 | :param str http_client: Tornado HTTP client implementation to use. 41 | Default: None (use simple_httpclient) 42 | 43 | Example 44 | 45 | .. 
code-block:: python 46 | :linenos: 47 | 48 | >>> from pydruid.async_client import * 49 | 50 | >>> query = AsyncPyDruid('http://localhost:8083', 'druid/v2/') 51 | 52 | >>> top = yield query.topn( 53 | datasource='twitterstream', 54 | granularity='all', 55 | intervals='2013-10-04/pt1h', 56 | aggregations={"count": doublesum("count")}, 57 | dimension='user_name', 58 | filter = Dimension('user_lang') == 'en', 59 | metric='count', 60 | threshold=2 61 | ) 62 | 63 | >>> print json.dumps(top.query_dict, indent=2) 64 | >>> { 65 | "metric": "count", 66 | "aggregations": [ 67 | { 68 | "type": "doubleSum", 69 | "fieldName": "count", 70 | "name": "count" 71 | } 72 | ], 73 | "dimension": "user_name", 74 | "filter": { 75 | "type": "selector", 76 | "dimension": "user_lang", 77 | "value": "en" 78 | }, 79 | "intervals": "2013-10-04/pt1h", 80 | "dataSource": "twitterstream", 81 | "granularity": "all", 82 | "threshold": 2, 83 | "queryType": "topN" 84 | } 85 | 86 | >>> print top.result 87 | >>> [{'timestamp': '2013-10-04T00:00:00.000Z', 88 | 'result': [{'count': 7.0, 'user_name': 'user_1'}, 89 | {'count': 6.0, 'user_name': 'user_2'}]}] 90 | 91 | >>> df = top.export_pandas() 92 | >>> print df 93 | >>> count timestamp user_name 94 | 0 7 2013-10-04T00:00:00.000Z user_1 95 | 1 6 2013-10-04T00:00:00.000Z user_2 96 | """ 97 | 98 | def __init__(self, url, endpoint, defaults=None, http_client=None): 99 | super(AsyncPyDruid, self).__init__(url, endpoint) 100 | self.async_http_defaults = defaults 101 | self.http_client = http_client 102 | 103 | @gen.coroutine 104 | def _post(self, query): 105 | AsyncHTTPClient.configure(self.http_client, defaults=self.async_http_defaults) 106 | http_client = AsyncHTTPClient() 107 | try: 108 | headers, querystr, url = self._prepare_url_headers_and_body(query) 109 | response = yield http_client.fetch( 110 | url, method="POST", headers=headers, body=querystr 111 | ) 112 | except HTTPError as e: 113 | self.__handle_http_error(e, query) 114 | else: 115 | query.parse(response.body.decode("utf-8")) 116 | raise gen.Return(query) 117 | 118 | @staticmethod 119 | def __handle_http_error(e, query): 120 | err = None 121 | if e.code == 500: 122 | # has Druid returned an error? 
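            # (a failed query typically comes back from Druid as a JSON object
            #  such as {"error": ..., "errorMessage": ..., "errorClass": ..., "host": ...};
            #  if the body is not valid JSON -- e.g. an HTML error page from a
            #  proxy -- json.loads raises ValueError below and err stays None)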
123 | try: 124 | err = json.loads(e.response.body.decode("utf-8")) 125 | except ValueError: 126 | pass 127 | else: 128 | err = err.get("error", None) 129 | raise IOError( 130 | "{0}\n Druid Error: {1}\n Query is: {2}".format( 131 | e, err, json.dumps(query.query_dict, indent=4) 132 | ) 133 | ) 134 | 135 | @gen.coroutine 136 | def topn(self, **kwargs): 137 | query = self.query_builder.topn(kwargs) 138 | result = yield self._post(query) 139 | raise gen.Return(result) 140 | 141 | @gen.coroutine 142 | def timeseries(self, **kwargs): 143 | query = self.query_builder.timeseries(kwargs) 144 | result = yield self._post(query) 145 | raise gen.Return(result) 146 | 147 | @gen.coroutine 148 | def groupby(self, **kwargs): 149 | query = self.query_builder.groupby(kwargs) 150 | result = yield self._post(query) 151 | raise gen.Return(result) 152 | 153 | @gen.coroutine 154 | def segment_metadata(self, **kwargs): 155 | query = self.query_builder.segment_metadata(kwargs) 156 | result = yield self._post(query) 157 | raise gen.Return(result) 158 | 159 | @gen.coroutine 160 | def time_boundary(self, **kwargs): 161 | query = self.query_builder.time_boundary(kwargs) 162 | result = yield self._post(query) 163 | raise gen.Return(result) 164 | 165 | @gen.coroutine 166 | def select(self, **kwargs): 167 | query = self.query_builder.select(kwargs) 168 | result = yield self._post(query) 169 | raise gen.Return(result) 170 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 
16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PyDruid.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PyDruid.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 
96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/PyDruid" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PyDruid" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /pydruid/utils/postaggregator.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2013 Metamarkets Group Inc. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | class Postaggregator: 17 | def __init__(self, fn, fields, name): 18 | self.post_aggregator = { 19 | "type": "arithmetic", 20 | "name": name, 21 | "fn": fn, 22 | "fields": fields, 23 | } 24 | self.name = name 25 | 26 | def __mul__(self, other): 27 | return Postaggregator("*", self.fields(other), self.name + "mul" + other.name) 28 | 29 | def __sub__(self, other): 30 | return Postaggregator("-", self.fields(other), self.name + "sub" + other.name) 31 | 32 | def __add__(self, other): 33 | return Postaggregator("+", self.fields(other), self.name + "add" + other.name) 34 | 35 | def __div__(self, other): 36 | return Postaggregator("/", self.fields(other), self.name + "div" + other.name) 37 | 38 | def __truediv__(self, other): 39 | return self.__div__(other) 40 | 41 | def fields(self, other): 42 | return [self.post_aggregator, other.post_aggregator] 43 | 44 | @staticmethod 45 | def build_post_aggregators(postaggs): 46 | def rename_postagg(new_name, post_aggregator): 47 | post_aggregator["name"] = new_name 48 | return post_aggregator 49 | 50 | return [ 51 | rename_postagg(new_name, postagg.post_aggregator) 52 | for (new_name, postagg) in postaggs.items() 53 | ] 54 | 55 | 56 | class QuantilesDoublesSketchToQuantile(Postaggregator): 57 | def __init__(self, name: str, field_name: str, fraction: float): 58 | self.post_aggregator = { 59 | "type": "quantilesDoublesSketchToQuantile", 60 | "name": name, 61 | "fraction": fraction, 62 | "field": { 63 | "fieldName": field_name, 64 | "name": field_name, 65 | "type": "fieldAccess", 66 | }, 67 | } 68 | 69 | 70 | class Quantile(Postaggregator): 71 | def __init__(self, name, probability): 72 | Postaggregator.__init__(self, None, None, name) 73 | self.post_aggregator = { 74 | "type": "quantile", 75 | "fieldName": name, 76 | "probability": probability, 77 | } 78 | 79 | 80 | class Quantiles(Postaggregator): 81 | def __init__(self, name, probabilities): 82 | Postaggregator.__init__(self, None, None, name) 83 | self.post_aggregator = { 84 | "type": "quantiles", 85 | "fieldName": name, 86 | "probabilities": probabilities, 87 | } 88 | 89 | 90 | class Field(Postaggregator): 91 | def __init__(self, name): 92 | Postaggregator.__init__(self, None, None, name) 93 | self.post_aggregator = {"type": "fieldAccess", "fieldName": name} 94 | 95 | 96 | class Const(Postaggregator): 97 | def __init__(self, value, output_name=None): 98 | 99 | if output_name is None: 100 | name = "const" 101 | else: 102 | name = output_name 103 | 104 | Postaggregator.__init__(self, None, None, name) 105 | self.post_aggregator = {"type": "constant", "name": name, "value": value} 106 | 107 | 108 | class HyperUniqueCardinality(Postaggregator): 109 | def __init__(self, name): 110 | Postaggregator.__init__(self, None, None, name) 111 | self.post_aggregator = {"type": "hyperUniqueCardinality", "fieldName": name} 112 | 113 | 114 | class DoubleGreatest(Postaggregator): 115 | def __init__(self, fields, 
output_name=None): 116 | 117 | if output_name is None: 118 | name = "doubleGreatest" 119 | else: 120 | name = output_name 121 | 122 | Postaggregator.__init__(self, None, None, name) 123 | self.post_aggregator = { 124 | "type": "doubleGreatest", 125 | "name": name, 126 | "fields": [f.post_aggregator for f in fields], 127 | } 128 | 129 | 130 | class DoubleLeast(Postaggregator): 131 | def __init__(self, fields, output_name=None): 132 | 133 | if output_name is None: 134 | name = "doubleLeast" 135 | else: 136 | name = output_name 137 | 138 | Postaggregator.__init__(self, None, None, name) 139 | self.post_aggregator = { 140 | "type": "doubleLeast", 141 | "name": name, 142 | "fields": [f.post_aggregator for f in fields], 143 | } 144 | 145 | 146 | class LongGreatest(Postaggregator): 147 | def __init__(self, fields, output_name=None): 148 | 149 | if output_name is None: 150 | name = "longGreatest" 151 | else: 152 | name = output_name 153 | 154 | Postaggregator.__init__(self, None, None, name) 155 | self.post_aggregator = { 156 | "type": "longGreatest", 157 | "name": name, 158 | "fields": [f.post_aggregator for f in fields], 159 | } 160 | 161 | 162 | class LongLeast(Postaggregator): 163 | def __init__(self, fields, output_name=None): 164 | 165 | if output_name is None: 166 | name = "longLeast" 167 | else: 168 | name = output_name 169 | 170 | Postaggregator.__init__(self, None, None, name) 171 | self.post_aggregator = { 172 | "type": "longLeast", 173 | "name": name, 174 | "fields": [f.post_aggregator for f in fields], 175 | } 176 | 177 | 178 | class ThetaSketchOp(object): 179 | def __init__(self, fn, fields, name): 180 | self.post_aggregator = { 181 | "type": "thetaSketchSetOp", 182 | "name": name, 183 | "func": fn, 184 | "fields": fields, 185 | } 186 | self.name = name 187 | 188 | def __or__(self, other): 189 | return ThetaSketchOp( 190 | "UNION", self.fields(other), self.name + "_OR_" + other.name 191 | ) 192 | 193 | def __and__(self, other): 194 | return ThetaSketchOp( 195 | "INTERSECT", self.fields(other), self.name + "_AND_" + other.name 196 | ) 197 | 198 | def __ne__(self, other): 199 | return ThetaSketchOp( 200 | "NOT", self.fields(other), self.name + "_NOT_" + other.name 201 | ) 202 | 203 | def fields(self, other): 204 | return [self.post_aggregator, other.post_aggregator] 205 | 206 | @staticmethod 207 | def build_post_aggregators(thetasketchops): 208 | def rename_thetasketchop(new_name, thetasketchop): 209 | thetasketchop["name"] = new_name 210 | return thetasketchop 211 | 212 | return [ 213 | rename_thetasketchop(new_name, thetasketchop.post_aggregator) 214 | for (new_name, thetasketchop) in thetasketchops.items() 215 | ] 216 | 217 | 218 | class ThetaSketch(ThetaSketchOp): 219 | def __init__(self, name): 220 | ThetaSketchOp.__init__(self, None, None, name) 221 | self.post_aggregator = {"type": "fieldAccess", "fieldName": name} 222 | 223 | 224 | class ThetaSketchEstimate(Postaggregator): 225 | def __init__(self, fields): 226 | field = ( 227 | fields.post_aggregator 228 | if type(fields) in [ThetaSketch, ThetaSketchOp] 229 | else fields 230 | ) 231 | self.post_aggregator = { 232 | "type": "thetaSketchEstimate", 233 | "name": "thetasketchestimate", 234 | "field": field, 235 | } 236 | self.name = "thetasketchestimate" 237 | -------------------------------------------------------------------------------- /pydruid/db/sqlalchemy.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import text, types, util 2 | from sqlalchemy.engine import default 
3 | from sqlalchemy.sql import compiler 4 | 5 | import pydruid.db 6 | 7 | RESERVED_SCHEMAS = ["INFORMATION_SCHEMA"] 8 | 9 | 10 | jdbc_type_map = { 11 | -6: types.BigInteger, 12 | -5: types.BigInteger, 13 | 1: types.String, 14 | 3: types.Float, 15 | 4: types.BigInteger, 16 | 5: types.BigInteger, 17 | 6: types.Float, 18 | 7: types.Float, 19 | 8: types.Float, 20 | 12: types.String, 21 | 16: types.Boolean, 22 | 91: types.DATE, 23 | 93: types.TIMESTAMP, 24 | 1111: types.BLOB, 25 | } 26 | 27 | 28 | class UniversalSet(object): 29 | def __contains__(self, item): 30 | return True 31 | 32 | 33 | class DruidIdentifierPreparer(compiler.IdentifierPreparer): 34 | reserved_words = UniversalSet() 35 | 36 | 37 | class DruidCompiler(compiler.SQLCompiler): 38 | pass 39 | 40 | 41 | class DruidTypeCompiler(compiler.GenericTypeCompiler): 42 | def visit_REAL(self, type_, **kwargs): 43 | return "DOUBLE" 44 | 45 | def visit_NUMERIC(self, type_, **kwargs): 46 | return "LONG" 47 | 48 | visit_DECIMAL = visit_NUMERIC 49 | visit_INTEGER = visit_NUMERIC 50 | visit_SMALLINT = visit_NUMERIC 51 | visit_BIGINT = visit_NUMERIC 52 | visit_BOOLEAN = visit_NUMERIC 53 | visit_TIMESTAMP = visit_NUMERIC 54 | visit_DATE = visit_NUMERIC 55 | 56 | def visit_CHAR(self, type_, **kwargs): 57 | return "STRING" 58 | 59 | visit_NCHAR = visit_CHAR 60 | visit_VARCHAR = visit_CHAR 61 | visit_NVARCHAR = visit_CHAR 62 | visit_TEXT = visit_CHAR 63 | 64 | def visit_DATETIME(self, type_, **kwargs): 65 | return "LONG" 66 | 67 | def visit_TIME(self, type_, **kwargs): 68 | return "LONG" 69 | 70 | def visit_BLOB(self, type_, **kwargs): 71 | return "COMPLEX" 72 | 73 | visit_CLOB = visit_BLOB 74 | visit_NCLOB = visit_BLOB 75 | visit_VARBINARY = visit_BLOB 76 | visit_BINARY = visit_BLOB 77 | 78 | 79 | class DruidDialect(default.DefaultDialect): 80 | 81 | name = "druid" 82 | scheme = "http" 83 | driver = "rest" 84 | user = None 85 | password = None 86 | preparer = DruidIdentifierPreparer 87 | statement_compiler = DruidCompiler 88 | type_compiler = DruidTypeCompiler 89 | supports_alter = False 90 | supports_pk_autoincrement = False 91 | supports_default_values = False 92 | supports_empty_insert = False 93 | supports_unicode_statements = True 94 | supports_unicode_binds = True 95 | returns_unicode_strings = True 96 | description_encoding = None 97 | supports_native_boolean = True 98 | 99 | def __init__(self, context=None, *args, **kwargs): 100 | super(DruidDialect, self).__init__(*args, **kwargs) 101 | self.context = context or {} 102 | 103 | @classmethod 104 | def dbapi(cls): 105 | return pydruid.db 106 | 107 | def create_connect_args(self, url): 108 | kwargs = { 109 | **url.query, 110 | "host": url.host, 111 | "port": url.port or 8082, 112 | "user": url.username or None, 113 | "password": url.password or None, 114 | "path": url.database, 115 | "scheme": self.scheme, 116 | "context": self.context, 117 | "header": url.query.get("header") == "true", 118 | } 119 | return ([], kwargs) 120 | 121 | def do_ping(self, dbapi_connection) -> bool: 122 | """ 123 | Return if the database can be reached. 124 | """ 125 | try: 126 | dbapi_connection.execute(text("SELECT 1")) 127 | except Exception as ex: 128 | return False 129 | 130 | return True 131 | 132 | def get_schema_names(self, connection, **kwargs): 133 | # Each Druid datasource appears as a table in the "druid" schema. This 134 | # is also the default schema, so Druid datasources can be referenced as 135 | # either druid.dataSourceName or simply dataSourceName. 
136 | result = connection.execute( 137 | text("SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA") 138 | ) 139 | 140 | return [ 141 | row.SCHEMA_NAME for row in result if row.SCHEMA_NAME not in RESERVED_SCHEMAS 142 | ] 143 | 144 | def has_table(self, connection, table_name, schema=None): 145 | query = """ 146 | SELECT COUNT(*) > 0 AS exists_ 147 | FROM INFORMATION_SCHEMA.TABLES 148 | WHERE TABLE_NAME = '{table_name}' 149 | """.format( 150 | table_name=table_name 151 | ) 152 | 153 | result = connection.execute(text(query)) 154 | return result.fetchone().exists_ 155 | 156 | def get_table_names(self, connection, schema=None, **kwargs): 157 | query = "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES" 158 | if schema: 159 | query = "{query} WHERE TABLE_SCHEMA = '{schema}'".format( 160 | query=query, schema=schema 161 | ) 162 | 163 | result = connection.execute(text(query)) 164 | return [row.TABLE_NAME for row in result] 165 | 166 | def get_view_names(self, connection, schema=None, **kwargs): 167 | return [] 168 | 169 | def get_table_options(self, connection, table_name, schema=None, **kwargs): 170 | return {} 171 | 172 | def get_columns(self, connection, table_name, schema=None, **kwargs): 173 | query = """ 174 | SELECT COLUMN_NAME, 175 | JDBC_TYPE, 176 | IS_NULLABLE, 177 | COLUMN_DEFAULT 178 | FROM INFORMATION_SCHEMA.COLUMNS 179 | WHERE TABLE_NAME = '{table_name}' 180 | """.format( 181 | table_name=table_name 182 | ) 183 | if schema: 184 | query = "{query} AND TABLE_SCHEMA = '{schema}'".format( 185 | query=query, schema=schema 186 | ) 187 | 188 | result = connection.execute(text(query)) 189 | 190 | return [ 191 | { 192 | "name": row.COLUMN_NAME, 193 | "type": self._map_jdbc_type(row), 194 | "nullable": get_is_nullable(row.IS_NULLABLE), 195 | "default": get_default(row.COLUMN_DEFAULT), 196 | } 197 | for row in result 198 | ] 199 | 200 | def get_pk_constraint(self, connection, table_name, schema=None, **kwargs): 201 | return {"constrained_columns": [], "name": None} 202 | 203 | def get_foreign_keys(self, connection, table_name, schema=None, **kwargs): 204 | return [] 205 | 206 | def get_check_constraints(self, connection, table_name, schema=None, **kwargs): 207 | return [] 208 | 209 | def get_table_comment(self, connection, table_name, schema=None, **kwargs): 210 | return {"text": ""} 211 | 212 | def get_indexes(self, connection, table_name, schema=None, **kwargs): 213 | return [] 214 | 215 | def get_unique_constraints(self, connection, table_name, schema=None, **kwargs): 216 | return [] 217 | 218 | def get_view_definition(self, connection, view_name, schema=None, **kwargs): 219 | pass 220 | 221 | def do_rollback(self, dbapi_connection): 222 | pass 223 | 224 | def _check_unicode_returns(self, connection, additional_tests=None): 225 | return True 226 | 227 | def _check_unicode_description(self, connection): 228 | return True 229 | 230 | def _map_jdbc_type(self, row): 231 | if row.JDBC_TYPE in jdbc_type_map: 232 | return jdbc_type_map[row.JDBC_TYPE] 233 | util.warn( 234 | "Failed to map column '{row.COLUMN_NAME}' with " 235 | "JDBC type '{row.JDBC_TYPE}' to a sqlalchemy type.".format(row=row) 236 | ) 237 | return types.NullType 238 | 239 | 240 | DruidHTTPDialect = DruidDialect 241 | 242 | 243 | class DruidHTTPSDialect(DruidDialect): 244 | 245 | scheme = "https" 246 | 247 | 248 | def get_is_nullable(druid_is_nullable): 249 | # this should be 'YES' or 'NO'; we default to no 250 | return druid_is_nullable.lower() == "yes" 251 | 252 | 253 | def get_default(druid_column_default): 254 | # currently 
unused, returns '' 255 | return str(druid_column_default) if druid_column_default != "" else None 256 | -------------------------------------------------------------------------------- /tests/test_client.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import textwrap 3 | import urllib 4 | from io import StringIO 5 | from unittest.mock import Mock, patch 6 | 7 | import pytest 8 | 9 | from pydruid.client import PyDruid 10 | from pydruid.query import Query 11 | from pydruid.utils.aggregators import doublesum 12 | from pydruid.utils.filters import Dimension 13 | 14 | 15 | def create_client(http_headers=None): 16 | return PyDruid("http://localhost:8083", "druid/v2/", http_headers=http_headers) 17 | 18 | 19 | def create_blank_query(): 20 | return Query({}, "none") 21 | 22 | 23 | def _http_error(code, msg, data=""): 24 | # Need a file-like object for the response data 25 | fp = StringIO(data) 26 | return urllib.error.HTTPError( 27 | url="http://fakeurl:8080/druid/v2/", hdrs={}, code=code, msg=msg, fp=fp 28 | ) 29 | 30 | 31 | class TestPyDruid: 32 | @patch("pydruid.client.urllib.request.urlopen") 33 | def test_druid_returns_error(self, mock_urlopen): 34 | # given 35 | mock_urlopen.side_effect = _http_error(500, "Druid error") 36 | client = create_client() 37 | 38 | # when / then 39 | with pytest.raises(IOError): 40 | client.topn( 41 | datasource="testdatasource", 42 | granularity="all", 43 | intervals="2015-12-29/pt1h", 44 | aggregations={"count": doublesum("count")}, 45 | dimension="user_name", 46 | metric="count", 47 | filter=Dimension("user_lang") == "en", 48 | threshold=1, 49 | context={"timeout": 1000}, 50 | ) 51 | 52 | @patch("pydruid.client.urllib.request.urlopen") 53 | def test_druid_returns_html_error(self, mock_urlopen): 54 | # given 55 | message = textwrap.dedent( 56 | """ 57 | 58 | 59 | 60 | Error 500 61 | 62 | 63 |

HTTP ERROR: 500

64 | <p>Problem accessing /druid/v2/. Reason:
65 | <pre>    javax.servlet.ServletException: java.lang.OutOfMemoryError: GC overhead limit exceeded</pre></p>
66 | <hr /><i><small>Powered by Jetty:// 9.3.19.v20170502</small></i>
67 | 68 | 69 | """ 70 | ).strip() 71 | mock_urlopen.side_effect = _http_error(500, "Internal Server Error", message) 72 | client = create_client() 73 | 74 | # when / then 75 | with pytest.raises(IOError) as e: 76 | client.topn( 77 | datasource="testdatasource", 78 | granularity="all", 79 | intervals="2015-12-29/pt1h", 80 | aggregations={"count": doublesum("count")}, 81 | dimension="user_name", 82 | metric="count", 83 | filter=Dimension("user_lang") == "en", 84 | threshold=1, 85 | context={"timeout": 1000}, 86 | ) 87 | 88 | assert ( 89 | str(e.value) 90 | == textwrap.dedent( 91 | """ 92 | HTTP Error 500: Internal Server Error 93 | Druid Error: javax.servlet.ServletException: java.lang.OutOfMemoryError: GC overhead limit exceeded 94 | Query is: { 95 | "aggregations": [ 96 | { 97 | "fieldName": "count", 98 | "name": "count", 99 | "type": "doubleSum" 100 | } 101 | ], 102 | "context": { 103 | "timeout": 1000 104 | }, 105 | "dataSource": "testdatasource", 106 | "dimension": "user_name", 107 | "filter": { 108 | "dimension": "user_lang", 109 | "type": "selector", 110 | "value": "en" 111 | }, 112 | "granularity": "all", 113 | "intervals": "2015-12-29/pt1h", 114 | "metric": "count", 115 | "queryType": "topN", 116 | "threshold": 1 117 | } 118 | """ 119 | ).strip() 120 | ) 121 | 122 | @patch("pydruid.client.urllib.request.urlopen") 123 | def test_druid_returns_results(self, mock_urlopen): 124 | # given 125 | response = Mock() 126 | response.read.return_value = """ 127 | [ { 128 | "timestamp" : "2015-12-30T14:14:49.000Z", 129 | "result" : [ { 130 | "dimension" : "aaaa", 131 | "metric" : 100 132 | } ] 133 | } ] 134 | """.encode( 135 | "utf-8" 136 | ) 137 | mock_urlopen.return_value = response 138 | client = create_client() 139 | 140 | # when 141 | top = client.topn( 142 | datasource="testdatasource", 143 | granularity="all", 144 | intervals="2015-12-29/pt1h", 145 | aggregations={"count": doublesum("count")}, 146 | dimension="user_name", 147 | metric="count", 148 | filter=Dimension("user_lang") == "en", 149 | threshold=1, 150 | context={"timeout": 1000}, 151 | ) 152 | 153 | # then 154 | assert top is not None 155 | assert len(top.result) == 1 156 | assert len(top.result[0]["result"]) == 1 157 | 158 | @patch("pydruid.client.urllib.request.urlopen") 159 | def test_client_allows_to_export_last_query(self, mock_urlopen): 160 | # given 161 | response = Mock() 162 | response.read.return_value = """ 163 | [ { 164 | "timestamp" : "2015-12-30T14:14:49.000Z", 165 | "result" : [ { 166 | "dimension" : "aaaa", 167 | "metric" : 100 168 | } ] 169 | } ] 170 | """.encode( 171 | "utf-8" 172 | ) 173 | mock_urlopen.return_value = response 174 | client = create_client() 175 | client.topn( 176 | datasource="testdatasource", 177 | granularity="all", 178 | intervals="2015-12-29/pt1h", 179 | aggregations={"count": doublesum("count")}, 180 | dimension="user_name", 181 | metric="count", 182 | filter=Dimension("user_lang") == "en", 183 | threshold=1, 184 | context={"timeout": 1000}, 185 | ) 186 | 187 | # when / then 188 | # assert that last_query.export_tsv method was called (it should throw an exception, given empty path) 189 | with pytest.raises(TypeError): 190 | client.export_tsv(None) 191 | 192 | def test_client_auth_creds(self): 193 | client = create_client() 194 | query = create_blank_query() 195 | client.set_basic_auth_credentials("myUsername", "myPassword") 196 | headers, _, _ = client._prepare_url_headers_and_body(query) 197 | assert headers["Authorization"] == "Basic bXlVc2VybmFtZTpteVBhc3N3b3Jk" 198 | 199 | def 
test_client_custom_headers(self): 200 | client = create_client(http_headers = {"custom-header": "test"}) 201 | query = create_blank_query() 202 | headers, _, _ = client._prepare_url_headers_and_body(query) 203 | assert headers["custom-header"] == "test" 204 | 205 | @patch("pydruid.client.urllib.request.urlopen") 206 | @patch("pydruid.client.ssl.create_default_context") 207 | def test_client_with_cafile(self, mock_create_default_context, mock_urlopen): 208 | response = Mock() 209 | response.read.return_value = """ 210 | [ { 211 | "timestamp" : "2015-12-30T14:14:49.000Z", 212 | "result" : [ { 213 | "dimension" : "aaaa", 214 | "metric" : 100 215 | } ] 216 | } ] 217 | """.encode( 218 | "utf-8" 219 | ) 220 | mock_urlopen.return_value = response 221 | 222 | client = PyDruid("http://localhost:8083", "druid/v2/", cafile="tests/cert.pem") 223 | 224 | mock_create_default_context.assert_called_once() 225 | context = mock_create_default_context.return_value 226 | context.load_verify_locations.assert_called_once_with(cafile="tests/cert.pem") 227 | assert client.context == context 228 | 229 | client.topn() 230 | assert mock_urlopen.called_with(context=client.context) 231 | -------------------------------------------------------------------------------- /tests/utils/test_aggregators.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | from copy import deepcopy 4 | from operator import itemgetter 5 | 6 | from pydruid.utils import aggregators, filters 7 | 8 | 9 | class TestAggregators: 10 | def test_aggregators(self): 11 | aggs = [ 12 | ("longsum", "longSum"), 13 | ("longmin", "longMin"), 14 | ("longmax", "longMax"), 15 | ("doublesum", "doubleSum"), 16 | ("doublemin", "doubleMin"), 17 | ("doublemax", "doubleMax"), 18 | ("count", "count"), 19 | ("hyperunique", "hyperUnique"), 20 | ("stringfirst", "stringFirst"), 21 | ("stringlast", "stringLast"), 22 | ] 23 | aggs_funcs = [ 24 | (getattr(aggregators, agg_name), agg_type) for agg_name, agg_type in aggs 25 | ] 26 | for f, agg_type in aggs_funcs: 27 | assert f("metric") == {"type": agg_type, "fieldName": "metric"} 28 | 29 | def test_filtered_aggregator(self): 30 | filter_ = filters.Filter(dimension="dim", value="val") 31 | aggs = [ 32 | aggregators.count("metric1"), 33 | aggregators.longsum("metric2"), 34 | aggregators.doublesum("metric3"), 35 | aggregators.doublemin("metric4"), 36 | aggregators.doublemax("metric5"), 37 | aggregators.hyperunique("metric6"), 38 | aggregators.cardinality("dim1"), 39 | aggregators.cardinality(["dim1", "dim2"], by_row=True), 40 | aggregators.thetasketch("dim1"), 41 | aggregators.thetasketch("metric7"), 42 | aggregators.thetasketch("metric8", isinputthetasketch=True, size=8192), 43 | ] 44 | for agg in aggs: 45 | expected = { 46 | "type": "filtered", 47 | "filter": {"type": "selector", "dimension": "dim", "value": "val"}, 48 | "aggregator": agg, 49 | } 50 | actual = aggregators.filtered(filter_, agg) 51 | assert actual == expected 52 | 53 | def test_nested_filtered_aggregator(self): 54 | filter1 = filters.Filter(dimension="dim1", value="val") 55 | filter2 = filters.Filter(dimension="dim2", value="val") 56 | agg = aggregators.filtered( 57 | filter1, aggregators.filtered(filter2, aggregators.count("metric1")) 58 | ) 59 | actual = aggregators.build_aggregators({"agg_name": agg}) 60 | # the innermost aggregation must have 'agg_name' 61 | expected = [ 62 | { 63 | "type": "filtered", 64 | "aggregator": { 65 | "type": "filtered", 66 | "aggregator": { 67 | "fieldName": 
"metric1", 68 | "type": "count", 69 | "name": "agg_name", 70 | }, 71 | "filter": {"dimension": "dim2", "value": "val", "type": "selector"}, 72 | }, 73 | "filter": {"dimension": "dim1", "value": "val", "type": "selector"}, 74 | } 75 | ] 76 | assert expected == actual 77 | 78 | def test_build_aggregators(self): 79 | agg_input = { 80 | "agg1": aggregators.count("metric1"), 81 | "agg2": aggregators.longsum("metric2"), 82 | "agg3": aggregators.doublesum("metric3"), 83 | "agg4": aggregators.doublemin("metric4"), 84 | "agg5": aggregators.doublemax("metric5"), 85 | "agg6": aggregators.hyperunique("metric6"), 86 | "agg7": aggregators.cardinality("dim1"), 87 | "agg8": aggregators.cardinality(["dim1", "dim2"], by_row=True), 88 | "agg9": aggregators.thetasketch("dim1"), 89 | "agg10": aggregators.thetasketch("metric7"), 90 | "agg11": aggregators.thetasketch( 91 | "metric8", isinputthetasketch=True, size=8192 92 | ), 93 | } 94 | built_agg = aggregators.build_aggregators(agg_input) 95 | expected = [ 96 | {"name": "agg1", "type": "count", "fieldName": "metric1"}, 97 | {"name": "agg2", "type": "longSum", "fieldName": "metric2"}, 98 | {"name": "agg3", "type": "doubleSum", "fieldName": "metric3"}, 99 | {"name": "agg4", "type": "doubleMin", "fieldName": "metric4"}, 100 | {"name": "agg5", "type": "doubleMax", "fieldName": "metric5"}, 101 | {"name": "agg6", "type": "hyperUnique", "fieldName": "metric6"}, 102 | { 103 | "name": "agg7", 104 | "type": "cardinality", 105 | "fieldNames": ["dim1"], 106 | "byRow": False, 107 | }, 108 | { 109 | "name": "agg8", 110 | "type": "cardinality", 111 | "fieldNames": ["dim1", "dim2"], 112 | "byRow": True, 113 | }, 114 | { 115 | "name": "agg9", 116 | "type": "thetaSketch", 117 | "fieldName": "dim1", 118 | "isInputThetaSketch": False, 119 | "size": 16384, 120 | }, 121 | { 122 | "name": "agg10", 123 | "type": "thetaSketch", 124 | "fieldName": "metric7", 125 | "isInputThetaSketch": False, 126 | "size": 16384, 127 | }, 128 | { 129 | "name": "agg11", 130 | "type": "thetaSketch", 131 | "fieldName": "metric8", 132 | "isInputThetaSketch": True, 133 | "size": 8192, 134 | }, 135 | ] 136 | assert sorted(built_agg, key=itemgetter("name")) == sorted( 137 | expected, key=itemgetter("name") 138 | ) 139 | 140 | def test_build_filtered_aggregator(self): 141 | filter_ = filters.Filter(dimension="dim", value="val") 142 | agg_input = { 143 | "agg1": aggregators.filtered(filter_, aggregators.count("metric1")), 144 | "agg2": aggregators.filtered(filter_, aggregators.longsum("metric2")), 145 | "agg3": aggregators.filtered(filter_, aggregators.doublesum("metric3")), 146 | "agg4": aggregators.filtered(filter_, aggregators.doublemin("metric4")), 147 | "agg5": aggregators.filtered(filter_, aggregators.doublemax("metric5")), 148 | "agg6": aggregators.filtered(filter_, aggregators.hyperunique("metric6")), 149 | "agg7": aggregators.filtered(filter_, aggregators.cardinality("dim1")), 150 | "agg8": aggregators.filtered( 151 | filter_, aggregators.cardinality(["dim1", "dim2"], by_row=True) 152 | ), 153 | "agg9": aggregators.filtered(filter_, aggregators.thetasketch("dim1")), 154 | "agg10": aggregators.filtered(filter_, aggregators.thetasketch("metric7")), 155 | "agg11": aggregators.filtered( 156 | filter_, 157 | aggregators.thetasketch("metric8", isinputthetasketch=True, size=8192), 158 | ), 159 | } 160 | base = { 161 | "type": "filtered", 162 | "filter": {"type": "selector", "dimension": "dim", "value": "val"}, 163 | } 164 | 165 | aggs = [ 166 | {"name": "agg1", "type": "count", "fieldName": "metric1"}, 167 | 
{"name": "agg2", "type": "longSum", "fieldName": "metric2"}, 168 | {"name": "agg3", "type": "doubleSum", "fieldName": "metric3"}, 169 | {"name": "agg4", "type": "doubleMin", "fieldName": "metric4"}, 170 | {"name": "agg5", "type": "doubleMax", "fieldName": "metric5"}, 171 | {"name": "agg6", "type": "hyperUnique", "fieldName": "metric6"}, 172 | { 173 | "name": "agg7", 174 | "type": "cardinality", 175 | "fieldNames": ["dim1"], 176 | "byRow": False, 177 | }, 178 | { 179 | "name": "agg8", 180 | "type": "cardinality", 181 | "fieldNames": ["dim1", "dim2"], 182 | "byRow": True, 183 | }, 184 | { 185 | "name": "agg9", 186 | "type": "thetaSketch", 187 | "fieldName": "dim1", 188 | "isInputThetaSketch": False, 189 | "size": 16384, 190 | }, 191 | { 192 | "name": "agg10", 193 | "type": "thetaSketch", 194 | "fieldName": "metric7", 195 | "isInputThetaSketch": False, 196 | "size": 16384, 197 | }, 198 | { 199 | "name": "agg11", 200 | "type": "thetaSketch", 201 | "fieldName": "metric8", 202 | "isInputThetaSketch": True, 203 | "size": 8192, 204 | }, 205 | ] 206 | expected = [] 207 | for agg in aggs: 208 | exp = deepcopy(base) 209 | exp.update({"aggregator": agg}) 210 | expected.append(exp) 211 | 212 | built_agg = aggregators.build_aggregators(agg_input) 213 | expected = sorted( 214 | built_agg, key=lambda k: itemgetter("name")(itemgetter("aggregator")(k)) 215 | ) 216 | actual = sorted( 217 | expected, key=lambda k: itemgetter("name")(itemgetter("aggregator")(k)) 218 | ) 219 | assert expected == actual 220 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # PyDruid documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Mar 3 16:38:17 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import os 16 | import sys 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | # sys.path.insert(0, os.path.abspath('.')) 22 | 23 | sys.path.insert(0, os.path.abspath("../../pydruid")) 24 | sys.path.insert(0, os.path.abspath("../../pydruid/pydruid")) 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.doctest"] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ["_templates"] 38 | 39 | # The suffix of source filenames. 40 | source_suffix = ".rst" 41 | 42 | # The encoding of source files. 43 | # source_encoding = 'utf-8-sig' 44 | 45 | # The master toctree document. 46 | master_doc = "index" 47 | 48 | # General information about the project. 
49 | project = u"PyDruid" 50 | copyright = u"2014, Deep Ganguli" 51 | 52 | # The version info for the project you're documenting, acts as replacement for 53 | # |version| and |release|, also used in various other places throughout the 54 | # built documents. 55 | # 56 | # The short X.Y version. 57 | version = "0.2.0" 58 | # The full version, including alpha/beta/rc tags. 59 | release = "0.2.0" 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # language = None 64 | 65 | # There are two options for replacing |today|: either, you set today to some 66 | # non-false value, then it is used: 67 | # today = '' 68 | # Else, today_fmt is used as the format for a strftime call. 69 | # today_fmt = '%B %d, %Y' 70 | 71 | # List of patterns, relative to source directory, that match files and 72 | # directories to ignore when looking for source files. 73 | exclude_patterns = [] 74 | 75 | # The reST default role (used for this markup: `text`) to use for all 76 | # documents. 77 | # default_role = None 78 | 79 | # If true, '()' will be appended to :func: etc. cross-reference text. 80 | # add_function_parentheses = True 81 | 82 | # If true, the current module name will be prepended to all description 83 | # unit titles (such as .. function::). 84 | # add_module_names = True 85 | 86 | # If true, sectionauthor and moduleauthor directives will be shown in the 87 | # output. They are ignored by default. 88 | # show_authors = False 89 | 90 | # The name of the Pygments (syntax highlighting) style to use. 91 | pygments_style = "sphinx" 92 | 93 | # A list of ignored prefixes for module index sorting. 94 | # modindex_common_prefix = [] 95 | 96 | # If true, keep warnings as "system message" paragraphs in the built documents. 97 | # keep_warnings = False 98 | 99 | 100 | # -- Options for HTML output ---------------------------------------------- 101 | 102 | # The theme to use for HTML and HTML Help pages. See the documentation for 103 | # a list of builtin themes. 104 | html_theme = "default" 105 | 106 | # Theme options are theme-specific and customize the look and feel of a theme 107 | # further. For a list of options available for each theme, see the 108 | # documentation. 109 | # html_theme_options = {} 110 | 111 | # Add any paths that contain custom themes here, relative to this directory. 112 | # html_theme_path = [] 113 | 114 | # The name for this set of Sphinx documents. If None, it defaults to 115 | # " v documentation". 116 | # html_title = None 117 | 118 | # A shorter title for the navigation bar. Default is the same as html_title. 119 | # html_short_title = None 120 | 121 | # The name of an image file (relative to this directory) to place at the top 122 | # of the sidebar. 123 | # html_logo = None 124 | 125 | # The name of an image file (within the static path) to use as favicon of the 126 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 127 | # pixels large. 128 | # html_favicon = None 129 | 130 | # Add any paths that contain custom static files (such as style sheets) here, 131 | # relative to this directory. They are copied after the builtin static files, 132 | # so a file named "default.css" will overwrite the builtin "default.css". 133 | html_static_path = ["_static"] 134 | 135 | # Add any extra paths that contain custom files (such as robots.txt or 136 | # .htaccess) here, relative to this directory. These files are copied 137 | # directly to the root of the documentation. 
138 | # html_extra_path = [] 139 | 140 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 141 | # using the given strftime format. 142 | # html_last_updated_fmt = '%b %d, %Y' 143 | 144 | # If true, SmartyPants will be used to convert quotes and dashes to 145 | # typographically correct entities. 146 | # html_use_smartypants = True 147 | 148 | # Custom sidebar templates, maps document names to template names. 149 | # html_sidebars = {} 150 | 151 | # Additional templates that should be rendered to pages, maps page names to 152 | # template names. 153 | # html_additional_pages = {} 154 | 155 | # If false, no module index is generated. 156 | # html_domain_indices = True 157 | 158 | # If false, no index is generated. 159 | # html_use_index = True 160 | 161 | # If true, the index is split into individual pages for each letter. 162 | # html_split_index = False 163 | 164 | # If true, links to the reST sources are added to the pages. 165 | # html_show_sourcelink = True 166 | 167 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 168 | # html_show_sphinx = True 169 | 170 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 171 | # html_show_copyright = True 172 | 173 | # If true, an OpenSearch description file will be output, and all pages will 174 | # contain a tag referring to it. The value of this option must be the 175 | # base URL from which the finished HTML is served. 176 | # html_use_opensearch = '' 177 | 178 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 179 | # html_file_suffix = None 180 | 181 | # Output file base name for HTML help builder. 182 | htmlhelp_basename = "PyDruiddoc" 183 | 184 | 185 | # -- Options for LaTeX output --------------------------------------------- 186 | 187 | latex_elements = { 188 | # The paper size ('letterpaper' or 'a4paper'). 189 | #'papersize': 'letterpaper', 190 | # The font size ('10pt', '11pt' or '12pt'). 191 | #'pointsize': '10pt', 192 | # Additional stuff for the LaTeX preamble. 193 | #'preamble': '', 194 | } 195 | 196 | # Grouping the document tree into LaTeX files. List of tuples 197 | # (source start file, target name, title, 198 | # author, documentclass [howto, manual, or own class]). 199 | latex_documents = [ 200 | ("index", "PyDruid.tex", u"PyDruid Documentation", u"Deep Ganguli", "manual") 201 | ] 202 | 203 | # The name of an image file (relative to this directory) to place at the top of 204 | # the title page. 205 | # latex_logo = None 206 | 207 | # For "manual" documents, if this is true, then toplevel headings are parts, 208 | # not chapters. 209 | # latex_use_parts = False 210 | 211 | # If true, show page references after internal links. 212 | # latex_show_pagerefs = False 213 | 214 | # If true, show URL addresses after external links. 215 | # latex_show_urls = False 216 | 217 | # Documents to append as an appendix to all manuals. 218 | # latex_appendices = [] 219 | 220 | # If false, no module index is generated. 221 | # latex_domain_indices = True 222 | 223 | 224 | # -- Options for manual page output --------------------------------------- 225 | 226 | # One entry per manual page. List of tuples 227 | # (source start file, name, description, authors, manual section). 228 | man_pages = [("index", "pydruid", u"PyDruid Documentation", [u"Deep Ganguli"], 1)] 229 | 230 | # If true, show URL addresses after external links. 
231 | # man_show_urls = False 232 | 233 | 234 | # -- Options for Texinfo output ------------------------------------------- 235 | 236 | # Grouping the document tree into Texinfo files. List of tuples 237 | # (source start file, target name, title, author, 238 | # dir menu entry, description, category) 239 | texinfo_documents = [ 240 | ( 241 | "index", 242 | "PyDruid", 243 | u"PyDruid Documentation", 244 | u"Deep Ganguli", 245 | "PyDruid", 246 | "One line description of project.", 247 | "Miscellaneous", 248 | ) 249 | ] 250 | 251 | # Documents to append as an appendix to all manuals. 252 | # texinfo_appendices = [] 253 | 254 | # If false, no module index is generated. 255 | # texinfo_domain_indices = True 256 | 257 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 258 | # texinfo_show_urls = 'footnote' 259 | 260 | # If true, do not generate a @detailmenu in the "Top" node's menu. 261 | # texinfo_no_detailmenu = False 262 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pydruid 2 | 3 | pydruid exposes a simple API to create, execute, and analyze [Druid](http://druid.io/) queries. pydruid can parse query results into [Pandas](http://pandas.pydata.org/) DataFrame objects for subsequent data analysis -- this offers a tight integration between [Druid](http://druid.io/), the [SciPy](http://www.scipy.org/stackspec.html) stack (for scientific computing) and [scikit-learn](http://scikit-learn.org/stable/) (for machine learning). pydruid can export query results into TSV or JSON for further processing with your favorite tool, e.g., R, Julia, Matlab, Excel. It provides both synchronous and asynchronous clients. 4 | 5 | Additionally, pydruid implements the [Python DB API 2.0](https://www.python.org/dev/peps/pep-0249/), a [SQLAlchemy dialect](http://docs.sqlalchemy.org/en/latest/dialects/), and a provides a command line interface to interact with Druid. 6 | 7 | To install: 8 | ```python 9 | pip install pydruid 10 | # or, if you intend to use asynchronous client 11 | pip install pydruid[async] 12 | # or, if you intend to export query results into pandas 13 | pip install pydruid[pandas] 14 | # or, if you intend to do both 15 | pip install pydruid[async, pandas] 16 | # or, if you want to use the SQLAlchemy engine 17 | pip install pydruid[sqlalchemy] 18 | # or, if you want to use the CLI 19 | pip install pydruid[cli] 20 | ``` 21 | Documentation: https://pythonhosted.org/pydruid/. 22 | 23 | # examples 24 | 25 | The following exampes show how to execute and analyze the results of three types of queries: timeseries, topN, and groupby. We will use these queries to ask simple questions about twitter's public data set. 26 | 27 | ## timeseries 28 | 29 | What was the average tweet length, per day, surrounding the 2014 Sochi olympics? 
30 | 31 | ```python 32 | from pydruid.client import * 33 | from pylab import plt 34 | 35 | query = PyDruid(druid_url_goes_here, 'druid/v2') 36 | 37 | ts = query.timeseries( 38 | datasource='twitterstream', 39 | granularity='day', 40 | intervals='2014-02-02/p4w', 41 | aggregations={'length': doublesum('tweet_length'), 'count': doublesum('count')}, 42 | post_aggregations={'avg_tweet_length': (Field('length') / Field('count'))}, 43 | filter=Dimension('first_hashtag') == 'sochi2014' 44 | ) 45 | df = query.export_pandas() 46 | df['timestamp'] = df['timestamp'].map(lambda x: x.split('T')[0]) 47 | df.plot(x='timestamp', y='avg_tweet_length', ylim=(80, 140), rot=20, 48 | title='Sochi 2014') 49 | plt.ylabel('avg tweet length (chars)') 50 | plt.show() 51 | ``` 52 | 53 | ![alt text](https://github.com/metamx/pydruid/raw/master/docs/figures/avg_tweet_length.png "Avg. tweet length") 54 | 55 | ## topN 56 | 57 | Who were the top ten mentions (@user_name) during the 2014 Oscars? 58 | 59 | ```python 60 | top = query.topn( 61 | datasource='twitterstream', 62 | granularity='all', 63 | intervals='2014-03-03/p1d', # utc time of 2014 oscars 64 | aggregations={'count': doublesum('count')}, 65 | dimension='user_mention_name', 66 | filter=(Dimension('user_lang') == 'en') & (Dimension('first_hashtag') == 'oscars') & 67 | (Dimension('user_time_zone') == 'Pacific Time (US & Canada)') & 68 | ~(Dimension('user_mention_name') == 'No Mention'), 69 | metric='count', 70 | threshold=10 71 | ) 72 | 73 | df = query.export_pandas() 74 | print df 75 | 76 | count timestamp user_mention_name 77 | 0 1303 2014-03-03T00:00:00.000Z TheEllenShow 78 | 1 44 2014-03-03T00:00:00.000Z TheAcademy 79 | 2 21 2014-03-03T00:00:00.000Z MTV 80 | 3 21 2014-03-03T00:00:00.000Z peoplemag 81 | 4 17 2014-03-03T00:00:00.000Z THR 82 | 5 16 2014-03-03T00:00:00.000Z ItsQueenElsa 83 | 6 16 2014-03-03T00:00:00.000Z eonline 84 | 7 15 2014-03-03T00:00:00.000Z PerezHilton 85 | 8 14 2014-03-03T00:00:00.000Z realjohngreen 86 | 9 12 2014-03-03T00:00:00.000Z KevinSpacey 87 | 88 | ``` 89 | 90 | ## groupby 91 | 92 | What does the social network of users replying to other users look like? 
93 | 94 | ```python 95 | from igraph import * 96 | from cairo import * 97 | from pandas import concat 98 | 99 | group = query.groupby( 100 | datasource='twitterstream', 101 | granularity='hour', 102 | intervals='2013-10-04/pt12h', 103 | dimensions=["user_name", "reply_to_name"], 104 | filter=(~(Dimension("reply_to_name") == "Not A Reply")) & 105 | (Dimension("user_location") == "California"), 106 | aggregations={"count": doublesum("count")} 107 | ) 108 | 109 | df = query.export_pandas() 110 | 111 | # map names to categorical variables with a lookup table 112 | names = concat([df['user_name'], df['reply_to_name']]).unique() 113 | nameLookup = dict([pair[::-1] for pair in enumerate(names)]) 114 | df['user_name_lookup'] = df['user_name'].map(nameLookup.get) 115 | df['reply_to_name_lookup'] = df['reply_to_name'].map(nameLookup.get) 116 | 117 | # create the graph with igraph 118 | g = Graph(len(names), directed=False) 119 | vertices = zip(df['user_name_lookup'], df['reply_to_name_lookup']) 120 | g.vs["name"] = names 121 | g.add_edges(vertices) 122 | layout = g.layout_fruchterman_reingold() 123 | plot(g, "tweets.png", layout=layout, vertex_size=2, bbox=(400, 400), margin=25, edge_width=1, vertex_color="blue") 124 | ``` 125 | 126 | ![alt text](https://github.com/metamx/pydruid/raw/master/docs/figures/twitter_graph.png "Social Network") 127 | 128 | # asynchronous client 129 | ```pydruid.async_client.AsyncPyDruid``` implements an asynchronous client. To achieve that, it utilizes an asynchronous 130 | HTTP client from ```Tornado``` framework. The asynchronous client is suitable for use with async frameworks such as Tornado 131 | and provides much better performance at scale. It lets you serve multiple requests at the same time, without blocking on 132 | Druid executing your queries. 133 | 134 | ## example 135 | ```python 136 | from tornado import gen 137 | from pydruid.async_client import AsyncPyDruid 138 | from pydruid.utils.aggregators import longsum 139 | from pydruid.utils.filters import Dimension 140 | 141 | client = AsyncPyDruid(url_to_druid_broker, 'druid/v2') 142 | 143 | @gen.coroutine 144 | def your_asynchronous_method_serving_top10_mentions_for_day(day 145 | top_mentions = yield client.topn( 146 | datasource='twitterstream', 147 | granularity='all', 148 | intervals="%s/p1d" % (day, ), 149 | aggregations={'count': doublesum('count')}, 150 | dimension='user_mention_name', 151 | filter=(Dimension('user_lang') == 'en') & (Dimension('first_hashtag') == 'oscars') & 152 | (Dimension('user_time_zone') == 'Pacific Time (US & Canada)') & 153 | ~(Dimension('user_mention_name') == 'No Mention'), 154 | metric='count', 155 | threshold=10) 156 | 157 | # asynchronously return results 158 | # can be simply ```return top_mentions``` in python 3.x 159 | raise gen.Return(top_mentions) 160 | ``` 161 | 162 | 163 | # thetaSketches 164 | Theta sketch Post aggregators are built slightly differently to normal Post Aggregators, as they have different operators. 165 | Note: you must have the ```druid-datasketches``` extension loaded into your Druid cluster in order to use these. 166 | See the [Druid datasketches](http://druid.io/docs/latest/development/extensions-core/datasketches-aggregators.html) documentation for details. 
167 | 168 | ```python 169 | from pydruid.client import * 170 | from pydruid.utils import aggregators 171 | from pydruid.utils import filters 172 | from pydruid.utils import postaggregator 173 | 174 | query = PyDruid(url_to_druid_broker, 'druid/v2') 175 | ts = query.groupby( 176 | datasource='test_datasource', 177 | granularity='all', 178 | intervals='2016-09-01/P1M', 179 | filter=filters.Dimension('product').in_(['product_A', 'product_B']), 180 | aggregations={ 181 | 'product_A_users': aggregators.filtered( 182 | filters.Dimension('product') == 'product_A', 183 | aggregators.thetasketch('user_id') 184 | ), 185 | 'product_B_users': aggregators.filtered( 186 | filters.Dimension('product') == 'product_B', 187 | aggregators.thetasketch('user_id') 188 | ) 189 | }, 190 | post_aggregations={ 191 | 'both_A_and_B': postaggregator.ThetaSketchEstimate( 192 | postaggregator.ThetaSketch('product_A_users') & postaggregator.ThetaSketch('product_B_users') 193 | ) 194 | } 195 | ) 196 | ``` 197 | 198 | # DB API 199 | 200 | ```python 201 | from pydruid.db import connect 202 | 203 | conn = connect(host='localhost', port=8082, path='/druid/v2/sql/', scheme='http') 204 | curs = conn.cursor() 205 | curs.execute(""" 206 | SELECT place, 207 | CAST(REGEXP_EXTRACT(place, '(.*),', 1) AS FLOAT) AS lat, 208 | CAST(REGEXP_EXTRACT(place, ',(.*)', 1) AS FLOAT) AS lon 209 | FROM places 210 | LIMIT 10 211 | """) 212 | for row in curs: 213 | print(row) 214 | ``` 215 | 216 | # SQLAlchemy 217 | 218 | ```python 219 | from sqlalchemy import * 220 | from sqlalchemy.engine import create_engine 221 | from sqlalchemy.schema import * 222 | 223 | engine = create_engine('druid://localhost:8082/druid/v2/sql/') # uses HTTP by default :( 224 | # engine = create_engine('druid+http://localhost:8082/druid/v2/sql/') 225 | # engine = create_engine('druid+https://localhost:8082/druid/v2/sql/') 226 | 227 | places = Table('places', MetaData(bind=engine), autoload=True) 228 | print(select([func.count('*')], from_obj=places).scalar()) 229 | ``` 230 | 231 | 232 | ## Column headers 233 | 234 | In version 0.13.0 Druid SQL added support for including the column names in the 235 | response, which can be requested via the "header" field in the request. This 236 | helps to ensure that the cursor description is defined (which is a requirement 237 | for SQLAlchemy query statements) regardless of whether the result set contains 238 | any rows. Historically this was problematic for result sets that contained no 239 | rows, as one could not infer the expected column names. 240 | 241 | The header can be enabled via the SQLAlchemy URI by using a query 242 | parameter, i.e., 243 | 244 | ```python 245 | engine = create_engine('druid://localhost:8082/druid/v2/sql?header=true') 246 | ``` 247 | 248 | Note that the current default is `false` to ensure backwards compatibility, but it should 249 | be set to `true` for Druid versions >= 0.13.0. 250 | 251 | 252 | # Command line 253 | 254 | ```bash 255 | $ pydruid http://localhost:8082/druid/v2/sql/ 256 | > SELECT COUNT(*) AS cnt FROM places 257 | cnt 258 | ----- 259 | 12345 260 | > SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES; 261 | TABLE_NAME 262 | ---------- 263 | test_table 264 | COLUMNS 265 | SCHEMATA 266 | TABLES 267 | > BYE; 268 | GoodBye! 269 | ``` 270 | 271 | # Contributing 272 | 273 | Contributions are of course welcome. We like to use `black` and `flake8`.
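Once the dev requirements from the snippet below are installed, the `black` and `flake8` checks can also be run by hand; a minimal sketch, assuming the standard command-line interfaces of both tools (the paths shown are illustrative):

```bash
# verify formatting without rewriting files, then lint
black --check pydruid tests setup.py
flake8 pydruid setup.py
```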
274 | 275 | ```bash 276 | pip install -r requirements-dev.txt # installs useful dev deps 277 | pre-commit install # installs useful commit hooks 278 | ``` 279 | -------------------------------------------------------------------------------- /pydruid/utils/filters.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2013 Metamarkets Group Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | try: 17 | import simplejson as json 18 | except ImportError: 19 | import json 20 | 21 | from .dimensions import build_dimension 22 | 23 | 24 | class Filter: 25 | 26 | # filter types supporting extraction function 27 | _FILTERS_WITH_EXTR_FN = ( 28 | "selector", 29 | "regex", 30 | "javascript", 31 | "in", 32 | "bound", 33 | "interval", 34 | "extraction", 35 | ) 36 | 37 | def __init__(self, extraction_function=None, ordering="lexicographic", **args): 38 | 39 | type_ = args.get("type", "selector") 40 | 41 | if extraction_function is not None: 42 | if type_ not in self._FILTERS_WITH_EXTR_FN: 43 | raise ValueError( 44 | "Filter of type {0} doesn't support " 45 | "extraction function".format(type_) 46 | ) 47 | elif type_ == "extraction": 48 | raise ValueError( 49 | "Filter of type extraction requires extraction " "function" 50 | ) 51 | 52 | self.extraction_function = extraction_function 53 | 54 | self.filter = {"filter": {"type": type_}} 55 | 56 | if type_ == "selector": 57 | self.filter["filter"].update( 58 | {"dimension": args["dimension"], "value": args["value"]} 59 | ) 60 | elif type_ == "javascript": 61 | self.filter["filter"].update( 62 | {"dimension": args["dimension"], "function": args["function"]} 63 | ) 64 | elif type_ == "and": 65 | self.filter["filter"].update({"fields": args["fields"]}) 66 | elif type_ == "or": 67 | self.filter["filter"].update({"fields": args["fields"]}) 68 | elif type_ == "not": 69 | self.filter["filter"].update({"field": args["field"]}) 70 | elif type_ == "in": 71 | self.filter["filter"].update( 72 | {"dimension": args["dimension"], "values": args["values"]} 73 | ) 74 | elif type_ == "regex": 75 | self.filter["filter"].update( 76 | {"dimension": args["dimension"], "pattern": args["pattern"]} 77 | ) 78 | elif type_ == "bound": 79 | self.filter["filter"].update( 80 | { 81 | "dimension": args["dimension"], 82 | "lower": args["lower"], 83 | "lowerStrict": args["lowerStrict"], 84 | "upper": args["upper"], 85 | "upperStrict": args["upperStrict"], 86 | "alphaNumeric": args["alphaNumeric"], 87 | "ordering": ordering, 88 | } 89 | ) 90 | elif type_ == "columnComparison": 91 | self.filter["filter"].update({"dimensions": args["dimensions"]}) 92 | elif type_ == "interval": 93 | self.filter["filter"].update( 94 | {"dimension": args["dimension"], "intervals": args["intervals"]} 95 | ) 96 | elif type_ == "extraction": 97 | self.filter["filter"].update( 98 | {"dimension": args["dimension"], "value": args["value"]} 99 | ) 100 | elif type_ == "search": 101 | self.filter["filter"].update( 102 | { 
103 | "dimension": args["dimension"], 104 | "query": { 105 | "type": "contains", 106 | "value": args["value"], 107 | "caseSensitive": args.get("caseSensitive", "false"), 108 | }, 109 | } 110 | ) 111 | elif type_ == "like": 112 | self.filter["filter"].update( 113 | {"dimension": args["dimension"], "pattern": args["pattern"]} 114 | ) 115 | elif type_ == "spatial": 116 | self.filter["filter"].update( 117 | {"dimension": args["dimension"], "bound": args["bound"]} 118 | ) 119 | else: 120 | raise NotImplementedError("Filter type: {0} does not exist".format(type_)) 121 | 122 | def show(self): 123 | print(json.dumps(self.filter, indent=4)) 124 | 125 | def __and__(self, x): 126 | if self.filter["filter"]["type"] == "and": 127 | # if `self` is already `and`, don't create a new filter 128 | # but just append `x` to the filter fields. 129 | self.filter["filter"]["fields"].append(x) 130 | return self 131 | return Filter(type="and", fields=[self, x]) 132 | 133 | def __or__(self, x): 134 | if self.filter["filter"]["type"] == "or": 135 | # if `self` is already `or`, don't create a new filter 136 | # but just append `x` to the filter fields. 137 | self.filter["filter"]["fields"].append(x) 138 | return self 139 | return Filter(type="or", fields=[self, x]) 140 | 141 | def __invert__(self): 142 | return Filter(type="not", field=self) 143 | 144 | @staticmethod 145 | def build_filter(filter_obj): 146 | filter = filter_obj.filter["filter"] 147 | if filter["type"] in ["and", "or"]: 148 | filter = filter.copy() # make a copy so we don't overwrite `fields` 149 | filter["fields"] = [Filter.build_filter(f) for f in filter["fields"]] 150 | elif filter["type"] in ["not"]: 151 | filter = filter.copy() 152 | filter["field"] = Filter.build_filter(filter["field"]) 153 | elif filter["type"] in ["columnComparison"]: 154 | filter = filter.copy() 155 | filter["dimensions"] = [build_dimension(d) for d in filter["dimensions"]] 156 | 157 | if filter_obj.extraction_function is not None: 158 | if filter is filter_obj.filter["filter"]: # copy if not yet copied 159 | filter = filter.copy() 160 | filter["extractionFn"] = filter_obj.extraction_function.build() 161 | 162 | return filter 163 | 164 | 165 | class Dimension: 166 | def __init__(self, dim): 167 | self.dimension = dim 168 | 169 | def __eq__(self, other): 170 | return Filter(dimension=self.dimension, value=other) 171 | 172 | def __ne__(self, other): 173 | return ~Filter(dimension=self.dimension, value=other) 174 | 175 | 176 | class JavaScript: 177 | def __init__(self, dim): 178 | self.dimension = dim 179 | 180 | def __eq__(self, func): 181 | return Filter(type="javascript", dimension=self.dimension, function=func) 182 | 183 | 184 | class Bound(Filter): 185 | """ 186 | Bound filter can be used to filter by comparing dimension values to an 187 | upper value or/and a lower value. 188 | 189 | :ivar str dimension: Dimension to filter on. 190 | :ivar str lower: Lower bound. 191 | :ivar str upper: Upper bound. 192 | :ivar bool lowerStrict: Strict lower inclusion. Initial value: False 193 | :ivar bool upperStrict: Strict upper inclusion. Initial value: False 194 | :ivar bool alphaNumeric: Numeric comparison. Initial value: False 195 | NOTE: For backwards compatibility - Use "ordering" instead. 196 | :ivar str ordering: Sorting Order. Initial value: lexicographic 197 | Specifies the sorting order to use when comparing values against the bound. 198 | Can be one of the following values: "lexicographic", "alphanumeric", "numeric", 199 | "strlen", "version". 
See Sorting Orders 200 | https://druid.apache.org/docs/latest/querying/filters.html#bound-filter 201 | for more details. 202 | :ivar ExtractionFunction extraction_function: extraction function to use, 203 | if not None 204 | """ 205 | 206 | def __init__( 207 | self, 208 | dimension, 209 | lower=None, 210 | upper=None, 211 | lowerStrict=False, 212 | upperStrict=False, 213 | alphaNumeric=False, 214 | ordering="lexicographic", 215 | extraction_function=None, 216 | ): 217 | if not lower and not upper: 218 | raise ValueError("Must include either lower or upper or both") 219 | Filter.__init__( 220 | self, 221 | type="bound", 222 | dimension=dimension, 223 | lower=lower, 224 | upper=upper, 225 | lowerStrict=lowerStrict, 226 | upperStrict=upperStrict, 227 | alphaNumeric=alphaNumeric, 228 | ordering=ordering, 229 | extraction_function=extraction_function, 230 | ) 231 | 232 | 233 | class Interval(Filter): 234 | """ 235 | Interval filter can be used to filter by comparing dimension(__time) 236 | values to a list of intervals. 237 | 238 | :ivar str dimension: Dimension to filter on. 239 | :ivar list intervals: List of ISO-8601 intervals of data to filter out. 240 | :ivar ExtractionFunction extraction_function: extraction function to use, 241 | if not None 242 | """ 243 | 244 | def __init__(self, dimension, intervals, extraction_function=None): 245 | 246 | Filter.__init__( 247 | self, 248 | type="interval", 249 | dimension=dimension, 250 | intervals=intervals, 251 | extraction_function=extraction_function, 252 | ) 253 | 254 | 255 | class Spatial(Filter): 256 | """ 257 | Spatial filter can be used to filter by spatial bounds 258 | 259 | :ivar str dimension: Dimension to filter on. 260 | :ivar str bound_type: Spatial bound type: ['rectangle','radius','polygon']. 261 | :param `**kwargs`: addition arguments required for the selected bound type: 262 | 'rectange': 'minCoords' and 'maxCoords' 263 | 'radius': 'coords' and 'radius' 264 | 'polygon': 'abscissa' and 'ordinate' 265 | """ 266 | 267 | def __init__(self, dimension, bound_type, **args): 268 | 269 | _bound = {"type": bound_type} 270 | 271 | if bound_type == "rectangle": 272 | if not args["minCoords"] or not args["maxCoords"]: 273 | raise ValueError( 274 | "Rectangle bound must include both minCoords and maxCoords" 275 | ) 276 | _bound["minCoords"] = args["minCoords"] 277 | _bound["maxCoords"] = args["maxCoords"] 278 | elif bound_type == "radius": 279 | if not args["coords"] or not args["radius"]: 280 | raise ValueError("Radius bound must include both coords and radius") 281 | _bound["coords"] = args["coords"] 282 | _bound["radius"] = args["radius"] 283 | elif bound_type == "polygon": 284 | if not args["abscissa"] or not args["ordinate"]: 285 | raise ValueError( 286 | "Polygon bound must include both abscissa and ordinate" 287 | ) 288 | _bound["abscissa"] = args["abscissa"] 289 | _bound["ordinate"] = args["ordinate"] 290 | else: 291 | raise ValueError("Unsupport Spatial Bound type: {0}".format(bound_type)) 292 | 293 | Filter.__init__(self, type="spatial", dimension=dimension, bound=_bound) 294 | -------------------------------------------------------------------------------- /tests/test_query.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # 3 | # Copyright 2016 Metamarkets Group Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import csv 19 | import os 20 | 21 | import pandas 22 | import pytest 23 | from pandas.testing import assert_frame_equal 24 | 25 | from pydruid.query import Query, QueryBuilder 26 | from pydruid.utils import aggregators, filters, having, postaggregator 27 | 28 | 29 | def create_query_with_results(): 30 | query = Query({}, "timeseries") 31 | query.result = [ 32 | { 33 | "result": {"value1": 1, "value2": "㬓"}, 34 | "timestamp": "2015-01-01T00:00:00.000-05:00", 35 | }, 36 | { 37 | "result": {"value1": 2, "value2": "㬓"}, 38 | "timestamp": "2015-01-02T00:00:00.000-05:00", 39 | }, 40 | ] 41 | return query 42 | 43 | 44 | EXPECTED_RESULTS_PANDAS = [ 45 | {"timestamp": "2015-01-01T00:00:00.000-05:00", "value1": 1, "value2": "㬓"}, 46 | {"timestamp": "2015-01-02T00:00:00.000-05:00", "value1": 2, "value2": "㬓"}, 47 | ] 48 | 49 | 50 | def expected_results_csv_reader(): 51 | # csv.DictReader does not perform promotion to int64 52 | expected_results = [] 53 | for element in EXPECTED_RESULTS_PANDAS: 54 | modified_elem = element.copy() 55 | modified_elem.update({"value1": str(modified_elem["value1"])}) 56 | expected_results.append(modified_elem) 57 | return expected_results 58 | 59 | 60 | class TestQueryBuilder: 61 | def test_build_query(self): 62 | # given 63 | expected_query_dict = { 64 | "queryType": None, 65 | "dataSource": "things", 66 | "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}], 67 | "postAggregations": [ 68 | { 69 | "fields": [ 70 | {"fieldName": "sum", "type": "fieldAccess"}, 71 | {"fieldName": "count", "type": "fieldAccess"}, 72 | ], 73 | "fn": "/", 74 | "name": "avg", 75 | "type": "arithmetic", 76 | } 77 | ], 78 | "pagingSpec": {"pagingIdentifies": {}, "threshold": 1}, 79 | "filter": {"dimension": "one", "type": "selector", "value": 1}, 80 | "having": {"aggregation": "sum", "type": "greaterThan", "value": 1}, 81 | "new_key": "value", 82 | } 83 | 84 | builder = QueryBuilder() 85 | 86 | # when 87 | query = builder.build_query( 88 | None, 89 | { 90 | "datasource": "things", 91 | "aggregations": {"count": aggregators.count("thing")}, 92 | "post_aggregations": { 93 | "avg": (postaggregator.Field("sum") / postaggregator.Field("count")) 94 | }, 95 | "paging_spec": {"pagingIdentifies": {}, "threshold": 1}, 96 | "filter": filters.Dimension("one") == 1, 97 | "having": having.Aggregation("sum") > 1, 98 | "new_key": "value", 99 | }, 100 | ) 101 | 102 | # then 103 | assert query.query_dict == expected_query_dict 104 | 105 | def test_build_query_none_type(self): 106 | # given 107 | expected_query_dict = { 108 | "queryType": None, 109 | "dataSource": "things", 110 | "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}], 111 | "filter": {"dimension": "one", "type": "selector", "value": 1}, 112 | "having": {"aggregation": "sum", "type": "greaterThan", "value": 1}, 113 | "dimension": "dim1", 114 | } 115 | 116 | builder = QueryBuilder() 117 | 118 | # when 119 | builder_dict = { 120 | "datasource": "things", 121 | "aggregations": {"count": aggregators.count("thing")}, 122 | "filter": 
filters.Dimension("one") == 1, 123 | "having": having.Aggregation("sum") > 1, 124 | "dimension": "dim1", 125 | } 126 | query = builder.build_query(None, builder_dict) 127 | 128 | # then 129 | assert query.query_dict == expected_query_dict 130 | 131 | # you should be able to pass `None` to dimension/having/filter 132 | for v in ["dimension", "having", "filter"]: 133 | expected_query_dict[v] = None 134 | builder_dict[v] = None 135 | 136 | query = builder.build_query(None, builder_dict) 137 | 138 | assert query.query_dict == expected_query_dict 139 | 140 | def test_validate_query(self): 141 | # given 142 | builder = QueryBuilder() 143 | 144 | # when 145 | builder.validate_query(None, ["validkey"], {"validkey": "value"}) 146 | 147 | # then 148 | pytest.raises( 149 | ValueError, 150 | builder.validate_query, 151 | *[None, ["validkey"], {"invalidkey": "value"}] 152 | ) 153 | 154 | def test_union_datasource(self): 155 | # Given 156 | expected_query_dict = {"queryType": None, "dataSource": "things"} 157 | builder = QueryBuilder() 158 | # when 159 | builder_dict = {"datasource": "things"} 160 | query = builder.build_query(None, builder_dict) 161 | # then 162 | assert query.query_dict == expected_query_dict 163 | 164 | # Given 165 | expected_query_dict = { 166 | "queryType": None, 167 | "dataSource": { 168 | "type": "union", 169 | "dataSources": ["things", "others", "more"], 170 | }, 171 | } 172 | builder = QueryBuilder() 173 | # when 174 | builder_dict = {"datasource": ["things", "others", "more"]} 175 | query = builder.build_query(None, builder_dict) 176 | # then 177 | assert query.query_dict == expected_query_dict 178 | 179 | # Given check that it rejects non-string items 180 | builder = QueryBuilder() 181 | builder_dict = {"datasource": ["things", 123]} 182 | with pytest.raises(ValueError): 183 | query = builder.build_query(None, builder_dict) 184 | 185 | def test_build_subquery(self): 186 | # given 187 | expected_query_dict = { 188 | "query": { 189 | "queryType": "groupBy", 190 | "dataSource": "things", 191 | "aggregations": [ 192 | {"fieldName": "thing", "name": "count", "type": "count"} 193 | ], 194 | "postAggregations": [ 195 | { 196 | "fields": [ 197 | {"fieldName": "sum", "type": "fieldAccess"}, 198 | {"fieldName": "count", "type": "fieldAccess"}, 199 | ], 200 | "fn": "/", 201 | "name": "avg", 202 | "type": "arithmetic", 203 | } 204 | ], 205 | "filter": {"dimension": "one", "type": "selector", "value": 1}, 206 | "having": {"aggregation": "sum", "type": "greaterThan", "value": 1}, 207 | }, 208 | "type": "query", 209 | } 210 | 211 | builder = QueryBuilder() 212 | 213 | # when 214 | subquery_dict = builder.subquery( 215 | { 216 | "datasource": "things", 217 | "aggregations": {"count": aggregators.count("thing")}, 218 | "post_aggregations": { 219 | "avg": (postaggregator.Field("sum") / postaggregator.Field("count")) 220 | }, 221 | "filter": filters.Dimension("one") == 1, 222 | "having": having.Aggregation("sum") > 1, 223 | } 224 | ) 225 | 226 | # then 227 | assert subquery_dict == expected_query_dict 228 | 229 | expected_nested_query_dict = { 230 | "query": { 231 | "queryType": "groupBy", 232 | "dataSource": { 233 | "query": { 234 | "queryType": "groupBy", 235 | "dataSource": "things", 236 | "aggregations": [ 237 | {"fieldName": "thing", "name": "count", "type": "count"} 238 | ], 239 | "postAggregations": [ 240 | { 241 | "fields": [ 242 | {"fieldName": "sum", "type": "fieldAccess"}, 243 | {"fieldName": "count", "type": "fieldAccess"}, 244 | ], 245 | "fn": "/", 246 | "name": "avg", 247 | 
"type": "arithmetic", 248 | } 249 | ], 250 | "filter": {"dimension": "one", "type": "selector", "value": 1}, 251 | "having": {"aggregation": "sum", "type": "greaterThan", "value": 1}, 252 | }, 253 | "type": "query", 254 | }, 255 | "aggregations": [ 256 | {"fieldName": "thing", "name": "count", "type": "count"} 257 | ], 258 | "postAggregations": [ 259 | { 260 | "fields": [ 261 | {"fieldName": "sum", "type": "fieldAccess"}, 262 | {"fieldName": "count", "type": "fieldAccess"}, 263 | ], 264 | "fn": "/", 265 | "name": "avg", 266 | "type": "arithmetic", 267 | } 268 | ], 269 | "filter": {"dimension": "one", "type": "selector", "value": 1}, 270 | "having": {"aggregation": "sum", "type": "greaterThan", "value": 1}, 271 | }, 272 | "type": "query", 273 | } 274 | 275 | nested_subquery_dict = builder.subquery( 276 | { 277 | "datasource": builder.subquery( 278 | { 279 | "datasource": "things", 280 | "aggregations": {"count": aggregators.count("thing")}, 281 | "post_aggregations": { 282 | "avg": (postaggregator.Field("sum") / postaggregator.Field("count")) 283 | }, 284 | "filter": filters.Dimension("one") == 1, 285 | "having": having.Aggregation("sum") > 1, 286 | } 287 | ), 288 | "aggregations": {"count": aggregators.count("thing")}, 289 | "post_aggregations": { 290 | "avg": (postaggregator.Field("sum") / postaggregator.Field("count")) 291 | }, 292 | "filter": filters.Dimension("one") == 1, 293 | "having": having.Aggregation("sum") > 1, 294 | } 295 | ) 296 | 297 | assert nested_subquery_dict == expected_nested_query_dict 298 | 299 | class TestQuery: 300 | def test_export_tsv(self, tmpdir): 301 | query = create_query_with_results() 302 | file_path = tmpdir.join("out.tsv") 303 | query.export_tsv(str(file_path)) 304 | 305 | with open(str(file_path)) as tsv_file: 306 | reader = csv.DictReader(tsv_file, delimiter="\t") 307 | actual = [line for line in reader] 308 | assert actual == expected_results_csv_reader() 309 | 310 | def test_export_pandas(self): 311 | query = create_query_with_results() 312 | df = query.export_pandas() 313 | expected_df = pandas.DataFrame(EXPECTED_RESULTS_PANDAS) 314 | assert_frame_equal(df, expected_df, check_like=True) 315 | 316 | query = Query({}, "timeseries") 317 | df = query.export_pandas() 318 | assert_frame_equal(df, pandas.DataFrame()) 319 | 320 | def test_query_acts_as_a_wrapper_for_raw_result(self): 321 | # given 322 | query = create_query_with_results() 323 | 324 | # then 325 | assert len(query) == 2 326 | assert isinstance(query[0], dict) 327 | assert isinstance(query[1], dict) 328 | -------------------------------------------------------------------------------- /tests/utils/test_dimensions.py: -------------------------------------------------------------------------------- 1 | from pydruid.utils.dimensions import ( 2 | build_dimension, 3 | DimensionSpec, 4 | JavascriptExtraction, 5 | ListFilteredSpec, 6 | MapLookupExtraction, 7 | NamespaceLookupExtraction, 8 | PartialExtraction, 9 | RegexExtraction, 10 | RegexFilteredSpec, 11 | RegisteredLookupExtraction, 12 | TimeFormatExtraction, 13 | ) 14 | 15 | 16 | class TestDimensionSpec(object): 17 | def test_default(self): 18 | dim_spec = DimensionSpec("dim", "out") 19 | actual = dim_spec.build() 20 | expected = {"type": "default", "dimension": "dim", "outputName": "out"} 21 | 22 | assert actual == expected 23 | 24 | def test_extraction_functions(self): 25 | js_func = "function(x) {return x};" 26 | ext_fns = [ 27 | (RegexExtraction(r"\w+"), {"type": "regex", "expr": "\\w+"}), 28 | (PartialExtraction(r"\w+"), {"type": "partial", 
"expr": "\\w+"}), 29 | ( 30 | JavascriptExtraction(js_func), 31 | {"type": "javascript", "function": js_func, "injective": False}, 32 | ), 33 | ( 34 | MapLookupExtraction(TestMapLookupExtraction.mapping), 35 | { 36 | "type": "lookup", 37 | "lookup": {"type": "map", "map": TestMapLookupExtraction.mapping}, 38 | "retainMissingValue": False, 39 | "replaceMissingValueWith": None, 40 | "injective": False, 41 | }, 42 | ), 43 | ] 44 | 45 | for ext_fn, expected_ext_fn in ext_fns: 46 | dim_spec = DimensionSpec("dim", "out", extraction_function=ext_fn) 47 | actual = dim_spec.build() 48 | expected = { 49 | "type": "extraction", 50 | "dimension": "dim", 51 | "outputName": "out", 52 | "extractionFn": expected_ext_fn, 53 | } 54 | 55 | assert actual == expected 56 | 57 | def test_filter_specs(self): 58 | delegate_spec = DimensionSpec("dim", "out").build() 59 | filter_specs = [ 60 | ( 61 | ListFilteredSpec(["val1", "val2"]), 62 | { 63 | "type": "listFiltered", 64 | "delegate": delegate_spec, 65 | "values": ["val1", "val2"], 66 | }, 67 | ), 68 | ( 69 | ListFilteredSpec(["val1", "val2"], is_whitelist=False), 70 | { 71 | "type": "listFiltered", 72 | "delegate": delegate_spec, 73 | "values": ["val1", "val2"], 74 | "isWhitelist": False, 75 | }, 76 | ), 77 | ( 78 | RegexFilteredSpec(r"\w+"), 79 | {"type": "regexFiltered", "delegate": delegate_spec, "pattern": "\\w+"}, 80 | ), 81 | ] 82 | 83 | for filter_spec, expected_dim_spec in filter_specs: 84 | dim_spec = DimensionSpec("dim", "out", filter_spec=filter_spec) 85 | actual = dim_spec.build() 86 | 87 | assert actual == expected_dim_spec 88 | 89 | def test_build_dimension(self): 90 | assert build_dimension("raw_dim") == "raw_dim" 91 | 92 | dim_spec = DimensionSpec("dim", "out") 93 | assert build_dimension(dim_spec) == dim_spec.build() 94 | 95 | 96 | class TestListFilteredSpec(object): 97 | def test_list_filtered_spec(self): 98 | dim_spec = DimensionSpec("dim", "out").build() 99 | list_filtered_spec = ListFilteredSpec(["val1", "val2"]) 100 | actual = list_filtered_spec.build(dim_spec) 101 | expected_dim_spec = {"type": "default", "dimension": "dim", "outputName": "out"} 102 | expected = { 103 | "type": "listFiltered", 104 | "delegate": expected_dim_spec, 105 | "values": ["val1", "val2"], 106 | } 107 | 108 | assert actual == expected 109 | 110 | def test_list_filtered_spec_whitelist(self): 111 | dim_spec = DimensionSpec("dim", "out").build() 112 | list_filtered_spec = ListFilteredSpec(["val1", "val2"], is_whitelist=False) 113 | actual = list_filtered_spec.build(dim_spec) 114 | expected_dim_spec = {"type": "default", "dimension": "dim", "outputName": "out"} 115 | expected = { 116 | "type": "listFiltered", 117 | "delegate": expected_dim_spec, 118 | "values": ["val1", "val2"], 119 | "isWhitelist": False, 120 | } 121 | 122 | assert actual == expected 123 | 124 | 125 | class TestRegexFilteredSpec(object): 126 | def test_regex_filtered_spec(self): 127 | dim_spec = DimensionSpec("dim", "out").build() 128 | regex_filtered_spec = RegexFilteredSpec(r"\w+") 129 | actual = regex_filtered_spec.build(dim_spec) 130 | expected_dim_spec = {"type": "default", "dimension": "dim", "outputName": "out"} 131 | expected = { 132 | "type": "regexFiltered", 133 | "delegate": expected_dim_spec, 134 | "pattern": "\\w+", 135 | } 136 | 137 | assert actual == expected 138 | 139 | 140 | class TestRegexExtraction(object): 141 | def test_regex(self): 142 | ext_fn = RegexExtraction(r"\w+") 143 | actual = ext_fn.build() 144 | expected = {"type": "regex", "expr": "\\w+"} 145 | 146 | assert actual == 
expected 147 | 148 | 149 | class TestPartialExtraction(object): 150 | def test_regex(self): 151 | ext_fn = PartialExtraction(r"\w+") 152 | actual = ext_fn.build() 153 | expected = {"type": "partial", "expr": "\\w+"} 154 | 155 | assert actual == expected 156 | 157 | 158 | class TestJavascriptExtraction(object): 159 | def test_js_injective(self): 160 | js_func = "function(x) {return x};" 161 | ext_fn = JavascriptExtraction(js_func, injective=True) 162 | actual = ext_fn.build() 163 | expected = {"type": "javascript", "function": js_func, "injective": True} 164 | 165 | assert actual == expected 166 | 167 | def test_js_not_injective(self): 168 | js_func = "function(x) {return x};" 169 | ext_fn = JavascriptExtraction(js_func) 170 | actual = ext_fn.build() 171 | expected = {"type": "javascript", "function": js_func, "injective": False} 172 | 173 | assert actual == expected 174 | 175 | 176 | class TestTimeFormatExtraction(object): 177 | def test_time_format_all_set(self): 178 | ext_fn = TimeFormatExtraction("EEEE", "en-US", "Europe/Berlin") 179 | actual = ext_fn.build() 180 | expected = { 181 | "type": "timeFormat", 182 | "format": "EEEE", 183 | "locale": "en-US", 184 | "timeZone": "Europe/Berlin", 185 | } 186 | 187 | assert actual == expected 188 | 189 | def test_time_format_no_timezone(self): 190 | ext_fn = TimeFormatExtraction("EEEE", "en-US") 191 | actual = ext_fn.build() 192 | expected = {"type": "timeFormat", "format": "EEEE", "locale": "en-US"} 193 | 194 | assert actual == expected 195 | 196 | def test_time_format_only_format(self): 197 | ext_fn = TimeFormatExtraction("EEEE") 198 | actual = ext_fn.build() 199 | expected = {"type": "timeFormat", "format": "EEEE"} 200 | 201 | assert actual == expected 202 | 203 | 204 | class TestMapLookupExtraction(object): 205 | 206 | mapping = {"foo1": "bar1", "foo2": "bar2"} 207 | 208 | def test_map_default(self): 209 | ext_fn = MapLookupExtraction(self.mapping) 210 | actual = ext_fn.build() 211 | expected = { 212 | "type": "lookup", 213 | "lookup": {"type": "map", "map": self.mapping}, 214 | "retainMissingValue": False, 215 | "replaceMissingValueWith": None, 216 | "injective": False, 217 | } 218 | 219 | assert actual == expected 220 | 221 | def test_map_retain_missing(self): 222 | ext_fn = MapLookupExtraction(self.mapping, retain_missing_values=True) 223 | actual = ext_fn.build() 224 | expected = { 225 | "type": "lookup", 226 | "lookup": {"type": "map", "map": self.mapping}, 227 | "retainMissingValue": True, 228 | "replaceMissingValueWith": None, 229 | "injective": False, 230 | } 231 | 232 | assert actual == expected 233 | 234 | def test_map_replace_missing(self): 235 | ext_fn = MapLookupExtraction(self.mapping, replace_missing_values="replacer") 236 | actual = ext_fn.build() 237 | expected = { 238 | "type": "lookup", 239 | "lookup": {"type": "map", "map": self.mapping}, 240 | "retainMissingValue": False, 241 | "replaceMissingValueWith": "replacer", 242 | "injective": False, 243 | } 244 | 245 | assert actual == expected 246 | 247 | def test_map_injective(self): 248 | ext_fn = MapLookupExtraction(self.mapping, injective=True) 249 | actual = ext_fn.build() 250 | expected = { 251 | "type": "lookup", 252 | "lookup": {"type": "map", "map": self.mapping}, 253 | "retainMissingValue": False, 254 | "replaceMissingValueWith": None, 255 | "injective": True, 256 | } 257 | 258 | assert actual == expected 259 | 260 | 261 | class TestNamespaceLookupExtraction(object): 262 | def test_map_default(self): 263 | ext_fn = NamespaceLookupExtraction("foo_namespace") 264 | 
actual = ext_fn.build() 265 | expected = { 266 | "type": "lookup", 267 | "lookup": {"type": "namespace", "namespace": "foo_namespace"}, 268 | "retainMissingValue": False, 269 | "replaceMissingValueWith": None, 270 | "injective": False, 271 | } 272 | 273 | assert actual == expected 274 | 275 | def test_map_retain_missing(self): 276 | ext_fn = NamespaceLookupExtraction("foo_namespace", retain_missing_values=True) 277 | actual = ext_fn.build() 278 | expected = { 279 | "type": "lookup", 280 | "lookup": {"type": "namespace", "namespace": "foo_namespace"}, 281 | "retainMissingValue": True, 282 | "replaceMissingValueWith": None, 283 | "injective": False, 284 | } 285 | 286 | assert actual == expected 287 | 288 | def test_map_replace_missing(self): 289 | ext_fn = NamespaceLookupExtraction( 290 | "foo_namespace", replace_missing_values="replacer" 291 | ) 292 | actual = ext_fn.build() 293 | expected = { 294 | "type": "lookup", 295 | "lookup": {"type": "namespace", "namespace": "foo_namespace"}, 296 | "retainMissingValue": False, 297 | "replaceMissingValueWith": "replacer", 298 | "injective": False, 299 | } 300 | 301 | assert actual == expected 302 | 303 | def test_map_injective(self): 304 | ext_fn = NamespaceLookupExtraction("foo_namespace", injective=True) 305 | actual = ext_fn.build() 306 | expected = { 307 | "type": "lookup", 308 | "lookup": {"type": "namespace", "namespace": "foo_namespace"}, 309 | "retainMissingValue": False, 310 | "replaceMissingValueWith": None, 311 | "injective": True, 312 | } 313 | 314 | assert actual == expected 315 | 316 | 317 | class TestRegisteredLookupExtraction(object): 318 | def test_map_default(self): 319 | ext_fn = RegisteredLookupExtraction("foo_namespace") 320 | actual = ext_fn.build() 321 | expected = { 322 | "type": "registeredLookup", 323 | "lookup": "foo_namespace", 324 | "retainMissingValue": False, 325 | "replaceMissingValueWith": None, 326 | "injective": False, 327 | } 328 | 329 | assert actual == expected 330 | 331 | def test_map_retain_missing(self): 332 | ext_fn = RegisteredLookupExtraction("foo_namespace", retain_missing_values=True) 333 | actual = ext_fn.build() 334 | expected = { 335 | "type": "registeredLookup", 336 | "lookup": "foo_namespace", 337 | "retainMissingValue": True, 338 | "replaceMissingValueWith": None, 339 | "injective": False, 340 | } 341 | 342 | assert actual == expected 343 | 344 | def test_map_replace_missing(self): 345 | ext_fn = RegisteredLookupExtraction( 346 | "foo_namespace", replace_missing_values="replacer" 347 | ) 348 | actual = ext_fn.build() 349 | expected = { 350 | "type": "registeredLookup", 351 | "lookup": "foo_namespace", 352 | "retainMissingValue": False, 353 | "replaceMissingValueWith": "replacer", 354 | "injective": False, 355 | } 356 | 357 | assert actual == expected 358 | 359 | def test_map_injective(self): 360 | ext_fn = RegisteredLookupExtraction("foo_namespace", injective=True) 361 | actual = ext_fn.build() 362 | expected = { 363 | "type": "registeredLookup", 364 | "lookup": "foo_namespace", 365 | "retainMissingValue": False, 366 | "replaceMissingValueWith": None, 367 | "injective": True, 368 | } 369 | 370 | assert actual == expected 371 | -------------------------------------------------------------------------------- /tests/db/test_cursor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from collections import namedtuple 5 | from io import BytesIO 6 | from unittest.mock import ANY, patch 7 | 8 | import requests 9 | 
from requests.models import Response 10 | from requests.auth import HTTPBasicAuth 11 | 12 | from pydruid.db.api import BearerAuth, apply_parameters, Cursor, connect 13 | 14 | 15 | class CursorTestSuite(unittest.TestCase): 16 | @patch("requests.post") 17 | def test_execute(self, requests_post_mock): 18 | response = Response() 19 | response.status_code = 200 20 | response.raw = BytesIO( 21 | b'[{"name": "alice"}, {"name": "bob"}, {"name": "charlie"}]' 22 | ) 23 | requests_post_mock.return_value = response 24 | Row = namedtuple("Row", ["name"]) 25 | 26 | cursor = Cursor("http://example.com/") 27 | cursor.execute("SELECT * FROM table") 28 | result = cursor.fetchall() 29 | expected = [Row(name="alice"), Row(name="bob"), Row(name="charlie")] 30 | self.assertEqual(result, expected) 31 | 32 | @patch("requests.post") 33 | def test_execute_empty_result(self, requests_post_mock): 34 | response = Response() 35 | response.status_code = 200 36 | response.raw = BytesIO(b"[]") 37 | requests_post_mock.return_value = response 38 | 39 | cursor = Cursor("http://example.com/") 40 | cursor.execute("SELECT * FROM table") 41 | result = cursor.fetchall() 42 | expected = [] 43 | self.assertEqual(result, expected) 44 | 45 | @patch("requests.post") 46 | def test_context(self, requests_post_mock): 47 | response = Response() 48 | response.status_code = 200 49 | response.raw = BytesIO(b"[]") 50 | requests_post_mock.return_value = response 51 | 52 | url = "http://example.com/" 53 | query = "SELECT * FROM table" 54 | context = {"source": "unittest"} 55 | 56 | cursor = Cursor(url, user=None, password=None, context=context) 57 | cursor.execute(query) 58 | 59 | requests_post_mock.assert_called_with( 60 | "http://example.com/", 61 | auth=None, 62 | stream=True, 63 | headers={"Content-Type": "application/json"}, 64 | json={"query": query, "context": context, "header": False}, 65 | verify=True, 66 | cert=None, 67 | proxies=None, 68 | ) 69 | 70 | @patch("requests.post") 71 | def test_header_false(self, requests_post_mock): 72 | response = Response() 73 | response.status_code = 200 74 | response.raw = BytesIO(b'[{"name": "alice"}]') 75 | requests_post_mock.return_value = response 76 | Row = namedtuple("Row", ["name"]) 77 | 78 | url = "http://example.com/" 79 | query = "SELECT * FROM table" 80 | 81 | cursor = Cursor(url, header=False) 82 | cursor.execute(query) 83 | result = cursor.fetchall() 84 | self.assertEqual(result, [Row(name="alice")]) 85 | 86 | self.assertEqual( 87 | cursor.description, [("name", 1, None, None, None, None, True)] 88 | ) 89 | 90 | @patch("requests.post") 91 | def test_header_true(self, requests_post_mock): 92 | response = Response() 93 | response.status_code = 200 94 | response.raw = BytesIO(b'[{"name": null}, {"name": "alice"}]') 95 | requests_post_mock.return_value = response 96 | Row = namedtuple("Row", ["name"]) 97 | 98 | url = "http://example.com/" 99 | query = "SELECT * FROM table" 100 | 101 | cursor = Cursor(url, header=True) 102 | cursor.execute(query) 103 | result = cursor.fetchall() 104 | self.assertEqual(result, [Row(name="alice")]) 105 | self.assertEqual(cursor.description, [("name", None)]) 106 | 107 | @patch("requests.post") 108 | def test_names_with_underscores(self, requests_post_mock): 109 | response = Response() 110 | response.status_code = 200 111 | response.raw = BytesIO(b'[{"_name": null}, {"_name": "alice"}]') 112 | requests_post_mock.return_value = response 113 | Row = namedtuple("Row", ["_name"], rename=True) 114 | 115 | url = "http://example.com/" 116 | query = "SELECT * FROM table" 
117 | 118 | cursor = Cursor(url, header=True) 119 | cursor.execute(query) 120 | result = cursor.fetchall() 121 | self.assertEqual(result, [Row(_0="alice")]) 122 | self.assertEqual(cursor.description, [("_name", None)]) 123 | 124 | def test_apply_parameters(self): 125 | self.assertEqual( 126 | apply_parameters('SELECT 100 AS "100%"', None), 'SELECT 100 AS "100%"' 127 | ) 128 | 129 | self.assertEqual( 130 | apply_parameters('SELECT 100 AS "100%"', {}), 'SELECT 100 AS "100%"' 131 | ) 132 | 133 | self.assertEqual( 134 | apply_parameters('SELECT %(key)s AS "100%%"', {"key": 100}), 135 | 'SELECT 100 AS "100%"', 136 | ) 137 | 138 | self.assertEqual(apply_parameters("SELECT %(key)s", {"key": "*"}), "SELECT *") 139 | 140 | self.assertEqual( 141 | apply_parameters("SELECT %(key)s", {"key": "bar"}), "SELECT 'bar'" 142 | ) 143 | 144 | self.assertEqual( 145 | apply_parameters("SELECT %(key)s", {"key": True}), "SELECT TRUE" 146 | ) 147 | 148 | self.assertEqual( 149 | apply_parameters("SELECT %(key)s", {"key": False}), "SELECT FALSE" 150 | ) 151 | 152 | # Generated by CodiumAI 153 | # When `user` is not None, `HTTPBasicAuth` is used for authentication. 154 | @patch("requests.post") 155 | def test_user_not_none_http_basic_auth(self, mock_post): 156 | from unittest.mock import patch 157 | 158 | response = Response() 159 | response.raw = BytesIO(b"[]") 160 | response.status_code = 200 161 | mock_post.return_value = response 162 | 163 | user = "test_user" 164 | password = "test_password" 165 | url = "http://example.com/" 166 | query = "SELECT * FROM table" 167 | 168 | cursor = Cursor(url, user=user, password=password) 169 | cursor.execute(query) 170 | 171 | mock_post.assert_called_with( 172 | url, 173 | stream=True, 174 | headers={"Content-Type": "application/json"}, 175 | json={"query": query, "context": cursor.context, "header": cursor.header,}, 176 | auth=requests.auth.HTTPBasicAuth(user, password), 177 | verify=cursor.ssl_verify_cert, 178 | cert=cursor.ssl_client_cert, 179 | proxies=cursor.proxies, 180 | ) 181 | 182 | # When `user` is None and `jwt` is not None, `auth` is not None. 183 | @patch("requests.post") 184 | def test_user_none_jwt_not_none_auth_not_none(self, mock_post): 185 | response = Response() 186 | response.raw = BytesIO(b"[]") 187 | response.status_code = 200 188 | mock_post.return_value = response 189 | 190 | jwt = "test_jwt" 191 | url = "http://example.com/" 192 | query = "SELECT * FROM table" 193 | 194 | cursor = Cursor(url, jwt=jwt) 195 | cursor.execute(query) 196 | 197 | mock_post.assert_called_with( 198 | url, 199 | stream=True, 200 | headers={"Content-Type": "application/json"}, 201 | json={"query": query, "context": cursor.context, "header": cursor.header,}, 202 | auth=ANY, 203 | verify=cursor.ssl_verify_cert, 204 | cert=cursor.ssl_client_cert, 205 | proxies=cursor.proxies, 206 | ) 207 | 208 | last_call = mock_post.call_args 209 | auth_arg = last_call.kwargs["auth"] 210 | 211 | self.assertIsInstance(auth_arg, BearerAuth) 212 | self.assertEqual(auth_arg.token, jwt) 213 | 214 | # Test that no authentication is used when both `user` and `jwt` are None. 
215 | @patch("requests.post") 216 | def test_no_authentication_used(self, requests_post_mock): 217 | response = Response() 218 | response.status_code = 200 219 | response.raw = BytesIO(b'{"result": "success"}') 220 | requests_post_mock.return_value = response 221 | 222 | conn = connect(user=None, jwt=None) 223 | curs = conn.cursor() 224 | 225 | # Perform some operation that requires authentication 226 | curs.execute("SELECT * FROM table") 227 | 228 | # Assert that no authentication was used 229 | requests_post_mock.assert_called_with( 230 | ANY, 231 | stream=True, 232 | headers=ANY, 233 | json=ANY, 234 | auth=None, 235 | verify=ANY, 236 | cert=ANY, 237 | proxies=ANY, 238 | ) 239 | 240 | # The test verifies that when `user` is not None and `jwt` is not None, `HttpBasicAuth` is used for authentication. 241 | @patch("requests.post") 242 | def test_basic_auth_used_for_authentication_when_both_provided( 243 | self, requests_post_mock 244 | ): 245 | response = Response() 246 | response.status_code = 200 247 | response.raw = BytesIO(b'{"result": "success"}') 248 | requests_post_mock.return_value = response 249 | 250 | url = "http://example.com/" 251 | user = "test_user" 252 | password = "test_password" 253 | jwt = "test_jwt" 254 | 255 | cursor = Cursor(url, user=user, password=password, jwt=jwt) 256 | cursor.execute("SELECT * FROM table") 257 | 258 | requests_post_mock.assert_called_with( 259 | url, 260 | stream=True, 261 | headers={"Content-Type": "application/json"}, 262 | json={"query": "SELECT * FROM table", "context": {}, "header": False}, 263 | auth=ANY, 264 | verify=True, 265 | cert=None, 266 | proxies=None, 267 | ) 268 | 269 | last_call = requests_post_mock.call_args 270 | auth_arg = last_call.kwargs["auth"] 271 | 272 | self.assertIsInstance(auth_arg, HTTPBasicAuth) 273 | self.assertEqual(auth_arg.username, user) 274 | self.assertEqual(auth_arg.password, password) 275 | 276 | # When `ssl_verify_cert` is False, SSL certificate is not verified. 277 | @patch("requests.post") 278 | def test_ssl_certificate_verification_disabled(self, requests_post_mock): 279 | response = Response() 280 | response.status_code = 200 281 | response.raw = BytesIO(b"[]") 282 | requests_post_mock.return_value = response 283 | user = "test_user" 284 | password = "test_password" 285 | 286 | url = "http://example.com/" 287 | query = "SELECT * FROM table" 288 | 289 | cursor = Cursor( 290 | url, user=user, password=password, header=True, ssl_verify_cert=False 291 | ) 292 | cursor.execute(query) 293 | 294 | requests_post_mock.assert_called_with( 295 | url, 296 | stream=True, 297 | headers={"Content-Type": "application/json"}, 298 | json={"query": "SELECT * FROM table", "context": {}, "header": True}, 299 | auth=ANY, 300 | verify=False, 301 | cert=None, 302 | proxies=None, 303 | ) 304 | 305 | # When `user` is not None and `password` is None, `HTTPBasicAuth` is used with empty password. 
306 | @patch("requests.post") 307 | @patch("requests.auth.HTTPBasicAuth") 308 | def test_http_basic_auth_with_empty_user( 309 | self, http_basic_auth_mock, requests_post_mock 310 | ): 311 | response = Response() 312 | response.status_code = 200 313 | response.raw = BytesIO(b'[{"_name": null}, {"_name": "alice"}]') 314 | requests_post_mock.return_value = response 315 | 316 | url = "http://example.com/" 317 | user = "user" 318 | password = None 319 | jwt = None 320 | 321 | conn = connect(user=user, password=password, jwt=jwt) 322 | cursor = conn.cursor() 323 | cursor.execute("SELECT * FROM table") 324 | 325 | http_basic_auth_mock.assert_called_with(user, None) 326 | 327 | requests_post_mock.assert_called_with( 328 | ANY, 329 | stream=True, 330 | headers={"Content-Type": "application/json"}, 331 | json={"query": "SELECT * FROM table", "context": {}, "header": False}, 332 | auth=http_basic_auth_mock.return_value, 333 | verify=True, 334 | cert=None, 335 | proxies=None, 336 | ) 337 | 338 | # Test SSL client certificate authentication when `ssl_client_cert` is not None. 339 | @patch("requests.post") 340 | def test_ssl_client_cert_authentication_with_patch_imported( 341 | self, requests_post_mock 342 | ): 343 | response = Response() 344 | response.status_code = 200 345 | response.raw = BytesIO(b'[]') 346 | requests_post_mock.return_value = response 347 | Row = namedtuple("Row", ["_name"], rename=True) 348 | 349 | url = "http://example.com/" 350 | query = "SELECT * FROM table" 351 | 352 | cursor = Cursor(url, header=True, ssl_client_cert="path/to/cert") 353 | cursor.execute(query) 354 | requests_post_mock.assert_called_with( 355 | ANY, 356 | stream=True, 357 | headers={"Content-Type": "application/json"}, 358 | json={"query": "SELECT * FROM table", "context": {}, "header": False}, 359 | auth=ANY, 360 | verify=True, 361 | cert="path/to/cert", 362 | proxies=None, 363 | ) 364 | 365 | 366 | if __name__ == "__main__": 367 | unittest.main() 368 | -------------------------------------------------------------------------------- /pydruid/db/api.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | from collections import namedtuple, OrderedDict 4 | from urllib import parse 5 | 6 | import requests 7 | 8 | from pydruid.db import exceptions 9 | 10 | 11 | class Type(object): 12 | STRING = 1 13 | NUMBER = 2 14 | BOOLEAN = 3 15 | 16 | 17 | class BearerAuth(requests.auth.AuthBase): 18 | def __init__(self, token) -> None: 19 | self.token = token 20 | 21 | def __call__(self, r): 22 | r.headers["Authorization"] = f"Bearer {self.token}" 23 | return r 24 | 25 | def connect( 26 | host="localhost", 27 | port=8082, 28 | path="/druid/v2/sql/", 29 | scheme="http", 30 | user=None, 31 | password=None, 32 | context=None, 33 | header=False, 34 | ssl_verify_cert=True, 35 | ssl_client_cert=None, 36 | proxies=None, 37 | jwt=None, 38 | ): # noqa: E125 39 | """ 40 | Constructor for creating a connection to the database. 
41 | 42 | >>> conn = connect('localhost', 8082) 43 | >>> curs = conn.cursor() 44 | 45 | """ 46 | context = context or {} 47 | 48 | return Connection( 49 | host, 50 | port, 51 | path, 52 | scheme, 53 | user, 54 | password, 55 | context, 56 | header, 57 | ssl_verify_cert, 58 | ssl_client_cert, 59 | proxies, 60 | jwt, 61 | ) 62 | 63 | 64 | def check_closed(f): 65 | """Decorator that checks if connection/cursor is closed.""" 66 | 67 | def g(self, *args, **kwargs): 68 | if self.closed: 69 | raise exceptions.Error( 70 | "{klass} already closed".format(klass=self.__class__.__name__) 71 | ) 72 | return f(self, *args, **kwargs) 73 | 74 | return g 75 | 76 | 77 | def check_result(f): 78 | """Decorator that checks if the cursor has results from `execute`.""" 79 | 80 | def g(self, *args, **kwargs): 81 | if self._results is None: 82 | raise exceptions.Error("Called before `execute`") 83 | return f(self, *args, **kwargs) 84 | 85 | return g 86 | 87 | 88 | def get_description_from_row(row): 89 | """ 90 | Return description from a single row. 91 | 92 | We only return the name, type (inferred from the data) and if the values 93 | can be NULL. String columns in Druid are NULLable. Numeric columns are NOT 94 | NULL. 95 | """ 96 | return [ 97 | ( 98 | name, # name 99 | get_type(value), # type_code 100 | None, # [display_size] 101 | None, # [internal_size] 102 | None, # [precision] 103 | None, # [scale] 104 | get_type(value) == Type.STRING, # [null_ok] 105 | ) 106 | for name, value in row.items() 107 | ] 108 | 109 | 110 | def get_type(value): 111 | """ 112 | Infer type from value. 113 | 114 | Note that bool is a subclass of int so order of statements matter. 115 | """ 116 | 117 | if isinstance(value, str) or value is None: 118 | return Type.STRING 119 | elif isinstance(value, bool): 120 | return Type.BOOLEAN 121 | elif isinstance(value, (int, float)): 122 | return Type.NUMBER 123 | 124 | raise exceptions.Error("Value of unknown type: {value}".format(value=value)) 125 | 126 | 127 | class Connection(object): 128 | """Connection to a Druid database.""" 129 | 130 | def __init__( 131 | self, 132 | host="localhost", 133 | port=8082, 134 | path="/druid/v2/sql/", 135 | scheme="http", 136 | user=None, 137 | password=None, 138 | context=None, 139 | header=False, 140 | ssl_verify_cert=True, 141 | ssl_client_cert=None, 142 | proxies=None, 143 | jwt=None, 144 | ): 145 | netloc = "{host}:{port}".format(host=host, port=port) 146 | self.url = parse.urlunparse((scheme, netloc, path, None, None, None)) 147 | self.context = context or {} 148 | self.closed = False 149 | self.cursors = [] 150 | self.header = header 151 | self.user = user 152 | self.password = password 153 | self.ssl_verify_cert = ssl_verify_cert 154 | self.ssl_client_cert = ssl_client_cert 155 | self.proxies = proxies 156 | self.jwt = jwt 157 | 158 | @check_closed 159 | def close(self): 160 | """Close the connection now.""" 161 | self.closed = True 162 | for cursor in self.cursors: 163 | try: 164 | cursor.close() 165 | except exceptions.Error: 166 | pass # already closed 167 | 168 | @check_closed 169 | def commit(self): 170 | """ 171 | Commit any pending transaction to the database. 172 | 173 | Not supported. 
174 | """ 175 | pass 176 | 177 | @check_closed 178 | def cursor(self): 179 | """Return a new Cursor Object using the connection.""" 180 | 181 | cursor = Cursor( 182 | self.url, 183 | self.user, 184 | self.password, 185 | self.context, 186 | self.header, 187 | self.ssl_verify_cert, 188 | self.ssl_client_cert, 189 | self.proxies, 190 | self.jwt, 191 | ) 192 | 193 | self.cursors.append(cursor) 194 | 195 | return cursor 196 | 197 | @check_closed 198 | def execute(self, operation, parameters=None): 199 | cursor = self.cursor() 200 | return cursor.execute(operation, parameters) 201 | 202 | def __enter__(self): 203 | return self.cursor() 204 | 205 | def __exit__(self, *exc): 206 | self.close() 207 | 208 | 209 | class Cursor(object): 210 | """Connection cursor.""" 211 | 212 | def __init__( 213 | self, 214 | url, 215 | user=None, 216 | password=None, 217 | context=None, 218 | header=False, 219 | ssl_verify_cert=True, 220 | ssl_client_cert=None, 221 | proxies=None, 222 | jwt=None, 223 | ): 224 | self.url = url 225 | self.context = context or {} 226 | self.header = header 227 | self.user = user 228 | self.password = password 229 | self.ssl_verify_cert = ssl_verify_cert 230 | self.ssl_client_cert = ssl_client_cert 231 | self.proxies = proxies 232 | self.jwt = jwt 233 | 234 | # This read/write attribute specifies the number of rows to fetch at a 235 | # time with .fetchmany(). It defaults to 1 meaning to fetch a single 236 | # row at a time. 237 | self.arraysize = 1 238 | 239 | self.closed = False 240 | 241 | # this is updated only after a query 242 | self.description = None 243 | 244 | # this is set to an iterator after a successfull query 245 | self._results = None 246 | 247 | @property 248 | @check_result 249 | @check_closed 250 | def rowcount(self): 251 | # consume the iterator 252 | results = list(self._results) 253 | n = len(results) 254 | self._results = iter(results) 255 | return n 256 | 257 | @check_closed 258 | def close(self): 259 | """Close the cursor.""" 260 | self.closed = True 261 | 262 | @check_closed 263 | def execute(self, operation, parameters=None): 264 | query = apply_parameters(operation, parameters) 265 | results = self._stream_query(query) 266 | 267 | # `_stream_query` returns a generator that produces the rows; we need to 268 | # consume the first row so that `description` is properly set, so let's 269 | # consume it and insert it back if it is not the header. 270 | try: 271 | first_row = next(results) 272 | self._results = ( 273 | results if self.header else itertools.chain([first_row], results) 274 | ) 275 | except StopIteration: 276 | self._results = iter([]) 277 | 278 | return self 279 | 280 | @check_closed 281 | def executemany(self, operation, seq_of_parameters=None): 282 | raise exceptions.NotSupportedError( 283 | "`executemany` is not supported, use `execute` instead" 284 | ) 285 | 286 | @check_result 287 | @check_closed 288 | def fetchone(self): 289 | """ 290 | Fetch the next row of a query result set, returning a single sequence, 291 | or `None` when no more data is available. 292 | """ 293 | try: 294 | return self.next() 295 | except StopIteration: 296 | return None 297 | 298 | @check_result 299 | @check_closed 300 | def fetchmany(self, size=None): 301 | """ 302 | Fetch the next set of rows of a query result, returning a sequence of 303 | sequences (e.g. a list of tuples). An empty sequence is returned when 304 | no more rows are available. 
305 | """ 306 | size = size or self.arraysize 307 | return list(itertools.islice(self._results, size)) 308 | 309 | @check_result 310 | @check_closed 311 | def fetchall(self): 312 | """ 313 | Fetch all (remaining) rows of a query result, returning them as a 314 | sequence of sequences (e.g. a list of tuples). Note that the cursor's 315 | arraysize attribute can affect the performance of this operation. 316 | """ 317 | return list(self._results) 318 | 319 | @check_closed 320 | def setinputsizes(self, sizes): 321 | # not supported 322 | pass 323 | 324 | @check_closed 325 | def setoutputsizes(self, sizes): 326 | # not supported 327 | pass 328 | 329 | @check_closed 330 | def __iter__(self): 331 | return self 332 | 333 | @check_closed 334 | def __next__(self): 335 | return next(self._results) 336 | 337 | next = __next__ 338 | 339 | def _stream_query(self, query): 340 | """ 341 | Stream rows from a query. 342 | 343 | This method will yield rows as the data is returned in chunks from the 344 | server. 345 | """ 346 | self.description = None 347 | 348 | headers = {"Content-Type": "application/json"} 349 | 350 | payload = {"query": query, "context": self.context, "header": self.header} 351 | 352 | if self.user: 353 | auth = requests.auth.HTTPBasicAuth(self.user, self.password) 354 | elif self.jwt: 355 | auth = BearerAuth(self.jwt) 356 | else: 357 | auth = None 358 | 359 | r = requests.post( 360 | self.url, 361 | stream=True, 362 | headers=headers, 363 | json=payload, 364 | auth=auth, 365 | verify=self.ssl_verify_cert, 366 | cert=self.ssl_client_cert, 367 | proxies=self.proxies, 368 | ) 369 | if r.encoding is None: 370 | r.encoding = "utf-8" 371 | # raise any error messages 372 | if r.status_code != 200: 373 | try: 374 | payload = r.json() 375 | except Exception: 376 | payload = { 377 | "error": "Unknown error", 378 | "errorClass": "Unknown", 379 | "errorMessage": r.text, 380 | } 381 | 382 | category = payload.pop("category", payload.pop("errorClass", "Unknown")) 383 | error = payload.get("error") or "Unknown" 384 | error_message = payload.get("errorMessage") or "Unknown" 385 | msg = f"{error} ({category}): {error_message}" 386 | raise exceptions.ProgrammingError(msg) 387 | 388 | # Druid will stream the data in chunks of 8k bytes, splitting the JSON 389 | # between them; setting `chunk_size` to `None` makes it use the server 390 | # size 391 | chunks = r.iter_content(chunk_size=None, decode_unicode=True) 392 | Row = None 393 | for row in rows_from_chunks(chunks): 394 | # update description 395 | if self.description is None: 396 | self.description = ( 397 | list(row.items()) if self.header else get_description_from_row(row) 398 | ) 399 | 400 | # return row in namedtuple 401 | if Row is None: 402 | Row = namedtuple("Row", row.keys(), rename=True) 403 | yield Row(*row.values()) 404 | 405 | 406 | def rows_from_chunks(chunks): 407 | """ 408 | A generator that yields rows from JSON chunks. 409 | 410 | Druid will return the data in chunks, but they are not aligned with the 411 | JSON objects. This function will parse all complete rows inside each chunk, 412 | yielding them as soon as possible. 
413 | """ 414 | body = "" 415 | for chunk in chunks: 416 | if chunk: 417 | body = "".join((body, chunk)) 418 | 419 | # find last complete row 420 | boundary = 0 421 | brackets = 0 422 | in_string = False 423 | for i, char in enumerate(body): 424 | if char == '"': 425 | if not in_string: 426 | in_string = True 427 | elif body[i - 1] != "\\": 428 | in_string = False 429 | 430 | if in_string: 431 | continue 432 | 433 | if char == "{": 434 | brackets += 1 435 | elif char == "}": 436 | brackets -= 1 437 | if brackets == 0 and i > boundary: 438 | boundary = i + 1 439 | 440 | rows = body[:boundary].lstrip("[,") 441 | body = body[boundary:] 442 | 443 | for row in json.loads( 444 | "[{rows}]".format(rows=rows), object_pairs_hook=OrderedDict 445 | ): 446 | yield row 447 | 448 | 449 | def apply_parameters(operation, parameters): 450 | if not parameters: 451 | return operation 452 | 453 | escaped_parameters = {key: escape(value) for key, value in parameters.items()} 454 | return operation % escaped_parameters 455 | 456 | 457 | def escape(value): 458 | """ 459 | Escape the parameter value. 460 | 461 | Note that bool is a subclass of int so order of statements matter. 462 | """ 463 | 464 | if value == "*": 465 | return value 466 | elif isinstance(value, str): 467 | return "'{}'".format(value.replace("'", "''")) 468 | elif isinstance(value, bool): 469 | return "TRUE" if value else "FALSE" 470 | elif isinstance(value, (int, float)): 471 | return value 472 | elif isinstance(value, (list, tuple)): 473 | return ", ".join(escape(element) for element in value) 474 | -------------------------------------------------------------------------------- /tests/utils/test_filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import pytest 4 | 5 | from pydruid.utils import dimensions, filters 6 | 7 | 8 | class TestDimension: 9 | def test_dimension(self): 10 | d = filters.Dimension("dim") 11 | actual = filters.Filter.build_filter(d == "val") 12 | expected = {"type": "selector", "dimension": "dim", "value": "val"} 13 | assert actual == expected 14 | 15 | def test_ne_dimension(self): 16 | d = filters.Dimension("dim") 17 | actual = filters.Filter.build_filter(d != "val") 18 | expected = { 19 | "field": {"dimension": "dim", "type": "selector", "value": "val"}, 20 | "type": "not", 21 | } 22 | assert actual == expected 23 | 24 | 25 | class TestFilter: 26 | def test_selector_filter(self): 27 | actual = filters.Filter.build_filter( 28 | filters.Filter(dimension="dim", value="val") 29 | ) 30 | expected = {"type": "selector", "dimension": "dim", "value": "val"} 31 | assert actual == expected 32 | 33 | def test_selector_filter_extraction_fn(self): 34 | extraction_fn = dimensions.RegexExtraction("([a-b])") 35 | f = filters.Filter( 36 | dimension="dim", value="v", extraction_function=extraction_fn 37 | ) 38 | actual = filters.Filter.build_filter(f) 39 | expected = { 40 | "type": "selector", 41 | "dimension": "dim", 42 | "value": "v", 43 | "extractionFn": {"type": "regex", "expr": "([a-b])"}, 44 | } 45 | assert actual == expected 46 | 47 | def test_extraction_filter(self): 48 | extraction_fn = dimensions.PartialExtraction("([a-b])") 49 | f = filters.Filter( 50 | type="extraction", 51 | dimension="dim", 52 | value="v", 53 | extraction_function=extraction_fn, 54 | ) 55 | actual = filters.Filter.build_filter(f) 56 | expected = { 57 | "type": "extraction", 58 | "dimension": "dim", 59 | "value": "v", 60 | "extractionFn": {"type": "partial", "expr": "([a-b])"}, 61 | 
} 62 | assert actual == expected 63 | 64 | def test_javascript_filter(self): 65 | actual = filters.Filter.build_filter( 66 | filters.Filter( 67 | type="javascript", dimension="dim", function="function(x){return true}" 68 | ) 69 | ) 70 | expected = { 71 | "type": "javascript", 72 | "dimension": "dim", 73 | "function": "function(x){return true}", 74 | } 75 | assert actual == expected 76 | 77 | def test_bound_filter(self): 78 | actual = filters.Filter.build_filter( 79 | filters.Bound( 80 | dimension="dim", 81 | lower="1", 82 | lowerStrict=True, 83 | upper="10", 84 | upperStrict=True, 85 | ordering="numeric", 86 | ) 87 | ) 88 | expected = { 89 | "type": "bound", 90 | "dimension": "dim", 91 | "lower": "1", 92 | "lowerStrict": True, 93 | "upper": "10", 94 | "upperStrict": True, 95 | "alphaNumeric": False, 96 | "ordering": "numeric", 97 | } 98 | assert actual == expected 99 | 100 | def test_bound_filter_with_extraction_function(self): 101 | f = filters.Bound( 102 | dimension="d", 103 | lower="1", 104 | upper="3", 105 | upperStrict=True, 106 | extraction_function=dimensions.RegexExtraction(".*([0-9]+)"), 107 | ) 108 | actual = filters.Filter.build_filter(f) 109 | expected = { 110 | "type": "bound", 111 | "dimension": "d", 112 | "lower": "1", 113 | "lowerStrict": False, 114 | "upper": "3", 115 | "upperStrict": True, 116 | "ordering": "lexicographic", 117 | "alphaNumeric": False, 118 | "extractionFn": {"type": "regex", "expr": ".*([0-9]+)"}, 119 | } 120 | assert actual == expected 121 | 122 | def test_bound_filter_alphanumeric(self): 123 | actual = filters.Filter.build_filter( 124 | filters.Bound( 125 | dimension="dim", 126 | lower="1", 127 | lowerStrict=True, 128 | upper="10", 129 | upperStrict=True, 130 | alphaNumeric=True, 131 | ) 132 | ) 133 | expected = { 134 | "type": "bound", 135 | "dimension": "dim", 136 | "lower": "1", 137 | "lowerStrict": True, 138 | "upper": "10", 139 | "upperStrict": True, 140 | "alphaNumeric": True, 141 | "ordering": "lexicographic", 142 | } 143 | assert actual == expected 144 | 145 | def test_bound_filter_lower_not_included(self): 146 | actual = filters.Filter.build_filter( 147 | filters.Bound(dimension="dim", upper="10", upperStrict=True) 148 | ) 149 | expected = { 150 | "type": "bound", 151 | "dimension": "dim", 152 | "lower": None, 153 | "lowerStrict": False, 154 | "upper": "10", 155 | "upperStrict": True, 156 | "alphaNumeric": False, 157 | "ordering": "lexicographic", 158 | } 159 | assert actual == expected 160 | 161 | def test_spatial_filter_rectangle(self): 162 | actual = filters.Filter.build_filter( 163 | filters.Spatial( 164 | dimension="dim", 165 | bound_type="rectangle", 166 | minCoords=[100.0, 100.0], 167 | maxCoords=[100.1, 100.1], 168 | radius=10.0, 169 | ) 170 | ) 171 | expected = { 172 | "type": "spatial", 173 | "dimension": "dim", 174 | "bound": { 175 | "type": "rectangle", 176 | "minCoords": [100.0, 100.0], 177 | "maxCoords": [100.1, 100.1], 178 | }, 179 | } 180 | assert actual == expected 181 | 182 | def test_spatial_filter_radius(self): 183 | actual = filters.Filter.build_filter( 184 | filters.Spatial( 185 | dimension="dim", 186 | bound_type="radius", 187 | coords=[100.0, 100.0], 188 | radius=100.0, 189 | ) 190 | ) 191 | expected = { 192 | "type": "spatial", 193 | "dimension": "dim", 194 | "bound": {"type": "radius", "coords": [100.0, 100.0], "radius": 100.0}, 195 | } 196 | assert actual == expected 197 | 198 | def test_spatial_filter_polygon(self): 199 | actual = filters.Filter.build_filter( 200 | filters.Spatial( 201 | dimension="dim", 202 | 
bound_type="polygon", 203 | abscissa=[2.0, 3.0, 7.0, 8.0], 204 | ordinate=[4.0, 9.0, 8.0, 1.0], 205 | ) 206 | ) 207 | expected = { 208 | "type": "spatial", 209 | "dimension": "dim", 210 | "bound": { 211 | "type": "polygon", 212 | "abscissa": [2.0, 3.0, 7.0, 8.0], 213 | "ordinate": [4.0, 9.0, 8.0, 1.0], 214 | }, 215 | } 216 | assert actual == expected 217 | 218 | def test_interval_filter(self): 219 | actual = filters.Filter.build_filter( 220 | filters.Interval( 221 | dimension="dim", 222 | intervals=["2014-10-01T00:00:00.000Z/2014-10-07T00:00:00.000Z"], 223 | ) 224 | ) 225 | expected = { 226 | "type": "interval", 227 | "dimension": "dim", 228 | "intervals": ["2014-10-01T00:00:00.000Z/2014-10-07T00:00:00.000Z"], 229 | } 230 | assert actual == expected 231 | 232 | def test_interval_with_extraction_function(self): 233 | f = filters.Interval( 234 | dimension="dim", 235 | intervals=["2014-10-01T00:00:00.000Z/2014-10-07T00:00:00.000Z"], 236 | extraction_function=dimensions.RegexExtraction(".*([0-9]+)"), 237 | ) 238 | actual = filters.Filter.build_filter(f) 239 | expected = { 240 | "type": "interval", 241 | "dimension": "dim", 242 | "intervals": ["2014-10-01T00:00:00.000Z/2014-10-07T00:00:00.000Z"], 243 | "extractionFn": {"type": "regex", "expr": ".*([0-9]+)"}, 244 | } 245 | assert actual == expected 246 | 247 | def test_and_filter(self): 248 | f1 = filters.Filter(dimension="dim1", value="val1") 249 | f2 = filters.Filter(dimension="dim2", value="val2") 250 | actual = filters.Filter.build_filter(f1 & f2) 251 | expected = { 252 | "type": "and", 253 | "fields": [ 254 | {"type": "selector", "dimension": "dim1", "value": "val1"}, 255 | {"type": "selector", "dimension": "dim2", "value": "val2"}, 256 | ], 257 | } 258 | assert actual == expected 259 | 260 | def test_and_filter_multiple(self): 261 | f1 = filters.Filter(dimension="dim1", value="val1") 262 | f2 = filters.Filter(dimension="dim2", value="val2") 263 | f3 = filters.Filter(dimension="dim3", value="val3") 264 | filter = filters.Filter(type="and", fields=[f1, f2, f3]) 265 | actual = filters.Filter.build_filter(filter) 266 | expected = { 267 | "type": "and", 268 | "fields": [ 269 | {"type": "selector", "dimension": "dim1", "value": "val1"}, 270 | {"type": "selector", "dimension": "dim2", "value": "val2"}, 271 | {"type": "selector", "dimension": "dim3", "value": "val3"}, 272 | ], 273 | } 274 | assert actual == expected 275 | 276 | def test_or_filter(self): 277 | f1 = filters.Filter(dimension="dim1", value="val1") 278 | f2 = filters.Filter(dimension="dim2", value="val2") 279 | actual = filters.Filter.build_filter(f1 | f2) 280 | expected = { 281 | "type": "or", 282 | "fields": [ 283 | {"type": "selector", "dimension": "dim1", "value": "val1"}, 284 | {"type": "selector", "dimension": "dim2", "value": "val2"}, 285 | ], 286 | } 287 | assert actual == expected 288 | 289 | def test_nested_mix_filter(self): 290 | f1 = filters.Filter(dimension="dim1", value="val1") 291 | f2 = filters.Filter(dimension="dim2", value="val2") 292 | f3 = filters.Filter(dimension="dim3", value="val3") 293 | f4 = filters.Filter(dimension="dim4", value="val4") 294 | f5 = filters.Filter(dimension="dim5", value="val5") 295 | f6 = filters.Filter(dimension="dim6", value="val6") 296 | f7 = filters.Filter(dimension="dim7", value="val7") 297 | f8 = filters.Filter(dimension="dim8", value="val8") 298 | actual = filters.Filter.build_filter( 299 | f1 & ~f2 & f3 & (f4 | ~f5 | f6 | (f7 & ~f8)) 300 | ) 301 | expected = { 302 | "fields": [ 303 | {"dimension": "dim1", "type": "selector", "value": 
"val1"}, 304 | { 305 | "field": {"dimension": "dim2", "type": "selector", "value": "val2"}, 306 | "type": "not", 307 | }, 308 | {"dimension": "dim3", "type": "selector", "value": "val3"}, 309 | { 310 | "fields": [ 311 | {"dimension": "dim4", "type": "selector", "value": "val4"}, 312 | { 313 | "field": { 314 | "dimension": "dim5", 315 | "type": "selector", 316 | "value": "val5", 317 | }, 318 | "type": "not", 319 | }, 320 | {"dimension": "dim6", "type": "selector", "value": "val6"}, 321 | { 322 | "fields": [ 323 | { 324 | "dimension": "dim7", 325 | "type": "selector", 326 | "value": "val7", 327 | }, 328 | { 329 | "field": { 330 | "dimension": "dim8", 331 | "type": "selector", 332 | "value": "val8", 333 | }, 334 | "type": "not", 335 | }, 336 | ], 337 | "type": "and", 338 | }, 339 | ], 340 | "type": "or", 341 | }, 342 | ], 343 | "type": "and", 344 | } 345 | assert actual == expected 346 | 347 | def test_or_filter_multiple(self): 348 | f1 = filters.Filter(dimension="dim1", value="val1") 349 | f2 = filters.Filter(dimension="dim2", value="val2") 350 | f3 = filters.Filter(dimension="dim3", value="val3") 351 | filter = filters.Filter(type="or", fields=[f1, f2, f3]) 352 | actual = filters.Filter.build_filter(filter) 353 | expected = { 354 | "type": "or", 355 | "fields": [ 356 | {"type": "selector", "dimension": "dim1", "value": "val1"}, 357 | {"type": "selector", "dimension": "dim2", "value": "val2"}, 358 | {"type": "selector", "dimension": "dim3", "value": "val3"}, 359 | ], 360 | } 361 | assert actual == expected 362 | 363 | def test_not_filter(self): 364 | f = ~filters.Filter(dimension="dim", value="val") 365 | actual = filters.Filter.build_filter(f) 366 | # Call `build_filter` twice to make sure it does not 367 | # change the passed filter object argument `f`. 
368 | actual = filters.Filter.build_filter(f) 369 | expected = { 370 | "type": "not", 371 | "field": {"type": "selector", "dimension": "dim", "value": "val"}, 372 | } 373 | assert actual == expected 374 | 375 | def test_nested_not_or_filter(self): 376 | f1 = filters.Filter(dimension="dim1", value="val1") 377 | f2 = filters.Filter(dimension="dim2", value="val2") 378 | actual = filters.Filter.build_filter(~(f1 | f2)) 379 | expected = { 380 | "type": "not", 381 | "field": { 382 | "type": "or", 383 | "fields": [ 384 | {"type": "selector", "dimension": "dim1", "value": "val1"}, 385 | {"type": "selector", "dimension": "dim2", "value": "val2"}, 386 | ], 387 | }, 388 | } 389 | assert actual == expected 390 | 391 | def test_in_filter(self): 392 | actual = filters.Filter.build_filter( 393 | filters.Filter(type="in", dimension="dim", values=["val1", "val2", "val3"]) 394 | ) 395 | expected = { 396 | "type": "in", 397 | "dimension": "dim", 398 | "values": ["val1", "val2", "val3"], 399 | } 400 | assert actual == expected 401 | 402 | def test_not_in_filter(self): 403 | actual = filters.Filter.build_filter( 404 | ~filters.Filter(type="in", dimension="dim", values=["val1", "val2", "val3"]) 405 | ) 406 | expected = { 407 | "type": "not", 408 | "field": { 409 | "type": "in", 410 | "dimension": "dim", 411 | "values": ["val1", "val2", "val3"], 412 | }, 413 | } 414 | assert actual == expected 415 | 416 | def test_invalid_filter(self): 417 | with pytest.raises(NotImplementedError): 418 | filters.Filter(type="invalid", dimension="dim", value="val") 419 | 420 | def test_columnComparison_filter(self): 421 | actual = filters.Filter.build_filter( 422 | filters.Filter( 423 | type="columnComparison", 424 | dimensions=["dim1", dimensions.DimensionSpec("dim2", "dim2")], 425 | ) 426 | ) 427 | expected = { 428 | "type": "columnComparison", 429 | "dimensions": [ 430 | "dim1", 431 | {"type": "default", "dimension": "dim2", "outputName": "dim2"}, 432 | ], 433 | } 434 | assert actual == expected 435 | 436 | def test_search_filter(self): 437 | # Without caseSensitive param - default:false 438 | actual = filters.Filter.build_filter( 439 | filters.Filter(type="search", dimension="dim", value="val") 440 | ) 441 | expected = { 442 | "type": "search", 443 | "dimension": "dim", 444 | "query": {"type": "contains", "caseSensitive": "false", "value": "val"}, 445 | } 446 | assert actual == expected 447 | 448 | # With caseSensitive param 449 | actual = filters.Filter.build_filter( 450 | filters.Filter( 451 | type="search", dimension="dim", value="val", caseSensitive="true" 452 | ) 453 | ) 454 | expected = { 455 | "type": "search", 456 | "dimension": "dim", 457 | "query": {"type": "contains", "caseSensitive": "true", "value": "val"}, 458 | } 459 | assert actual == expected 460 | 461 | def test_like_filter(self): 462 | actual = filters.Filter.build_filter( 463 | filters.Filter(type="like", dimension="dim", pattern="%val%") 464 | ) 465 | expected = {"type": "like", "dimension": "dim", "pattern": "%val%"} 466 | assert actual == expected 467 | --------------------------------------------------------------------------------
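The Cursor implemented in pydruid/db/api.py above is the DB-API entry point: execute() interpolates pyformat-style parameters, posts the SQL statement to the broker, and exposes the streamed result set as an iterator of namedtuples, with fetchone()/fetchmany()/fetchall() layered on top. A minimal usage sketch follows; the broker URL, the "wikipedia" datasource, and its column names are illustrative assumptions and are not part of this repository.

from pydruid.db.api import Cursor

# Assumed broker SQL endpoint; adjust host, port, and path for your cluster.
cursor = Cursor("http://localhost:8082/druid/v2/sql/")

# Parameters use pyformat placeholders and are escaped by `escape()` before
# being substituted into the statement.
cursor.execute(
    "SELECT channel, COUNT(*) AS edits "
    "FROM wikipedia WHERE channel = %(channel)s GROUP BY channel",
    {"channel": "#en.wikipedia"},
)

# Rows arrive as namedtuples built in `_stream_query`; attribute names come
# from the result columns of the SQL statement.
for row in cursor:
    print(row.channel, row.edits)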
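The module-level helpers at the end of pydruid/db/api.py can be exercised without a running Druid cluster. The short sketch below uses only functions defined in that file: apply_parameters() fills pyformat placeholders via escape(), and rows_from_chunks() reassembles JSON rows that the broker splits across HTTP chunks.

from pydruid.db.api import apply_parameters, rows_from_chunks

# `apply_parameters` escapes each value: strings are single-quoted with
# embedded quotes doubled, bools become TRUE/FALSE, and ints/floats pass
# through unchanged.
sql = apply_parameters(
    "SELECT * FROM ds WHERE name = %(name)s AND active = %(active)s",
    {"name": "O'Brien", "active": True},
)
print(sql)  # SELECT * FROM ds WHERE name = 'O''Brien' AND active = TRUE

# `rows_from_chunks` yields each JSON object as soon as its closing brace
# has arrived, even when an object is split across two chunks.
chunks = ['[{"a": 1}, {"a"', ': 2}, {"a": 3}]']
for row in rows_from_chunks(chunks):
    print(dict(row))  # {'a': 1}, then {'a': 2}, then {'a': 3}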