is one of"
26 | @echo " html to make standalone HTML files"
27 | @echo " dirhtml to make HTML files named index.html in directories"
28 | @echo " singlehtml to make a single large HTML file"
29 | @echo " pickle to make pickle files"
30 | @echo " json to make JSON files"
31 | @echo " htmlhelp to make HTML files and an HTML help project"
32 | @echo " qthelp to make HTML files and a qthelp project"
33 | @echo " devhelp to make HTML files and a Devhelp project"
34 | @echo " epub to make an epub"
35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
36 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
38 | @echo " text to make text files"
39 | @echo " man to make manual pages"
40 | @echo " texinfo to make Texinfo files"
41 | @echo " info to make Texinfo files and run them through makeinfo"
42 | @echo " gettext to make PO message catalogs"
43 | @echo " changes to make an overview of all changed/added/deprecated items"
44 | @echo " xml to make Docutils-native XML files"
45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
46 | @echo " linkcheck to check all external links for integrity"
47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
48 |
49 | clean:
50 | rm -rf $(BUILDDIR)/*
51 |
52 | html:
53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
54 | @echo
55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
56 |
57 | dirhtml:
58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
59 | @echo
60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
61 |
62 | singlehtml:
63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
64 | @echo
65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
66 |
67 | pickle:
68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
69 | @echo
70 | @echo "Build finished; now you can process the pickle files."
71 |
72 | json:
73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
74 | @echo
75 | @echo "Build finished; now you can process the JSON files."
76 |
77 | htmlhelp:
78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
79 | @echo
80 | @echo "Build finished; now you can run HTML Help Workshop with the" \
81 | ".hhp project file in $(BUILDDIR)/htmlhelp."
82 |
83 | qthelp:
84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
85 | @echo
86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PyDruid.qhcp"
89 | @echo "To view the help file:"
90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PyDruid.qhc"
91 |
92 | devhelp:
93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
94 | @echo
95 | @echo "Build finished."
96 | @echo "To view the help file:"
97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/PyDruid"
98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PyDruid"
99 | @echo "# devhelp"
100 |
101 | epub:
102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
103 | @echo
104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
105 |
106 | latex:
107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
108 | @echo
109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
111 | "(use \`make latexpdf' here to do that automatically)."
112 |
113 | latexpdf:
114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
115 | @echo "Running LaTeX files through pdflatex..."
116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
118 |
119 | latexpdfja:
120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
121 | @echo "Running LaTeX files through platex and dvipdfmx..."
122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
124 |
125 | text:
126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
127 | @echo
128 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
129 |
130 | man:
131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
132 | @echo
133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
134 |
135 | texinfo:
136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
137 | @echo
138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
139 | @echo "Run \`make' in that directory to run these through makeinfo" \
140 | "(use \`make info' here to do that automatically)."
141 |
142 | info:
143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
144 | @echo "Running Texinfo files through makeinfo..."
145 | make -C $(BUILDDIR)/texinfo info
146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
147 |
148 | gettext:
149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
150 | @echo
151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
152 |
153 | changes:
154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
155 | @echo
156 | @echo "The overview file is in $(BUILDDIR)/changes."
157 |
158 | linkcheck:
159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
160 | @echo
161 | @echo "Link check complete; look for any errors in the above output " \
162 | "or in $(BUILDDIR)/linkcheck/output.txt."
163 |
164 | doctest:
165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
166 | @echo "Testing of doctests in the sources finished, look at the " \
167 | "results in $(BUILDDIR)/doctest/output.txt."
168 |
169 | xml:
170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
171 | @echo
172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
173 |
174 | pseudoxml:
175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
176 | @echo
177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
178 |
--------------------------------------------------------------------------------
/pydruid/utils/postaggregator.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2013 Metamarkets Group Inc.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
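# Base class for Druid "arithmetic" post-aggregators. Instances can be combined
# with +, -, * and / to build nested arithmetic post-aggregations, e.g.
# Field('length') / Field('count') in the README's timeseries example.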
16 | class Postaggregator:
17 | def __init__(self, fn, fields, name):
18 | self.post_aggregator = {
19 | "type": "arithmetic",
20 | "name": name,
21 | "fn": fn,
22 | "fields": fields,
23 | }
24 | self.name = name
25 |
26 | def __mul__(self, other):
27 | return Postaggregator("*", self.fields(other), self.name + "mul" + other.name)
28 |
29 | def __sub__(self, other):
30 | return Postaggregator("-", self.fields(other), self.name + "sub" + other.name)
31 |
32 | def __add__(self, other):
33 | return Postaggregator("+", self.fields(other), self.name + "add" + other.name)
34 |
35 | def __div__(self, other):
36 | return Postaggregator("/", self.fields(other), self.name + "div" + other.name)
37 |
38 | def __truediv__(self, other):
39 | return self.__div__(other)
40 |
41 | def fields(self, other):
42 | return [self.post_aggregator, other.post_aggregator]
43 |
44 | @staticmethod
45 | def build_post_aggregators(postaggs):
46 | def rename_postagg(new_name, post_aggregator):
47 | post_aggregator["name"] = new_name
48 | return post_aggregator
49 |
50 | return [
51 | rename_postagg(new_name, postagg.post_aggregator)
52 | for (new_name, postagg) in postaggs.items()
53 | ]
54 |
55 |
56 | class QuantilesDoublesSketchToQuantile(Postaggregator):
57 | def __init__(self, name: str, field_name: str, fraction: float):
58 | self.post_aggregator = {
59 | "type": "quantilesDoublesSketchToQuantile",
60 | "name": name,
61 | "fraction": fraction,
62 | "field": {
63 | "fieldName": field_name,
64 | "name": field_name,
65 | "type": "fieldAccess",
66 | },
67 | }
68 |
69 |
70 | class Quantile(Postaggregator):
71 | def __init__(self, name, probability):
72 | Postaggregator.__init__(self, None, None, name)
73 | self.post_aggregator = {
74 | "type": "quantile",
75 | "fieldName": name,
76 | "probability": probability,
77 | }
78 |
79 |
80 | class Quantiles(Postaggregator):
81 | def __init__(self, name, probabilities):
82 | Postaggregator.__init__(self, None, None, name)
83 | self.post_aggregator = {
84 | "type": "quantiles",
85 | "fieldName": name,
86 | "probabilities": probabilities,
87 | }
88 |
89 |
90 | class Field(Postaggregator):
91 | def __init__(self, name):
92 | Postaggregator.__init__(self, None, None, name)
93 | self.post_aggregator = {"type": "fieldAccess", "fieldName": name}
94 |
95 |
96 | class Const(Postaggregator):
97 | def __init__(self, value, output_name=None):
98 |
99 | if output_name is None:
100 | name = "const"
101 | else:
102 | name = output_name
103 |
104 | Postaggregator.__init__(self, None, None, name)
105 | self.post_aggregator = {"type": "constant", "name": name, "value": value}
106 |
107 |
108 | class HyperUniqueCardinality(Postaggregator):
109 | def __init__(self, name):
110 | Postaggregator.__init__(self, None, None, name)
111 | self.post_aggregator = {"type": "hyperUniqueCardinality", "fieldName": name}
112 |
113 |
114 | class DoubleGreatest(Postaggregator):
115 | def __init__(self, fields, output_name=None):
116 |
117 | if output_name is None:
118 | name = "doubleGreatest"
119 | else:
120 | name = output_name
121 |
122 | Postaggregator.__init__(self, None, None, name)
123 | self.post_aggregator = {
124 | "type": "doubleGreatest",
125 | "name": name,
126 | "fields": [f.post_aggregator for f in fields],
127 | }
128 |
129 |
130 | class DoubleLeast(Postaggregator):
131 | def __init__(self, fields, output_name=None):
132 |
133 | if output_name is None:
134 | name = "doubleLeast"
135 | else:
136 | name = output_name
137 |
138 | Postaggregator.__init__(self, None, None, name)
139 | self.post_aggregator = {
140 | "type": "doubleLeast",
141 | "name": name,
142 | "fields": [f.post_aggregator for f in fields],
143 | }
144 |
145 |
146 | class LongGreatest(Postaggregator):
147 | def __init__(self, fields, output_name=None):
148 |
149 | if output_name is None:
150 | name = "longGreatest"
151 | else:
152 | name = output_name
153 |
154 | Postaggregator.__init__(self, None, None, name)
155 | self.post_aggregator = {
156 | "type": "longGreatest",
157 | "name": name,
158 | "fields": [f.post_aggregator for f in fields],
159 | }
160 |
161 |
162 | class LongLeast(Postaggregator):
163 | def __init__(self, fields, output_name=None):
164 |
165 | if output_name is None:
166 | name = "longLeast"
167 | else:
168 | name = output_name
169 |
170 | Postaggregator.__init__(self, None, None, name)
171 | self.post_aggregator = {
172 | "type": "longLeast",
173 | "name": name,
174 | "fields": [f.post_aggregator for f in fields],
175 | }
176 |
177 |
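# Theta sketch set operations: ThetaSketch/ThetaSketchOp instances combine with
# & (INTERSECT), | (UNION) and != (NOT) into "thetaSketchSetOp" post-aggregations,
# typically wrapped in a ThetaSketchEstimate (see the README's thetaSketches example).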
178 | class ThetaSketchOp(object):
179 | def __init__(self, fn, fields, name):
180 | self.post_aggregator = {
181 | "type": "thetaSketchSetOp",
182 | "name": name,
183 | "func": fn,
184 | "fields": fields,
185 | }
186 | self.name = name
187 |
188 | def __or__(self, other):
189 | return ThetaSketchOp(
190 | "UNION", self.fields(other), self.name + "_OR_" + other.name
191 | )
192 |
193 | def __and__(self, other):
194 | return ThetaSketchOp(
195 | "INTERSECT", self.fields(other), self.name + "_AND_" + other.name
196 | )
197 |
198 | def __ne__(self, other):
199 | return ThetaSketchOp(
200 | "NOT", self.fields(other), self.name + "_NOT_" + other.name
201 | )
202 |
203 | def fields(self, other):
204 | return [self.post_aggregator, other.post_aggregator]
205 |
206 | @staticmethod
207 | def build_post_aggregators(thetasketchops):
208 | def rename_thetasketchop(new_name, thetasketchop):
209 | thetasketchop["name"] = new_name
210 | return thetasketchop
211 |
212 | return [
213 | rename_thetasketchop(new_name, thetasketchop.post_aggregator)
214 | for (new_name, thetasketchop) in thetasketchops.items()
215 | ]
216 |
217 |
218 | class ThetaSketch(ThetaSketchOp):
219 | def __init__(self, name):
220 | ThetaSketchOp.__init__(self, None, None, name)
221 | self.post_aggregator = {"type": "fieldAccess", "fieldName": name}
222 |
223 |
224 | class ThetaSketchEstimate(Postaggregator):
225 | def __init__(self, fields):
226 | field = (
227 | fields.post_aggregator
228 | if type(fields) in [ThetaSketch, ThetaSketchOp]
229 | else fields
230 | )
231 | self.post_aggregator = {
232 | "type": "thetaSketchEstimate",
233 | "name": "thetasketchestimate",
234 | "field": field,
235 | }
236 | self.name = "thetasketchestimate"
237 |
--------------------------------------------------------------------------------
/pydruid/db/sqlalchemy.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import text, types, util
2 | from sqlalchemy.engine import default
3 | from sqlalchemy.sql import compiler
4 |
5 | import pydruid.db
6 |
7 | RESERVED_SCHEMAS = ["INFORMATION_SCHEMA"]
8 |
9 |
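# Maps the java.sql.Types codes reported in Druid's INFORMATION_SCHEMA.COLUMNS
# (JDBC_TYPE) to SQLAlchemy column types; unmapped codes fall back to NullType
# with a warning (see _map_jdbc_type below).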
10 | jdbc_type_map = {
11 | -6: types.BigInteger,
12 | -5: types.BigInteger,
13 | 1: types.String,
14 | 3: types.Float,
15 | 4: types.BigInteger,
16 | 5: types.BigInteger,
17 | 6: types.Float,
18 | 7: types.Float,
19 | 8: types.Float,
20 | 12: types.String,
21 | 16: types.Boolean,
22 | 91: types.DATE,
23 | 93: types.TIMESTAMP,
24 | 1111: types.BLOB,
25 | }
26 |
27 |
28 | class UniversalSet(object):
29 | def __contains__(self, item):
30 | return True
31 |
32 |
33 | class DruidIdentifierPreparer(compiler.IdentifierPreparer):
34 | reserved_words = UniversalSet()
35 |
36 |
37 | class DruidCompiler(compiler.SQLCompiler):
38 | pass
39 |
40 |
41 | class DruidTypeCompiler(compiler.GenericTypeCompiler):
42 | def visit_REAL(self, type_, **kwargs):
43 | return "DOUBLE"
44 |
45 | def visit_NUMERIC(self, type_, **kwargs):
46 | return "LONG"
47 |
48 | visit_DECIMAL = visit_NUMERIC
49 | visit_INTEGER = visit_NUMERIC
50 | visit_SMALLINT = visit_NUMERIC
51 | visit_BIGINT = visit_NUMERIC
52 | visit_BOOLEAN = visit_NUMERIC
53 | visit_TIMESTAMP = visit_NUMERIC
54 | visit_DATE = visit_NUMERIC
55 |
56 | def visit_CHAR(self, type_, **kwargs):
57 | return "STRING"
58 |
59 | visit_NCHAR = visit_CHAR
60 | visit_VARCHAR = visit_CHAR
61 | visit_NVARCHAR = visit_CHAR
62 | visit_TEXT = visit_CHAR
63 |
64 | def visit_DATETIME(self, type_, **kwargs):
65 | return "LONG"
66 |
67 | def visit_TIME(self, type_, **kwargs):
68 | return "LONG"
69 |
70 | def visit_BLOB(self, type_, **kwargs):
71 | return "COMPLEX"
72 |
73 | visit_CLOB = visit_BLOB
74 | visit_NCLOB = visit_BLOB
75 | visit_VARBINARY = visit_BLOB
76 | visit_BINARY = visit_BLOB
77 |
78 |
79 | class DruidDialect(default.DefaultDialect):
80 |
81 | name = "druid"
82 | scheme = "http"
83 | driver = "rest"
84 | user = None
85 | password = None
86 | preparer = DruidIdentifierPreparer
87 | statement_compiler = DruidCompiler
88 | type_compiler = DruidTypeCompiler
89 | supports_alter = False
90 | supports_pk_autoincrement = False
91 | supports_default_values = False
92 | supports_empty_insert = False
93 | supports_unicode_statements = True
94 | supports_unicode_binds = True
95 | returns_unicode_strings = True
96 | description_encoding = None
97 | supports_native_boolean = True
98 |
99 | def __init__(self, context=None, *args, **kwargs):
100 | super(DruidDialect, self).__init__(*args, **kwargs)
101 | self.context = context or {}
102 |
103 | @classmethod
104 | def dbapi(cls):
105 | return pydruid.db
106 |
107 | def create_connect_args(self, url):
108 | kwargs = {
109 | **url.query,
110 | "host": url.host,
111 | "port": url.port or 8082,
112 | "user": url.username or None,
113 | "password": url.password or None,
114 | "path": url.database,
115 | "scheme": self.scheme,
116 | "context": self.context,
117 | "header": url.query.get("header") == "true",
118 | }
119 | return ([], kwargs)
120 |
121 | def do_ping(self, dbapi_connection) -> bool:
122 | """
123 | Return whether the database can be reached.
124 | """
125 | try:
126 | dbapi_connection.execute(text("SELECT 1"))
127 | except Exception as ex:
128 | return False
129 |
130 | return True
131 |
132 | def get_schema_names(self, connection, **kwargs):
133 | # Each Druid datasource appears as a table in the "druid" schema. This
134 | # is also the default schema, so Druid datasources can be referenced as
135 | # either druid.dataSourceName or simply dataSourceName.
136 | result = connection.execute(
137 | text("SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA")
138 | )
139 |
140 | return [
141 | row.SCHEMA_NAME for row in result if row.SCHEMA_NAME not in RESERVED_SCHEMAS
142 | ]
143 |
144 | def has_table(self, connection, table_name, schema=None):
145 | query = """
146 | SELECT COUNT(*) > 0 AS exists_
147 | FROM INFORMATION_SCHEMA.TABLES
148 | WHERE TABLE_NAME = '{table_name}'
149 | """.format(
150 | table_name=table_name
151 | )
152 |
153 | result = connection.execute(text(query))
154 | return result.fetchone().exists_
155 |
156 | def get_table_names(self, connection, schema=None, **kwargs):
157 | query = "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES"
158 | if schema:
159 | query = "{query} WHERE TABLE_SCHEMA = '{schema}'".format(
160 | query=query, schema=schema
161 | )
162 |
163 | result = connection.execute(text(query))
164 | return [row.TABLE_NAME for row in result]
165 |
166 | def get_view_names(self, connection, schema=None, **kwargs):
167 | return []
168 |
169 | def get_table_options(self, connection, table_name, schema=None, **kwargs):
170 | return {}
171 |
172 | def get_columns(self, connection, table_name, schema=None, **kwargs):
173 | query = """
174 | SELECT COLUMN_NAME,
175 | JDBC_TYPE,
176 | IS_NULLABLE,
177 | COLUMN_DEFAULT
178 | FROM INFORMATION_SCHEMA.COLUMNS
179 | WHERE TABLE_NAME = '{table_name}'
180 | """.format(
181 | table_name=table_name
182 | )
183 | if schema:
184 | query = "{query} AND TABLE_SCHEMA = '{schema}'".format(
185 | query=query, schema=schema
186 | )
187 |
188 | result = connection.execute(text(query))
189 |
190 | return [
191 | {
192 | "name": row.COLUMN_NAME,
193 | "type": self._map_jdbc_type(row),
194 | "nullable": get_is_nullable(row.IS_NULLABLE),
195 | "default": get_default(row.COLUMN_DEFAULT),
196 | }
197 | for row in result
198 | ]
199 |
200 | def get_pk_constraint(self, connection, table_name, schema=None, **kwargs):
201 | return {"constrained_columns": [], "name": None}
202 |
203 | def get_foreign_keys(self, connection, table_name, schema=None, **kwargs):
204 | return []
205 |
206 | def get_check_constraints(self, connection, table_name, schema=None, **kwargs):
207 | return []
208 |
209 | def get_table_comment(self, connection, table_name, schema=None, **kwargs):
210 | return {"text": ""}
211 |
212 | def get_indexes(self, connection, table_name, schema=None, **kwargs):
213 | return []
214 |
215 | def get_unique_constraints(self, connection, table_name, schema=None, **kwargs):
216 | return []
217 |
218 | def get_view_definition(self, connection, view_name, schema=None, **kwargs):
219 | pass
220 |
221 | def do_rollback(self, dbapi_connection):
222 | pass
223 |
224 | def _check_unicode_returns(self, connection, additional_tests=None):
225 | return True
226 |
227 | def _check_unicode_description(self, connection):
228 | return True
229 |
230 | def _map_jdbc_type(self, row):
231 | if row.JDBC_TYPE in jdbc_type_map:
232 | return jdbc_type_map[row.JDBC_TYPE]
233 | util.warn(
234 | "Failed to map column '{row.COLUMN_NAME}' with "
235 | "JDBC type '{row.JDBC_TYPE}' to a sqlalchemy type.".format(row=row)
236 | )
237 | return types.NullType
238 |
239 |
240 | DruidHTTPDialect = DruidDialect
241 |
242 |
243 | class DruidHTTPSDialect(DruidDialect):
244 |
245 | scheme = "https"
246 |
247 |
248 | def get_is_nullable(druid_is_nullable):
249 | # this should be 'YES' or 'NO'; we default to no
250 | return druid_is_nullable.lower() == "yes"
251 |
252 |
253 | def get_default(druid_column_default):
254 | # currently unused, returns ''
255 | return str(druid_column_default) if druid_column_default != "" else None
256 |
--------------------------------------------------------------------------------
/tests/test_client.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import textwrap
3 | import urllib
4 | from io import StringIO
5 | from unittest.mock import Mock, patch
6 |
7 | import pytest
8 |
9 | from pydruid.client import PyDruid
10 | from pydruid.query import Query
11 | from pydruid.utils.aggregators import doublesum
12 | from pydruid.utils.filters import Dimension
13 |
14 |
15 | def create_client(http_headers=None):
16 | return PyDruid("http://localhost:8083", "druid/v2/", http_headers=http_headers)
17 |
18 |
19 | def create_blank_query():
20 | return Query({}, "none")
21 |
22 |
23 | def _http_error(code, msg, data=""):
24 | # Need a file-like object for the response data
25 | fp = StringIO(data)
26 | return urllib.error.HTTPError(
27 | url="http://fakeurl:8080/druid/v2/", hdrs={}, code=code, msg=msg, fp=fp
28 | )
29 |
30 |
31 | class TestPyDruid:
32 | @patch("pydruid.client.urllib.request.urlopen")
33 | def test_druid_returns_error(self, mock_urlopen):
34 | # given
35 | mock_urlopen.side_effect = _http_error(500, "Druid error")
36 | client = create_client()
37 |
38 | # when / then
39 | with pytest.raises(IOError):
40 | client.topn(
41 | datasource="testdatasource",
42 | granularity="all",
43 | intervals="2015-12-29/pt1h",
44 | aggregations={"count": doublesum("count")},
45 | dimension="user_name",
46 | metric="count",
47 | filter=Dimension("user_lang") == "en",
48 | threshold=1,
49 | context={"timeout": 1000},
50 | )
51 |
52 | @patch("pydruid.client.urllib.request.urlopen")
53 | def test_druid_returns_html_error(self, mock_urlopen):
54 | # given
55 | message = textwrap.dedent(
56 | """
 57 |             <html>
 58 |             <head>
 59 |
 60 |             <title>Error 500 </title>
 61 |             </head>
 62 |             <body>
 63 |             <h2>HTTP ERROR: 500</h2>
 64 |             <p>Problem accessing /druid/v2/. Reason:
 65 |             <pre>    javax.servlet.ServletException: java.lang.OutOfMemoryError: GC overhead limit exceeded</pre></p>
 66 |             <hr /><i><small>Powered by Jetty:// 9.3.19.v20170502</small></i>
 67 |             </body>
 68 |             </html>
69 | """
70 | ).strip()
71 | mock_urlopen.side_effect = _http_error(500, "Internal Server Error", message)
72 | client = create_client()
73 |
74 | # when / then
75 | with pytest.raises(IOError) as e:
76 | client.topn(
77 | datasource="testdatasource",
78 | granularity="all",
79 | intervals="2015-12-29/pt1h",
80 | aggregations={"count": doublesum("count")},
81 | dimension="user_name",
82 | metric="count",
83 | filter=Dimension("user_lang") == "en",
84 | threshold=1,
85 | context={"timeout": 1000},
86 | )
87 |
88 | assert (
89 | str(e.value)
90 | == textwrap.dedent(
91 | """
92 | HTTP Error 500: Internal Server Error
93 | Druid Error: javax.servlet.ServletException: java.lang.OutOfMemoryError: GC overhead limit exceeded
94 | Query is: {
95 | "aggregations": [
96 | {
97 | "fieldName": "count",
98 | "name": "count",
99 | "type": "doubleSum"
100 | }
101 | ],
102 | "context": {
103 | "timeout": 1000
104 | },
105 | "dataSource": "testdatasource",
106 | "dimension": "user_name",
107 | "filter": {
108 | "dimension": "user_lang",
109 | "type": "selector",
110 | "value": "en"
111 | },
112 | "granularity": "all",
113 | "intervals": "2015-12-29/pt1h",
114 | "metric": "count",
115 | "queryType": "topN",
116 | "threshold": 1
117 | }
118 | """
119 | ).strip()
120 | )
121 |
122 | @patch("pydruid.client.urllib.request.urlopen")
123 | def test_druid_returns_results(self, mock_urlopen):
124 | # given
125 | response = Mock()
126 | response.read.return_value = """
127 | [ {
128 | "timestamp" : "2015-12-30T14:14:49.000Z",
129 | "result" : [ {
130 | "dimension" : "aaaa",
131 | "metric" : 100
132 | } ]
133 | } ]
134 | """.encode(
135 | "utf-8"
136 | )
137 | mock_urlopen.return_value = response
138 | client = create_client()
139 |
140 | # when
141 | top = client.topn(
142 | datasource="testdatasource",
143 | granularity="all",
144 | intervals="2015-12-29/pt1h",
145 | aggregations={"count": doublesum("count")},
146 | dimension="user_name",
147 | metric="count",
148 | filter=Dimension("user_lang") == "en",
149 | threshold=1,
150 | context={"timeout": 1000},
151 | )
152 |
153 | # then
154 | assert top is not None
155 | assert len(top.result) == 1
156 | assert len(top.result[0]["result"]) == 1
157 |
158 | @patch("pydruid.client.urllib.request.urlopen")
159 | def test_client_allows_to_export_last_query(self, mock_urlopen):
160 | # given
161 | response = Mock()
162 | response.read.return_value = """
163 | [ {
164 | "timestamp" : "2015-12-30T14:14:49.000Z",
165 | "result" : [ {
166 | "dimension" : "aaaa",
167 | "metric" : 100
168 | } ]
169 | } ]
170 | """.encode(
171 | "utf-8"
172 | )
173 | mock_urlopen.return_value = response
174 | client = create_client()
175 | client.topn(
176 | datasource="testdatasource",
177 | granularity="all",
178 | intervals="2015-12-29/pt1h",
179 | aggregations={"count": doublesum("count")},
180 | dimension="user_name",
181 | metric="count",
182 | filter=Dimension("user_lang") == "en",
183 | threshold=1,
184 | context={"timeout": 1000},
185 | )
186 |
187 | # when / then
188 | # assert that last_query.export_tsv method was called (it should throw an exception, given empty path)
189 | with pytest.raises(TypeError):
190 | client.export_tsv(None)
191 |
192 | def test_client_auth_creds(self):
193 | client = create_client()
194 | query = create_blank_query()
195 | client.set_basic_auth_credentials("myUsername", "myPassword")
196 | headers, _, _ = client._prepare_url_headers_and_body(query)
197 | assert headers["Authorization"] == "Basic bXlVc2VybmFtZTpteVBhc3N3b3Jk"
198 |
199 | def test_client_custom_headers(self):
200 | client = create_client(http_headers = {"custom-header": "test"})
201 | query = create_blank_query()
202 | headers, _, _ = client._prepare_url_headers_and_body(query)
203 | assert headers["custom-header"] == "test"
204 |
205 | @patch("pydruid.client.urllib.request.urlopen")
206 | @patch("pydruid.client.ssl.create_default_context")
207 | def test_client_with_cafile(self, mock_create_default_context, mock_urlopen):
208 | response = Mock()
209 | response.read.return_value = """
210 | [ {
211 | "timestamp" : "2015-12-30T14:14:49.000Z",
212 | "result" : [ {
213 | "dimension" : "aaaa",
214 | "metric" : 100
215 | } ]
216 | } ]
217 | """.encode(
218 | "utf-8"
219 | )
220 | mock_urlopen.return_value = response
221 |
222 | client = PyDruid("http://localhost:8083", "druid/v2/", cafile="tests/cert.pem")
223 |
224 | mock_create_default_context.assert_called_once()
225 | context = mock_create_default_context.return_value
226 | context.load_verify_locations.assert_called_once_with(cafile="tests/cert.pem")
227 | assert client.context == context
228 |
229 | client.topn()
230 | assert mock_urlopen.called_with(context=client.context)
231 |
--------------------------------------------------------------------------------
/tests/utils/test_aggregators.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 | from copy import deepcopy
4 | from operator import itemgetter
5 |
6 | from pydruid.utils import aggregators, filters
7 |
8 |
9 | class TestAggregators:
10 | def test_aggregators(self):
11 | aggs = [
12 | ("longsum", "longSum"),
13 | ("longmin", "longMin"),
14 | ("longmax", "longMax"),
15 | ("doublesum", "doubleSum"),
16 | ("doublemin", "doubleMin"),
17 | ("doublemax", "doubleMax"),
18 | ("count", "count"),
19 | ("hyperunique", "hyperUnique"),
20 | ("stringfirst", "stringFirst"),
21 | ("stringlast", "stringLast"),
22 | ]
23 | aggs_funcs = [
24 | (getattr(aggregators, agg_name), agg_type) for agg_name, agg_type in aggs
25 | ]
26 | for f, agg_type in aggs_funcs:
27 | assert f("metric") == {"type": agg_type, "fieldName": "metric"}
28 |
29 | def test_filtered_aggregator(self):
30 | filter_ = filters.Filter(dimension="dim", value="val")
31 | aggs = [
32 | aggregators.count("metric1"),
33 | aggregators.longsum("metric2"),
34 | aggregators.doublesum("metric3"),
35 | aggregators.doublemin("metric4"),
36 | aggregators.doublemax("metric5"),
37 | aggregators.hyperunique("metric6"),
38 | aggregators.cardinality("dim1"),
39 | aggregators.cardinality(["dim1", "dim2"], by_row=True),
40 | aggregators.thetasketch("dim1"),
41 | aggregators.thetasketch("metric7"),
42 | aggregators.thetasketch("metric8", isinputthetasketch=True, size=8192),
43 | ]
44 | for agg in aggs:
45 | expected = {
46 | "type": "filtered",
47 | "filter": {"type": "selector", "dimension": "dim", "value": "val"},
48 | "aggregator": agg,
49 | }
50 | actual = aggregators.filtered(filter_, agg)
51 | assert actual == expected
52 |
53 | def test_nested_filtered_aggregator(self):
54 | filter1 = filters.Filter(dimension="dim1", value="val")
55 | filter2 = filters.Filter(dimension="dim2", value="val")
56 | agg = aggregators.filtered(
57 | filter1, aggregators.filtered(filter2, aggregators.count("metric1"))
58 | )
59 | actual = aggregators.build_aggregators({"agg_name": agg})
60 | # the innermost aggregation must have 'agg_name'
61 | expected = [
62 | {
63 | "type": "filtered",
64 | "aggregator": {
65 | "type": "filtered",
66 | "aggregator": {
67 | "fieldName": "metric1",
68 | "type": "count",
69 | "name": "agg_name",
70 | },
71 | "filter": {"dimension": "dim2", "value": "val", "type": "selector"},
72 | },
73 | "filter": {"dimension": "dim1", "value": "val", "type": "selector"},
74 | }
75 | ]
76 | assert expected == actual
77 |
78 | def test_build_aggregators(self):
79 | agg_input = {
80 | "agg1": aggregators.count("metric1"),
81 | "agg2": aggregators.longsum("metric2"),
82 | "agg3": aggregators.doublesum("metric3"),
83 | "agg4": aggregators.doublemin("metric4"),
84 | "agg5": aggregators.doublemax("metric5"),
85 | "agg6": aggregators.hyperunique("metric6"),
86 | "agg7": aggregators.cardinality("dim1"),
87 | "agg8": aggregators.cardinality(["dim1", "dim2"], by_row=True),
88 | "agg9": aggregators.thetasketch("dim1"),
89 | "agg10": aggregators.thetasketch("metric7"),
90 | "agg11": aggregators.thetasketch(
91 | "metric8", isinputthetasketch=True, size=8192
92 | ),
93 | }
94 | built_agg = aggregators.build_aggregators(agg_input)
95 | expected = [
96 | {"name": "agg1", "type": "count", "fieldName": "metric1"},
97 | {"name": "agg2", "type": "longSum", "fieldName": "metric2"},
98 | {"name": "agg3", "type": "doubleSum", "fieldName": "metric3"},
99 | {"name": "agg4", "type": "doubleMin", "fieldName": "metric4"},
100 | {"name": "agg5", "type": "doubleMax", "fieldName": "metric5"},
101 | {"name": "agg6", "type": "hyperUnique", "fieldName": "metric6"},
102 | {
103 | "name": "agg7",
104 | "type": "cardinality",
105 | "fieldNames": ["dim1"],
106 | "byRow": False,
107 | },
108 | {
109 | "name": "agg8",
110 | "type": "cardinality",
111 | "fieldNames": ["dim1", "dim2"],
112 | "byRow": True,
113 | },
114 | {
115 | "name": "agg9",
116 | "type": "thetaSketch",
117 | "fieldName": "dim1",
118 | "isInputThetaSketch": False,
119 | "size": 16384,
120 | },
121 | {
122 | "name": "agg10",
123 | "type": "thetaSketch",
124 | "fieldName": "metric7",
125 | "isInputThetaSketch": False,
126 | "size": 16384,
127 | },
128 | {
129 | "name": "agg11",
130 | "type": "thetaSketch",
131 | "fieldName": "metric8",
132 | "isInputThetaSketch": True,
133 | "size": 8192,
134 | },
135 | ]
136 | assert sorted(built_agg, key=itemgetter("name")) == sorted(
137 | expected, key=itemgetter("name")
138 | )
139 |
140 | def test_build_filtered_aggregator(self):
141 | filter_ = filters.Filter(dimension="dim", value="val")
142 | agg_input = {
143 | "agg1": aggregators.filtered(filter_, aggregators.count("metric1")),
144 | "agg2": aggregators.filtered(filter_, aggregators.longsum("metric2")),
145 | "agg3": aggregators.filtered(filter_, aggregators.doublesum("metric3")),
146 | "agg4": aggregators.filtered(filter_, aggregators.doublemin("metric4")),
147 | "agg5": aggregators.filtered(filter_, aggregators.doublemax("metric5")),
148 | "agg6": aggregators.filtered(filter_, aggregators.hyperunique("metric6")),
149 | "agg7": aggregators.filtered(filter_, aggregators.cardinality("dim1")),
150 | "agg8": aggregators.filtered(
151 | filter_, aggregators.cardinality(["dim1", "dim2"], by_row=True)
152 | ),
153 | "agg9": aggregators.filtered(filter_, aggregators.thetasketch("dim1")),
154 | "agg10": aggregators.filtered(filter_, aggregators.thetasketch("metric7")),
155 | "agg11": aggregators.filtered(
156 | filter_,
157 | aggregators.thetasketch("metric8", isinputthetasketch=True, size=8192),
158 | ),
159 | }
160 | base = {
161 | "type": "filtered",
162 | "filter": {"type": "selector", "dimension": "dim", "value": "val"},
163 | }
164 |
165 | aggs = [
166 | {"name": "agg1", "type": "count", "fieldName": "metric1"},
167 | {"name": "agg2", "type": "longSum", "fieldName": "metric2"},
168 | {"name": "agg3", "type": "doubleSum", "fieldName": "metric3"},
169 | {"name": "agg4", "type": "doubleMin", "fieldName": "metric4"},
170 | {"name": "agg5", "type": "doubleMax", "fieldName": "metric5"},
171 | {"name": "agg6", "type": "hyperUnique", "fieldName": "metric6"},
172 | {
173 | "name": "agg7",
174 | "type": "cardinality",
175 | "fieldNames": ["dim1"],
176 | "byRow": False,
177 | },
178 | {
179 | "name": "agg8",
180 | "type": "cardinality",
181 | "fieldNames": ["dim1", "dim2"],
182 | "byRow": True,
183 | },
184 | {
185 | "name": "agg9",
186 | "type": "thetaSketch",
187 | "fieldName": "dim1",
188 | "isInputThetaSketch": False,
189 | "size": 16384,
190 | },
191 | {
192 | "name": "agg10",
193 | "type": "thetaSketch",
194 | "fieldName": "metric7",
195 | "isInputThetaSketch": False,
196 | "size": 16384,
197 | },
198 | {
199 | "name": "agg11",
200 | "type": "thetaSketch",
201 | "fieldName": "metric8",
202 | "isInputThetaSketch": True,
203 | "size": 8192,
204 | },
205 | ]
206 | expected = []
207 | for agg in aggs:
208 | exp = deepcopy(base)
209 | exp.update({"aggregator": agg})
210 | expected.append(exp)
211 |
212 | built_agg = aggregators.build_aggregators(agg_input)
213 | actual = sorted(
214 | built_agg, key=lambda k: itemgetter("name")(itemgetter("aggregator")(k))
215 | )
216 | expected = sorted(
217 | expected, key=lambda k: itemgetter("name")(itemgetter("aggregator")(k))
218 | )
219 | assert expected == actual
220 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # PyDruid documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Mar 3 16:38:17 2014.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import os
16 | import sys
17 |
18 | # If extensions (or modules to document with autodoc) are in another directory,
19 | # add these directories to sys.path here. If the directory is relative to the
20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
21 | # sys.path.insert(0, os.path.abspath('.'))
22 |
23 | sys.path.insert(0, os.path.abspath("../../pydruid"))
24 | sys.path.insert(0, os.path.abspath("../../pydruid/pydruid"))
25 |
26 | # -- General configuration ------------------------------------------------
27 |
28 | # If your documentation needs a minimal Sphinx version, state it here.
29 | # needs_sphinx = '1.0'
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.doctest"]
35 |
36 | # Add any paths that contain templates here, relative to this directory.
37 | templates_path = ["_templates"]
38 |
39 | # The suffix of source filenames.
40 | source_suffix = ".rst"
41 |
42 | # The encoding of source files.
43 | # source_encoding = 'utf-8-sig'
44 |
45 | # The master toctree document.
46 | master_doc = "index"
47 |
48 | # General information about the project.
49 | project = u"PyDruid"
50 | copyright = u"2014, Deep Ganguli"
51 |
52 | # The version info for the project you're documenting, acts as replacement for
53 | # |version| and |release|, also used in various other places throughout the
54 | # built documents.
55 | #
56 | # The short X.Y version.
57 | version = "0.2.0"
58 | # The full version, including alpha/beta/rc tags.
59 | release = "0.2.0"
60 |
61 | # The language for content autogenerated by Sphinx. Refer to documentation
62 | # for a list of supported languages.
63 | # language = None
64 |
65 | # There are two options for replacing |today|: either, you set today to some
66 | # non-false value, then it is used:
67 | # today = ''
68 | # Else, today_fmt is used as the format for a strftime call.
69 | # today_fmt = '%B %d, %Y'
70 |
71 | # List of patterns, relative to source directory, that match files and
72 | # directories to ignore when looking for source files.
73 | exclude_patterns = []
74 |
75 | # The reST default role (used for this markup: `text`) to use for all
76 | # documents.
77 | # default_role = None
78 |
79 | # If true, '()' will be appended to :func: etc. cross-reference text.
80 | # add_function_parentheses = True
81 |
82 | # If true, the current module name will be prepended to all description
83 | # unit titles (such as .. function::).
84 | # add_module_names = True
85 |
86 | # If true, sectionauthor and moduleauthor directives will be shown in the
87 | # output. They are ignored by default.
88 | # show_authors = False
89 |
90 | # The name of the Pygments (syntax highlighting) style to use.
91 | pygments_style = "sphinx"
92 |
93 | # A list of ignored prefixes for module index sorting.
94 | # modindex_common_prefix = []
95 |
96 | # If true, keep warnings as "system message" paragraphs in the built documents.
97 | # keep_warnings = False
98 |
99 |
100 | # -- Options for HTML output ----------------------------------------------
101 |
102 | # The theme to use for HTML and HTML Help pages. See the documentation for
103 | # a list of builtin themes.
104 | html_theme = "default"
105 |
106 | # Theme options are theme-specific and customize the look and feel of a theme
107 | # further. For a list of options available for each theme, see the
108 | # documentation.
109 | # html_theme_options = {}
110 |
111 | # Add any paths that contain custom themes here, relative to this directory.
112 | # html_theme_path = []
113 |
114 | # The name for this set of Sphinx documents. If None, it defaults to
115 | # " v documentation".
116 | # html_title = None
117 |
118 | # A shorter title for the navigation bar. Default is the same as html_title.
119 | # html_short_title = None
120 |
121 | # The name of an image file (relative to this directory) to place at the top
122 | # of the sidebar.
123 | # html_logo = None
124 |
125 | # The name of an image file (within the static path) to use as favicon of the
126 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
127 | # pixels large.
128 | # html_favicon = None
129 |
130 | # Add any paths that contain custom static files (such as style sheets) here,
131 | # relative to this directory. They are copied after the builtin static files,
132 | # so a file named "default.css" will overwrite the builtin "default.css".
133 | html_static_path = ["_static"]
134 |
135 | # Add any extra paths that contain custom files (such as robots.txt or
136 | # .htaccess) here, relative to this directory. These files are copied
137 | # directly to the root of the documentation.
138 | # html_extra_path = []
139 |
140 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
141 | # using the given strftime format.
142 | # html_last_updated_fmt = '%b %d, %Y'
143 |
144 | # If true, SmartyPants will be used to convert quotes and dashes to
145 | # typographically correct entities.
146 | # html_use_smartypants = True
147 |
148 | # Custom sidebar templates, maps document names to template names.
149 | # html_sidebars = {}
150 |
151 | # Additional templates that should be rendered to pages, maps page names to
152 | # template names.
153 | # html_additional_pages = {}
154 |
155 | # If false, no module index is generated.
156 | # html_domain_indices = True
157 |
158 | # If false, no index is generated.
159 | # html_use_index = True
160 |
161 | # If true, the index is split into individual pages for each letter.
162 | # html_split_index = False
163 |
164 | # If true, links to the reST sources are added to the pages.
165 | # html_show_sourcelink = True
166 |
167 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
168 | # html_show_sphinx = True
169 |
170 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
171 | # html_show_copyright = True
172 |
173 | # If true, an OpenSearch description file will be output, and all pages will
174 | # contain a <link> tag referring to it. The value of this option must be the
175 | # base URL from which the finished HTML is served.
176 | # html_use_opensearch = ''
177 |
178 | # This is the file name suffix for HTML files (e.g. ".xhtml").
179 | # html_file_suffix = None
180 |
181 | # Output file base name for HTML help builder.
182 | htmlhelp_basename = "PyDruiddoc"
183 |
184 |
185 | # -- Options for LaTeX output ---------------------------------------------
186 |
187 | latex_elements = {
188 | # The paper size ('letterpaper' or 'a4paper').
189 | #'papersize': 'letterpaper',
190 | # The font size ('10pt', '11pt' or '12pt').
191 | #'pointsize': '10pt',
192 | # Additional stuff for the LaTeX preamble.
193 | #'preamble': '',
194 | }
195 |
196 | # Grouping the document tree into LaTeX files. List of tuples
197 | # (source start file, target name, title,
198 | # author, documentclass [howto, manual, or own class]).
199 | latex_documents = [
200 | ("index", "PyDruid.tex", u"PyDruid Documentation", u"Deep Ganguli", "manual")
201 | ]
202 |
203 | # The name of an image file (relative to this directory) to place at the top of
204 | # the title page.
205 | # latex_logo = None
206 |
207 | # For "manual" documents, if this is true, then toplevel headings are parts,
208 | # not chapters.
209 | # latex_use_parts = False
210 |
211 | # If true, show page references after internal links.
212 | # latex_show_pagerefs = False
213 |
214 | # If true, show URL addresses after external links.
215 | # latex_show_urls = False
216 |
217 | # Documents to append as an appendix to all manuals.
218 | # latex_appendices = []
219 |
220 | # If false, no module index is generated.
221 | # latex_domain_indices = True
222 |
223 |
224 | # -- Options for manual page output ---------------------------------------
225 |
226 | # One entry per manual page. List of tuples
227 | # (source start file, name, description, authors, manual section).
228 | man_pages = [("index", "pydruid", u"PyDruid Documentation", [u"Deep Ganguli"], 1)]
229 |
230 | # If true, show URL addresses after external links.
231 | # man_show_urls = False
232 |
233 |
234 | # -- Options for Texinfo output -------------------------------------------
235 |
236 | # Grouping the document tree into Texinfo files. List of tuples
237 | # (source start file, target name, title, author,
238 | # dir menu entry, description, category)
239 | texinfo_documents = [
240 | (
241 | "index",
242 | "PyDruid",
243 | u"PyDruid Documentation",
244 | u"Deep Ganguli",
245 | "PyDruid",
246 | "One line description of project.",
247 | "Miscellaneous",
248 | )
249 | ]
250 |
251 | # Documents to append as an appendix to all manuals.
252 | # texinfo_appendices = []
253 |
254 | # If false, no module index is generated.
255 | # texinfo_domain_indices = True
256 |
257 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
258 | # texinfo_show_urls = 'footnote'
259 |
260 | # If true, do not generate a @detailmenu in the "Top" node's menu.
261 | # texinfo_no_detailmenu = False
262 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pydruid
2 |
3 | pydruid exposes a simple API to create, execute, and analyze [Druid](http://druid.io/) queries. pydruid can parse query results into [Pandas](http://pandas.pydata.org/) DataFrame objects for subsequent data analysis -- this offers a tight integration between [Druid](http://druid.io/), the [SciPy](http://www.scipy.org/stackspec.html) stack (for scientific computing) and [scikit-learn](http://scikit-learn.org/stable/) (for machine learning). pydruid can export query results into TSV or JSON for further processing with your favorite tool, e.g., R, Julia, Matlab, Excel. It provides both synchronous and asynchronous clients.
4 |
5 | Additionally, pydruid implements the [Python DB API 2.0](https://www.python.org/dev/peps/pep-0249/), a [SQLAlchemy dialect](http://docs.sqlalchemy.org/en/latest/dialects/), and provides a command-line interface to interact with Druid.
6 |
7 | To install:
8 | ```python
9 | pip install pydruid
10 | # or, if you intend to use asynchronous client
11 | pip install pydruid[async]
12 | # or, if you intend to export query results into pandas
13 | pip install pydruid[pandas]
14 | # or, if you intend to do both
15 | pip install pydruid[async, pandas]
16 | # or, if you want to use the SQLAlchemy engine
17 | pip install pydruid[sqlalchemy]
18 | # or, if you want to use the CLI
19 | pip install pydruid[cli]
20 | ```
21 | Documentation: https://pythonhosted.org/pydruid/.
22 |
23 | # examples
24 |
25 | The following examples show how to execute and analyze the results of three types of queries: timeseries, topN, and groupby. We will use these queries to ask simple questions about Twitter's public data set.
26 |
27 | ## timeseries
28 |
29 | What was the average tweet length, per day, surrounding the 2014 Sochi olympics?
30 |
31 | ```python
32 | from pydruid.client import *
33 | from pylab import plt
34 |
35 | query = PyDruid(druid_url_goes_here, 'druid/v2')
36 |
37 | ts = query.timeseries(
38 | datasource='twitterstream',
39 | granularity='day',
40 | intervals='2014-02-02/p4w',
41 | aggregations={'length': doublesum('tweet_length'), 'count': doublesum('count')},
42 | post_aggregations={'avg_tweet_length': (Field('length') / Field('count'))},
43 | filter=Dimension('first_hashtag') == 'sochi2014'
44 | )
45 | df = query.export_pandas()
46 | df['timestamp'] = df['timestamp'].map(lambda x: x.split('T')[0])
47 | df.plot(x='timestamp', y='avg_tweet_length', ylim=(80, 140), rot=20,
48 | title='Sochi 2014')
49 | plt.ylabel('avg tweet length (chars)')
50 | plt.show()
51 | ```
52 |
53 | 
54 |
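As noted in the intro, query results can also be exported to TSV for further processing in other tools (R, Julia, Excel, ...). A minimal sketch, reusing the `query` client and the timeseries call above (the output path is illustrative):

```python
# The client keeps the rows of the last query it executed, so after the
# timeseries call above they can be written straight to a TSV file.
query.export_tsv('avg_tweet_length.tsv')
```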
55 | ## topN
56 |
57 | Who were the top ten mentions (@user_name) during the 2014 Oscars?
58 |
59 | ```python
60 | top = query.topn(
61 | datasource='twitterstream',
62 | granularity='all',
63 | intervals='2014-03-03/p1d', # utc time of 2014 oscars
64 | aggregations={'count': doublesum('count')},
65 | dimension='user_mention_name',
66 | filter=(Dimension('user_lang') == 'en') & (Dimension('first_hashtag') == 'oscars') &
67 | (Dimension('user_time_zone') == 'Pacific Time (US & Canada)') &
68 | ~(Dimension('user_mention_name') == 'No Mention'),
69 | metric='count',
70 | threshold=10
71 | )
72 |
73 | df = query.export_pandas()
74 | print(df)
75 |
76 | count timestamp user_mention_name
77 | 0 1303 2014-03-03T00:00:00.000Z TheEllenShow
78 | 1 44 2014-03-03T00:00:00.000Z TheAcademy
79 | 2 21 2014-03-03T00:00:00.000Z MTV
80 | 3 21 2014-03-03T00:00:00.000Z peoplemag
81 | 4 17 2014-03-03T00:00:00.000Z THR
82 | 5 16 2014-03-03T00:00:00.000Z ItsQueenElsa
83 | 6 16 2014-03-03T00:00:00.000Z eonline
84 | 7 15 2014-03-03T00:00:00.000Z PerezHilton
85 | 8 14 2014-03-03T00:00:00.000Z realjohngreen
86 | 9 12 2014-03-03T00:00:00.000Z KevinSpacey
87 |
88 | ```
89 |
90 | ## groupby
91 |
92 | What does the social network of users replying to other users look like?
93 |
94 | ```python
95 | from igraph import *
96 | from cairo import *
97 | from pandas import concat
98 |
99 | group = query.groupby(
100 | datasource='twitterstream',
101 | granularity='hour',
102 | intervals='2013-10-04/pt12h',
103 | dimensions=["user_name", "reply_to_name"],
104 | filter=(~(Dimension("reply_to_name") == "Not A Reply")) &
105 | (Dimension("user_location") == "California"),
106 | aggregations={"count": doublesum("count")}
107 | )
108 |
109 | df = query.export_pandas()
110 |
111 | # map names to categorical variables with a lookup table
112 | names = concat([df['user_name'], df['reply_to_name']]).unique()
113 | nameLookup = dict([pair[::-1] for pair in enumerate(names)])
114 | df['user_name_lookup'] = df['user_name'].map(nameLookup.get)
115 | df['reply_to_name_lookup'] = df['reply_to_name'].map(nameLookup.get)
116 |
117 | # create the graph with igraph
118 | g = Graph(len(names), directed=False)
119 | vertices = zip(df['user_name_lookup'], df['reply_to_name_lookup'])
120 | g.vs["name"] = names
121 | g.add_edges(vertices)
122 | layout = g.layout_fruchterman_reingold()
123 | plot(g, "tweets.png", layout=layout, vertex_size=2, bbox=(400, 400), margin=25, edge_width=1, vertex_color="blue")
124 | ```
125 |
126 | 
127 |
128 | # asynchronous client
129 | ```pydruid.async_client.AsyncPyDruid``` implements an asynchronous client. To achieve that, it utilizes an asynchronous
130 | HTTP client from the ```Tornado``` framework. The asynchronous client is suitable for use with async frameworks such as Tornado
131 | and provides much better performance at scale. It lets you serve multiple requests at the same time, without blocking on
132 | Druid executing your queries.
133 |
134 | ## example
135 | ```python
136 | from tornado import gen
137 | from pydruid.async_client import AsyncPyDruid
138 | from pydruid.utils.aggregators import doublesum
139 | from pydruid.utils.filters import Dimension
140 |
141 | client = AsyncPyDruid(url_to_druid_broker, 'druid/v2')
142 |
143 | @gen.coroutine
144 | def your_asynchronous_method_serving_top10_mentions_for_day(day):
145 | top_mentions = yield client.topn(
146 | datasource='twitterstream',
147 | granularity='all',
148 | intervals="%s/p1d" % (day, ),
149 | aggregations={'count': doublesum('count')},
150 | dimension='user_mention_name',
151 | filter=(Dimension('user_lang') == 'en') & (Dimension('first_hashtag') == 'oscars') &
152 | (Dimension('user_time_zone') == 'Pacific Time (US & Canada)') &
153 | ~(Dimension('user_mention_name') == 'No Mention'),
154 | metric='count',
155 | threshold=10)
156 |
157 | # asynchronously return results
158 | # can be simply ```return top_mentions``` in python 3.x
159 | raise gen.Return(top_mentions)
160 | ```
161 |
162 |
163 | # thetaSketches
164 | Theta sketch post-aggregators are built slightly differently from normal post-aggregators, as they use different operators.
165 | Note: you must have the ```druid-datasketches``` extension loaded into your Druid cluster in order to use these.
166 | See the [Druid datasketches](http://druid.io/docs/latest/development/extensions-core/datasketches-aggregators.html) documentation for details.
167 |
168 | ```python
169 | from pydruid.client import *
170 | from pydruid.utils import aggregators
171 | from pydruid.utils import filters
172 | from pydruid.utils import postaggregator
173 |
174 | query = PyDruid(url_to_druid_broker, 'druid/v2')
175 | ts = query.groupby(
176 | datasource='test_datasource',
177 | granularity='all',
178 | intervals='2016-09-01/P1M',
179 | filter = ( filters.Dimension('product').in_(['product_A', 'product_B'])),
180 | aggregations={
181 | 'product_A_users': aggregators.filtered(
182 | filters.Dimension('product') == 'product_A',
183 | aggregators.thetasketch('user_id')
184 | ),
185 | 'product_B_users': aggregators.filtered(
186 | filters.Dimension('product') == 'product_B',
187 | aggregators.thetasketch('user_id')
188 | )
189 | },
190 | post_aggregations={
191 | 'both_A_and_B': postaggregator.ThetaSketchEstimate(
192 | postaggregator.ThetaSketch('product_A_users') & postaggregator.ThetaSketch('product_B_users')
193 | )
194 | }
195 | )
196 | ```
197 |
198 | # DB API
199 |
200 | ```python
201 | from pydruid.db import connect
202 |
203 | conn = connect(host='localhost', port=8082, path='/druid/v2/sql/', scheme='http')
204 | curs = conn.cursor()
205 | curs.execute("""
206 | SELECT place,
207 | CAST(REGEXP_EXTRACT(place, '(.*),', 1) AS FLOAT) AS lat,
208 | CAST(REGEXP_EXTRACT(place, ',(.*)', 1) AS FLOAT) AS lon
209 | FROM places
210 | LIMIT 10
211 | """)
212 | for row in curs:
213 | print(row)
214 | ```
215 |
216 | # SQLAlchemy
217 |
218 | ```python
219 | from sqlalchemy import *
220 | from sqlalchemy.engine import create_engine
221 | from sqlalchemy.schema import *
222 |
223 | engine = create_engine('druid://localhost:8082/druid/v2/sql/') # uses HTTP by default :(
224 | # engine = create_engine('druid+http://localhost:8082/druid/v2/sql/')
225 | # engine = create_engine('druid+https://localhost:8082/druid/v2/sql/')
226 |
227 | places = Table('places', MetaData(bind=engine), autoload=True)
228 | print(select([func.count('*')], from_obj=places).scalar())
229 | ```
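Because this is an ordinary SQLAlchemy engine, it also plugs into anything that accepts one, such as pandas. A minimal sketch, assuming the same `places` datasource as above:

```python
import pandas as pd

# pandas reads Druid SQL results through the SQLAlchemy engine created above.
df = pd.read_sql('SELECT place FROM places LIMIT 10', con=engine)
print(df.head())
```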
230 |
231 |
232 | ## Column headers
233 |
234 | In version 0.13.0, Druid SQL added support for including the column names in the
235 | response, which can be requested via the "header" field in the request. This
236 | helps to ensure that the cursor description is defined (which is a requirement
237 | for SQLAlchemy query statements) regardless of whether the result set contains
238 | any rows. Historically this was problematic for result sets which contained no
239 | rows, as one could not infer the expected column names.
240 |
241 | Enabling the header can be configured via the SQLAlchemy URI by using the query
242 | parameter, i.e.,
243 |
244 | ```python
245 | engine = create_engine('druid://localhost:8082/druid/v2/sql?header=true')
246 | ```
247 |
248 | Note that the current default is `false` to ensure backwards compatibility, but it
249 | should be set to `true` for Druid versions >= 0.13.0.
250 |
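The `header` flag is forwarded to the DB API layer as well (see `create_connect_args` in `pydruid/db/sqlalchemy.py` above), so it can also be passed directly to `connect()` when using `pydruid.db`. A minimal sketch:

```python
from pydruid.db import connect

# header=True asks Druid (>= 0.13.0) to return column names with the result,
# so cursor.description is populated even when no rows come back.
conn = connect(host='localhost', port=8082, path='/druid/v2/sql/',
               scheme='http', header=True)
```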
251 |
252 | # Command line
253 |
254 | ```bash
255 | $ pydruid http://localhost:8082/druid/v2/sql/
256 | > SELECT COUNT(*) AS cnt FROM places
257 | cnt
258 | -----
259 | 12345
260 | > SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES;
261 | TABLE_NAME
262 | ----------
263 | test_table
264 | COLUMNS
265 | SCHEMATA
266 | TABLES
267 | > BYE;
268 | GoodBye!
269 | ```
270 |
271 | # Contributing
272 |
273 | Contributions are welcome, of course. We like to use `black` and `flake8`.
274 |
275 | ```bash
276 | pip install -r requirements-dev.txt # installs useful dev deps
277 | pre-commit install # installs useful commit hooks
278 | ```
279 |
--------------------------------------------------------------------------------
/pydruid/utils/filters.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2013 Metamarkets Group Inc.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | try:
17 | import simplejson as json
18 | except ImportError:
19 | import json
20 |
21 | from .dimensions import build_dimension
22 |
23 |
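# Druid query filter. Keyword arguments select the filter type (default
# "selector"); Filter instances compose with & (and), | (or) and ~ (not),
# as used throughout the README examples.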
24 | class Filter:
25 |
26 | # filter types supporting extraction function
27 | _FILTERS_WITH_EXTR_FN = (
28 | "selector",
29 | "regex",
30 | "javascript",
31 | "in",
32 | "bound",
33 | "interval",
34 | "extraction",
35 | )
36 |
37 | def __init__(self, extraction_function=None, ordering="lexicographic", **args):
38 |
39 | type_ = args.get("type", "selector")
40 |
41 | if extraction_function is not None:
42 | if type_ not in self._FILTERS_WITH_EXTR_FN:
43 | raise ValueError(
44 | "Filter of type {0} doesn't support "
45 | "extraction function".format(type_)
46 | )
47 | elif type_ == "extraction":
48 | raise ValueError(
49 |                 "Filter of type extraction requires an extraction function"
50 | )
51 |
52 | self.extraction_function = extraction_function
53 |
54 | self.filter = {"filter": {"type": type_}}
55 |
56 | if type_ == "selector":
57 | self.filter["filter"].update(
58 | {"dimension": args["dimension"], "value": args["value"]}
59 | )
60 | elif type_ == "javascript":
61 | self.filter["filter"].update(
62 | {"dimension": args["dimension"], "function": args["function"]}
63 | )
64 | elif type_ == "and":
65 | self.filter["filter"].update({"fields": args["fields"]})
66 | elif type_ == "or":
67 | self.filter["filter"].update({"fields": args["fields"]})
68 | elif type_ == "not":
69 | self.filter["filter"].update({"field": args["field"]})
70 | elif type_ == "in":
71 | self.filter["filter"].update(
72 | {"dimension": args["dimension"], "values": args["values"]}
73 | )
74 | elif type_ == "regex":
75 | self.filter["filter"].update(
76 | {"dimension": args["dimension"], "pattern": args["pattern"]}
77 | )
78 | elif type_ == "bound":
79 | self.filter["filter"].update(
80 | {
81 | "dimension": args["dimension"],
82 | "lower": args["lower"],
83 | "lowerStrict": args["lowerStrict"],
84 | "upper": args["upper"],
85 | "upperStrict": args["upperStrict"],
86 | "alphaNumeric": args["alphaNumeric"],
87 | "ordering": ordering,
88 | }
89 | )
90 | elif type_ == "columnComparison":
91 | self.filter["filter"].update({"dimensions": args["dimensions"]})
92 | elif type_ == "interval":
93 | self.filter["filter"].update(
94 | {"dimension": args["dimension"], "intervals": args["intervals"]}
95 | )
96 | elif type_ == "extraction":
97 | self.filter["filter"].update(
98 | {"dimension": args["dimension"], "value": args["value"]}
99 | )
100 | elif type_ == "search":
101 | self.filter["filter"].update(
102 | {
103 | "dimension": args["dimension"],
104 | "query": {
105 | "type": "contains",
106 | "value": args["value"],
107 | "caseSensitive": args.get("caseSensitive", "false"),
108 | },
109 | }
110 | )
111 | elif type_ == "like":
112 | self.filter["filter"].update(
113 | {"dimension": args["dimension"], "pattern": args["pattern"]}
114 | )
115 | elif type_ == "spatial":
116 | self.filter["filter"].update(
117 | {"dimension": args["dimension"], "bound": args["bound"]}
118 | )
119 | else:
120 | raise NotImplementedError("Filter type: {0} does not exist".format(type_))
121 |
122 | def show(self):
123 | print(json.dumps(self.filter, indent=4))
124 |
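25 |     # Filters compose via the bitwise operators below, e.g. (a sketch):
25 |     #   (Dimension("a") == "x") & (Dimension("b") == "y")  -> "and" filter
25 |     #   (Dimension("a") == "x") | (Dimension("b") == "y")  -> "or" filter
25 |     #   ~(Dimension("a") == "x")                            -> "not" filter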
125 | def __and__(self, x):
126 | if self.filter["filter"]["type"] == "and":
127 | # if `self` is already `and`, don't create a new filter
128 | # but just append `x` to the filter fields.
129 | self.filter["filter"]["fields"].append(x)
130 | return self
131 | return Filter(type="and", fields=[self, x])
132 |
133 | def __or__(self, x):
134 | if self.filter["filter"]["type"] == "or":
135 | # if `self` is already `or`, don't create a new filter
136 | # but just append `x` to the filter fields.
137 | self.filter["filter"]["fields"].append(x)
138 | return self
139 | return Filter(type="or", fields=[self, x])
140 |
141 | def __invert__(self):
142 | return Filter(type="not", field=self)
143 |
144 | @staticmethod
145 | def build_filter(filter_obj):
146 | filter = filter_obj.filter["filter"]
147 | if filter["type"] in ["and", "or"]:
148 | filter = filter.copy() # make a copy so we don't overwrite `fields`
149 | filter["fields"] = [Filter.build_filter(f) for f in filter["fields"]]
150 | elif filter["type"] in ["not"]:
151 | filter = filter.copy()
152 | filter["field"] = Filter.build_filter(filter["field"])
153 | elif filter["type"] in ["columnComparison"]:
154 | filter = filter.copy()
155 | filter["dimensions"] = [build_dimension(d) for d in filter["dimensions"]]
156 |
157 | if filter_obj.extraction_function is not None:
158 | if filter is filter_obj.filter["filter"]: # copy if not yet copied
159 | filter = filter.copy()
160 | filter["extractionFn"] = filter_obj.extraction_function.build()
161 |
162 | return filter
163 |
164 |
165 | class Dimension:
166 | def __init__(self, dim):
167 | self.dimension = dim
168 |
169 | def __eq__(self, other):
170 | return Filter(dimension=self.dimension, value=other)
171 |
172 | def __ne__(self, other):
173 | return ~Filter(dimension=self.dimension, value=other)
174 |
175 |
176 | class JavaScript:
177 | def __init__(self, dim):
178 | self.dimension = dim
179 |
180 | def __eq__(self, func):
181 | return Filter(type="javascript", dimension=self.dimension, function=func)
182 |
183 |
184 | class Bound(Filter):
185 | """
186 | Bound filter can be used to filter by comparing dimension values to an
187 | upper value or/and a lower value.
188 |
189 | :ivar str dimension: Dimension to filter on.
190 | :ivar str lower: Lower bound.
191 | :ivar str upper: Upper bound.
192 |     :ivar bool lowerStrict: Strict comparison on the lower bound. Initial value: False
193 |     :ivar bool upperStrict: Strict comparison on the upper bound. Initial value: False
194 |     :ivar bool alphaNumeric: Use alphanumeric comparison. Initial value: False
195 |         NOTE: Kept for backwards compatibility - use "ordering" instead.
196 | :ivar str ordering: Sorting Order. Initial value: lexicographic
197 | Specifies the sorting order to use when comparing values against the bound.
198 | Can be one of the following values: "lexicographic", "alphanumeric", "numeric",
199 | "strlen", "version". See Sorting Orders
200 | https://druid.apache.org/docs/latest/querying/filters.html#bound-filter
201 | for more details.
202 | :ivar ExtractionFunction extraction_function: extraction function to use,
203 | if not None
204 | """
205 |
206 | def __init__(
207 | self,
208 | dimension,
209 | lower=None,
210 | upper=None,
211 | lowerStrict=False,
212 | upperStrict=False,
213 | alphaNumeric=False,
214 | ordering="lexicographic",
215 | extraction_function=None,
216 | ):
217 | if not lower and not upper:
218 | raise ValueError("Must include either lower or upper or both")
219 | Filter.__init__(
220 | self,
221 | type="bound",
222 | dimension=dimension,
223 | lower=lower,
224 | upper=upper,
225 | lowerStrict=lowerStrict,
226 | upperStrict=upperStrict,
227 | alphaNumeric=alphaNumeric,
228 | ordering=ordering,
229 | extraction_function=extraction_function,
230 | )
231 |
232 |
233 | class Interval(Filter):
234 | """
235 | Interval filter can be used to filter by comparing dimension(__time)
236 | values to a list of intervals.
237 |
238 | :ivar str dimension: Dimension to filter on.
239 | :ivar list intervals: List of ISO-8601 intervals of data to filter out.
240 | :ivar ExtractionFunction extraction_function: extraction function to use,
241 | if not None
242 | """
243 |
244 | def __init__(self, dimension, intervals, extraction_function=None):
245 |
246 | Filter.__init__(
247 | self,
248 | type="interval",
249 | dimension=dimension,
250 | intervals=intervals,
251 | extraction_function=extraction_function,
252 | )
253 |
254 |
255 | class Spatial(Filter):
256 | """
257 | Spatial filter can be used to filter by spatial bounds
258 |
259 | :ivar str dimension: Dimension to filter on.
260 | :ivar str bound_type: Spatial bound type: ['rectangle','radius','polygon'].
261 |     :param `**kwargs`: additional arguments required for the selected bound type:
262 |         'rectangle': 'minCoords' and 'maxCoords'
263 |         'radius': 'coords' and 'radius'
264 |         'polygon': 'abscissa' and 'ordinate'
265 | """
266 |
267 | def __init__(self, dimension, bound_type, **args):
268 |
269 | _bound = {"type": bound_type}
270 |
271 | if bound_type == "rectangle":
272 | if not args["minCoords"] or not args["maxCoords"]:
273 | raise ValueError(
274 | "Rectangle bound must include both minCoords and maxCoords"
275 | )
276 | _bound["minCoords"] = args["minCoords"]
277 | _bound["maxCoords"] = args["maxCoords"]
278 | elif bound_type == "radius":
279 | if not args["coords"] or not args["radius"]:
280 | raise ValueError("Radius bound must include both coords and radius")
281 | _bound["coords"] = args["coords"]
282 | _bound["radius"] = args["radius"]
283 | elif bound_type == "polygon":
284 | if not args["abscissa"] or not args["ordinate"]:
285 | raise ValueError(
286 | "Polygon bound must include both abscissa and ordinate"
287 | )
288 | _bound["abscissa"] = args["abscissa"]
289 | _bound["ordinate"] = args["ordinate"]
290 | else:
291 |             raise ValueError("Unsupported Spatial Bound type: {0}".format(bound_type))
292 |
293 | Filter.__init__(self, type="spatial", dimension=dimension, bound=_bound)
294 |
--------------------------------------------------------------------------------
/tests/test_query.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | #
3 | # Copyright 2016 Metamarkets Group Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | import csv
19 | import os
20 |
21 | import pandas
22 | import pytest
23 | from pandas.testing import assert_frame_equal
24 |
25 | from pydruid.query import Query, QueryBuilder
26 | from pydruid.utils import aggregators, filters, having, postaggregator
27 |
28 |
29 | def create_query_with_results():
30 | query = Query({}, "timeseries")
31 | query.result = [
32 | {
33 | "result": {"value1": 1, "value2": "㬓"},
34 | "timestamp": "2015-01-01T00:00:00.000-05:00",
35 | },
36 | {
37 | "result": {"value1": 2, "value2": "㬓"},
38 | "timestamp": "2015-01-02T00:00:00.000-05:00",
39 | },
40 | ]
41 | return query
42 |
43 |
44 | EXPECTED_RESULTS_PANDAS = [
45 | {"timestamp": "2015-01-01T00:00:00.000-05:00", "value1": 1, "value2": "㬓"},
46 | {"timestamp": "2015-01-02T00:00:00.000-05:00", "value1": 2, "value2": "㬓"},
47 | ]
48 |
49 |
50 | def expected_results_csv_reader():
51 | # csv.DictReader does not perform promotion to int64
52 | expected_results = []
53 | for element in EXPECTED_RESULTS_PANDAS:
54 | modified_elem = element.copy()
55 | modified_elem.update({"value1": str(modified_elem["value1"])})
56 | expected_results.append(modified_elem)
57 | return expected_results
58 |
59 |
60 | class TestQueryBuilder:
61 | def test_build_query(self):
62 | # given
63 | expected_query_dict = {
64 | "queryType": None,
65 | "dataSource": "things",
66 | "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}],
67 | "postAggregations": [
68 | {
69 | "fields": [
70 | {"fieldName": "sum", "type": "fieldAccess"},
71 | {"fieldName": "count", "type": "fieldAccess"},
72 | ],
73 | "fn": "/",
74 | "name": "avg",
75 | "type": "arithmetic",
76 | }
77 | ],
78 | "pagingSpec": {"pagingIdentifies": {}, "threshold": 1},
79 | "filter": {"dimension": "one", "type": "selector", "value": 1},
80 | "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
81 | "new_key": "value",
82 | }
83 |
84 | builder = QueryBuilder()
85 |
86 | # when
87 | query = builder.build_query(
88 | None,
89 | {
90 | "datasource": "things",
91 | "aggregations": {"count": aggregators.count("thing")},
92 | "post_aggregations": {
93 | "avg": (postaggregator.Field("sum") / postaggregator.Field("count"))
94 | },
95 | "paging_spec": {"pagingIdentifies": {}, "threshold": 1},
96 | "filter": filters.Dimension("one") == 1,
97 | "having": having.Aggregation("sum") > 1,
98 | "new_key": "value",
99 | },
100 | )
101 |
102 | # then
103 | assert query.query_dict == expected_query_dict
104 |
105 | def test_build_query_none_type(self):
106 | # given
107 | expected_query_dict = {
108 | "queryType": None,
109 | "dataSource": "things",
110 | "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}],
111 | "filter": {"dimension": "one", "type": "selector", "value": 1},
112 | "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
113 | "dimension": "dim1",
114 | }
115 |
116 | builder = QueryBuilder()
117 |
118 | # when
119 | builder_dict = {
120 | "datasource": "things",
121 | "aggregations": {"count": aggregators.count("thing")},
122 | "filter": filters.Dimension("one") == 1,
123 | "having": having.Aggregation("sum") > 1,
124 | "dimension": "dim1",
125 | }
126 | query = builder.build_query(None, builder_dict)
127 |
128 | # then
129 | assert query.query_dict == expected_query_dict
130 |
131 | # you should be able to pass `None` to dimension/having/filter
132 | for v in ["dimension", "having", "filter"]:
133 | expected_query_dict[v] = None
134 | builder_dict[v] = None
135 |
136 | query = builder.build_query(None, builder_dict)
137 |
138 | assert query.query_dict == expected_query_dict
139 |
140 | def test_validate_query(self):
141 | # given
142 | builder = QueryBuilder()
143 |
144 | # when
145 | builder.validate_query(None, ["validkey"], {"validkey": "value"})
146 |
147 | # then
148 | pytest.raises(
149 | ValueError,
150 | builder.validate_query,
151 | *[None, ["validkey"], {"invalidkey": "value"}]
152 | )
153 |
154 | def test_union_datasource(self):
155 | # Given
156 | expected_query_dict = {"queryType": None, "dataSource": "things"}
157 | builder = QueryBuilder()
158 | # when
159 | builder_dict = {"datasource": "things"}
160 | query = builder.build_query(None, builder_dict)
161 | # then
162 | assert query.query_dict == expected_query_dict
163 |
164 | # Given
165 | expected_query_dict = {
166 | "queryType": None,
167 | "dataSource": {
168 | "type": "union",
169 | "dataSources": ["things", "others", "more"],
170 | },
171 | }
172 | builder = QueryBuilder()
173 | # when
174 | builder_dict = {"datasource": ["things", "others", "more"]}
175 | query = builder.build_query(None, builder_dict)
176 | # then
177 | assert query.query_dict == expected_query_dict
178 |
179 | # Given check that it rejects non-string items
180 | builder = QueryBuilder()
181 | builder_dict = {"datasource": ["things", 123]}
182 | with pytest.raises(ValueError):
183 | query = builder.build_query(None, builder_dict)
184 |
185 | def test_build_subquery(self):
186 | # given
187 | expected_query_dict = {
188 | "query": {
189 | "queryType": "groupBy",
190 | "dataSource": "things",
191 | "aggregations": [
192 | {"fieldName": "thing", "name": "count", "type": "count"}
193 | ],
194 | "postAggregations": [
195 | {
196 | "fields": [
197 | {"fieldName": "sum", "type": "fieldAccess"},
198 | {"fieldName": "count", "type": "fieldAccess"},
199 | ],
200 | "fn": "/",
201 | "name": "avg",
202 | "type": "arithmetic",
203 | }
204 | ],
205 | "filter": {"dimension": "one", "type": "selector", "value": 1},
206 | "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
207 | },
208 | "type": "query",
209 | }
210 |
211 | builder = QueryBuilder()
212 |
213 | # when
214 | subquery_dict = builder.subquery(
215 | {
216 | "datasource": "things",
217 | "aggregations": {"count": aggregators.count("thing")},
218 | "post_aggregations": {
219 | "avg": (postaggregator.Field("sum") / postaggregator.Field("count"))
220 | },
221 | "filter": filters.Dimension("one") == 1,
222 | "having": having.Aggregation("sum") > 1,
223 | }
224 | )
225 |
226 | # then
227 | assert subquery_dict == expected_query_dict
228 |
229 | expected_nested_query_dict = {
230 | "query": {
231 | "queryType": "groupBy",
232 | "dataSource": {
233 | "query": {
234 | "queryType": "groupBy",
235 | "dataSource": "things",
236 | "aggregations": [
237 | {"fieldName": "thing", "name": "count", "type": "count"}
238 | ],
239 | "postAggregations": [
240 | {
241 | "fields": [
242 | {"fieldName": "sum", "type": "fieldAccess"},
243 | {"fieldName": "count", "type": "fieldAccess"},
244 | ],
245 | "fn": "/",
246 | "name": "avg",
247 | "type": "arithmetic",
248 | }
249 | ],
250 | "filter": {"dimension": "one", "type": "selector", "value": 1},
251 | "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
252 | },
253 | "type": "query",
254 | },
255 | "aggregations": [
256 | {"fieldName": "thing", "name": "count", "type": "count"}
257 | ],
258 | "postAggregations": [
259 | {
260 | "fields": [
261 | {"fieldName": "sum", "type": "fieldAccess"},
262 | {"fieldName": "count", "type": "fieldAccess"},
263 | ],
264 | "fn": "/",
265 | "name": "avg",
266 | "type": "arithmetic",
267 | }
268 | ],
269 | "filter": {"dimension": "one", "type": "selector", "value": 1},
270 | "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
271 | },
272 | "type": "query",
273 | }
274 |
275 | nested_subquery_dict = builder.subquery(
276 | {
277 | "datasource": builder.subquery(
278 | {
279 | "datasource": "things",
280 | "aggregations": {"count": aggregators.count("thing")},
281 | "post_aggregations": {
282 | "avg": (postaggregator.Field("sum") / postaggregator.Field("count"))
283 | },
284 | "filter": filters.Dimension("one") == 1,
285 | "having": having.Aggregation("sum") > 1,
286 | }
287 | ),
288 | "aggregations": {"count": aggregators.count("thing")},
289 | "post_aggregations": {
290 | "avg": (postaggregator.Field("sum") / postaggregator.Field("count"))
291 | },
292 | "filter": filters.Dimension("one") == 1,
293 | "having": having.Aggregation("sum") > 1,
294 | }
295 | )
296 |
297 | assert nested_subquery_dict == expected_nested_query_dict
298 |
299 | class TestQuery:
300 | def test_export_tsv(self, tmpdir):
301 | query = create_query_with_results()
302 | file_path = tmpdir.join("out.tsv")
303 | query.export_tsv(str(file_path))
304 |
305 | with open(str(file_path)) as tsv_file:
306 | reader = csv.DictReader(tsv_file, delimiter="\t")
307 | actual = [line for line in reader]
308 | assert actual == expected_results_csv_reader()
309 |
310 | def test_export_pandas(self):
311 | query = create_query_with_results()
312 | df = query.export_pandas()
313 | expected_df = pandas.DataFrame(EXPECTED_RESULTS_PANDAS)
314 | assert_frame_equal(df, expected_df, check_like=True)
315 |
316 | query = Query({}, "timeseries")
317 | df = query.export_pandas()
318 | assert_frame_equal(df, pandas.DataFrame())
319 |
320 | def test_query_acts_as_a_wrapper_for_raw_result(self):
321 | # given
322 | query = create_query_with_results()
323 |
324 | # then
325 | assert len(query) == 2
326 | assert isinstance(query[0], dict)
327 | assert isinstance(query[1], dict)
328 |
--------------------------------------------------------------------------------
/tests/utils/test_dimensions.py:
--------------------------------------------------------------------------------
1 | from pydruid.utils.dimensions import (
2 | build_dimension,
3 | DimensionSpec,
4 | JavascriptExtraction,
5 | ListFilteredSpec,
6 | MapLookupExtraction,
7 | NamespaceLookupExtraction,
8 | PartialExtraction,
9 | RegexExtraction,
10 | RegexFilteredSpec,
11 | RegisteredLookupExtraction,
12 | TimeFormatExtraction,
13 | )
14 |
15 |
16 | class TestDimensionSpec(object):
17 | def test_default(self):
18 | dim_spec = DimensionSpec("dim", "out")
19 | actual = dim_spec.build()
20 | expected = {"type": "default", "dimension": "dim", "outputName": "out"}
21 |
22 | assert actual == expected
23 |
24 | def test_extraction_functions(self):
25 | js_func = "function(x) {return x};"
26 | ext_fns = [
27 | (RegexExtraction(r"\w+"), {"type": "regex", "expr": "\\w+"}),
28 | (PartialExtraction(r"\w+"), {"type": "partial", "expr": "\\w+"}),
29 | (
30 | JavascriptExtraction(js_func),
31 | {"type": "javascript", "function": js_func, "injective": False},
32 | ),
33 | (
34 | MapLookupExtraction(TestMapLookupExtraction.mapping),
35 | {
36 | "type": "lookup",
37 | "lookup": {"type": "map", "map": TestMapLookupExtraction.mapping},
38 | "retainMissingValue": False,
39 | "replaceMissingValueWith": None,
40 | "injective": False,
41 | },
42 | ),
43 | ]
44 |
45 | for ext_fn, expected_ext_fn in ext_fns:
46 | dim_spec = DimensionSpec("dim", "out", extraction_function=ext_fn)
47 | actual = dim_spec.build()
48 | expected = {
49 | "type": "extraction",
50 | "dimension": "dim",
51 | "outputName": "out",
52 | "extractionFn": expected_ext_fn,
53 | }
54 |
55 | assert actual == expected
56 |
57 | def test_filter_specs(self):
58 | delegate_spec = DimensionSpec("dim", "out").build()
59 | filter_specs = [
60 | (
61 | ListFilteredSpec(["val1", "val2"]),
62 | {
63 | "type": "listFiltered",
64 | "delegate": delegate_spec,
65 | "values": ["val1", "val2"],
66 | },
67 | ),
68 | (
69 | ListFilteredSpec(["val1", "val2"], is_whitelist=False),
70 | {
71 | "type": "listFiltered",
72 | "delegate": delegate_spec,
73 | "values": ["val1", "val2"],
74 | "isWhitelist": False,
75 | },
76 | ),
77 | (
78 | RegexFilteredSpec(r"\w+"),
79 | {"type": "regexFiltered", "delegate": delegate_spec, "pattern": "\\w+"},
80 | ),
81 | ]
82 |
83 | for filter_spec, expected_dim_spec in filter_specs:
84 | dim_spec = DimensionSpec("dim", "out", filter_spec=filter_spec)
85 | actual = dim_spec.build()
86 |
87 | assert actual == expected_dim_spec
88 |
89 | def test_build_dimension(self):
90 | assert build_dimension("raw_dim") == "raw_dim"
91 |
92 | dim_spec = DimensionSpec("dim", "out")
93 | assert build_dimension(dim_spec) == dim_spec.build()
94 |
95 |
96 | class TestListFilteredSpec(object):
97 | def test_list_filtered_spec(self):
98 | dim_spec = DimensionSpec("dim", "out").build()
99 | list_filtered_spec = ListFilteredSpec(["val1", "val2"])
100 | actual = list_filtered_spec.build(dim_spec)
101 | expected_dim_spec = {"type": "default", "dimension": "dim", "outputName": "out"}
102 | expected = {
103 | "type": "listFiltered",
104 | "delegate": expected_dim_spec,
105 | "values": ["val1", "val2"],
106 | }
107 |
108 | assert actual == expected
109 |
110 | def test_list_filtered_spec_whitelist(self):
111 | dim_spec = DimensionSpec("dim", "out").build()
112 | list_filtered_spec = ListFilteredSpec(["val1", "val2"], is_whitelist=False)
113 | actual = list_filtered_spec.build(dim_spec)
114 | expected_dim_spec = {"type": "default", "dimension": "dim", "outputName": "out"}
115 | expected = {
116 | "type": "listFiltered",
117 | "delegate": expected_dim_spec,
118 | "values": ["val1", "val2"],
119 | "isWhitelist": False,
120 | }
121 |
122 | assert actual == expected
123 |
124 |
125 | class TestRegexFilteredSpec(object):
126 | def test_regex_filtered_spec(self):
127 | dim_spec = DimensionSpec("dim", "out").build()
128 | regex_filtered_spec = RegexFilteredSpec(r"\w+")
129 | actual = regex_filtered_spec.build(dim_spec)
130 | expected_dim_spec = {"type": "default", "dimension": "dim", "outputName": "out"}
131 | expected = {
132 | "type": "regexFiltered",
133 | "delegate": expected_dim_spec,
134 | "pattern": "\\w+",
135 | }
136 |
137 | assert actual == expected
138 |
139 |
140 | class TestRegexExtraction(object):
141 | def test_regex(self):
142 | ext_fn = RegexExtraction(r"\w+")
143 | actual = ext_fn.build()
144 | expected = {"type": "regex", "expr": "\\w+"}
145 |
146 | assert actual == expected
147 |
148 |
149 | class TestPartialExtraction(object):
150 | def test_regex(self):
151 | ext_fn = PartialExtraction(r"\w+")
152 | actual = ext_fn.build()
153 | expected = {"type": "partial", "expr": "\\w+"}
154 |
155 | assert actual == expected
156 |
157 |
158 | class TestJavascriptExtraction(object):
159 | def test_js_injective(self):
160 | js_func = "function(x) {return x};"
161 | ext_fn = JavascriptExtraction(js_func, injective=True)
162 | actual = ext_fn.build()
163 | expected = {"type": "javascript", "function": js_func, "injective": True}
164 |
165 | assert actual == expected
166 |
167 | def test_js_not_injective(self):
168 | js_func = "function(x) {return x};"
169 | ext_fn = JavascriptExtraction(js_func)
170 | actual = ext_fn.build()
171 | expected = {"type": "javascript", "function": js_func, "injective": False}
172 |
173 | assert actual == expected
174 |
175 |
176 | class TestTimeFormatExtraction(object):
177 | def test_time_format_all_set(self):
178 | ext_fn = TimeFormatExtraction("EEEE", "en-US", "Europe/Berlin")
179 | actual = ext_fn.build()
180 | expected = {
181 | "type": "timeFormat",
182 | "format": "EEEE",
183 | "locale": "en-US",
184 | "timeZone": "Europe/Berlin",
185 | }
186 |
187 | assert actual == expected
188 |
189 | def test_time_format_no_timezone(self):
190 | ext_fn = TimeFormatExtraction("EEEE", "en-US")
191 | actual = ext_fn.build()
192 | expected = {"type": "timeFormat", "format": "EEEE", "locale": "en-US"}
193 |
194 | assert actual == expected
195 |
196 | def test_time_format_only_format(self):
197 | ext_fn = TimeFormatExtraction("EEEE")
198 | actual = ext_fn.build()
199 | expected = {"type": "timeFormat", "format": "EEEE"}
200 |
201 | assert actual == expected
202 |
203 |
204 | class TestMapLookupExtraction(object):
205 |
206 | mapping = {"foo1": "bar1", "foo2": "bar2"}
207 |
208 | def test_map_default(self):
209 | ext_fn = MapLookupExtraction(self.mapping)
210 | actual = ext_fn.build()
211 | expected = {
212 | "type": "lookup",
213 | "lookup": {"type": "map", "map": self.mapping},
214 | "retainMissingValue": False,
215 | "replaceMissingValueWith": None,
216 | "injective": False,
217 | }
218 |
219 | assert actual == expected
220 |
221 | def test_map_retain_missing(self):
222 | ext_fn = MapLookupExtraction(self.mapping, retain_missing_values=True)
223 | actual = ext_fn.build()
224 | expected = {
225 | "type": "lookup",
226 | "lookup": {"type": "map", "map": self.mapping},
227 | "retainMissingValue": True,
228 | "replaceMissingValueWith": None,
229 | "injective": False,
230 | }
231 |
232 | assert actual == expected
233 |
234 | def test_map_replace_missing(self):
235 | ext_fn = MapLookupExtraction(self.mapping, replace_missing_values="replacer")
236 | actual = ext_fn.build()
237 | expected = {
238 | "type": "lookup",
239 | "lookup": {"type": "map", "map": self.mapping},
240 | "retainMissingValue": False,
241 | "replaceMissingValueWith": "replacer",
242 | "injective": False,
243 | }
244 |
245 | assert actual == expected
246 |
247 | def test_map_injective(self):
248 | ext_fn = MapLookupExtraction(self.mapping, injective=True)
249 | actual = ext_fn.build()
250 | expected = {
251 | "type": "lookup",
252 | "lookup": {"type": "map", "map": self.mapping},
253 | "retainMissingValue": False,
254 | "replaceMissingValueWith": None,
255 | "injective": True,
256 | }
257 |
258 | assert actual == expected
259 |
260 |
261 | class TestNamespaceLookupExtraction(object):
262 | def test_map_default(self):
263 | ext_fn = NamespaceLookupExtraction("foo_namespace")
264 | actual = ext_fn.build()
265 | expected = {
266 | "type": "lookup",
267 | "lookup": {"type": "namespace", "namespace": "foo_namespace"},
268 | "retainMissingValue": False,
269 | "replaceMissingValueWith": None,
270 | "injective": False,
271 | }
272 |
273 | assert actual == expected
274 |
275 | def test_map_retain_missing(self):
276 | ext_fn = NamespaceLookupExtraction("foo_namespace", retain_missing_values=True)
277 | actual = ext_fn.build()
278 | expected = {
279 | "type": "lookup",
280 | "lookup": {"type": "namespace", "namespace": "foo_namespace"},
281 | "retainMissingValue": True,
282 | "replaceMissingValueWith": None,
283 | "injective": False,
284 | }
285 |
286 | assert actual == expected
287 |
288 | def test_map_replace_missing(self):
289 | ext_fn = NamespaceLookupExtraction(
290 | "foo_namespace", replace_missing_values="replacer"
291 | )
292 | actual = ext_fn.build()
293 | expected = {
294 | "type": "lookup",
295 | "lookup": {"type": "namespace", "namespace": "foo_namespace"},
296 | "retainMissingValue": False,
297 | "replaceMissingValueWith": "replacer",
298 | "injective": False,
299 | }
300 |
301 | assert actual == expected
302 |
303 | def test_map_injective(self):
304 | ext_fn = NamespaceLookupExtraction("foo_namespace", injective=True)
305 | actual = ext_fn.build()
306 | expected = {
307 | "type": "lookup",
308 | "lookup": {"type": "namespace", "namespace": "foo_namespace"},
309 | "retainMissingValue": False,
310 | "replaceMissingValueWith": None,
311 | "injective": True,
312 | }
313 |
314 | assert actual == expected
315 |
316 |
317 | class TestRegisteredLookupExtraction(object):
318 | def test_map_default(self):
319 | ext_fn = RegisteredLookupExtraction("foo_namespace")
320 | actual = ext_fn.build()
321 | expected = {
322 | "type": "registeredLookup",
323 | "lookup": "foo_namespace",
324 | "retainMissingValue": False,
325 | "replaceMissingValueWith": None,
326 | "injective": False,
327 | }
328 |
329 | assert actual == expected
330 |
331 | def test_map_retain_missing(self):
332 | ext_fn = RegisteredLookupExtraction("foo_namespace", retain_missing_values=True)
333 | actual = ext_fn.build()
334 | expected = {
335 | "type": "registeredLookup",
336 | "lookup": "foo_namespace",
337 | "retainMissingValue": True,
338 | "replaceMissingValueWith": None,
339 | "injective": False,
340 | }
341 |
342 | assert actual == expected
343 |
344 | def test_map_replace_missing(self):
345 | ext_fn = RegisteredLookupExtraction(
346 | "foo_namespace", replace_missing_values="replacer"
347 | )
348 | actual = ext_fn.build()
349 | expected = {
350 | "type": "registeredLookup",
351 | "lookup": "foo_namespace",
352 | "retainMissingValue": False,
353 | "replaceMissingValueWith": "replacer",
354 | "injective": False,
355 | }
356 |
357 | assert actual == expected
358 |
359 | def test_map_injective(self):
360 | ext_fn = RegisteredLookupExtraction("foo_namespace", injective=True)
361 | actual = ext_fn.build()
362 | expected = {
363 | "type": "registeredLookup",
364 | "lookup": "foo_namespace",
365 | "retainMissingValue": False,
366 | "replaceMissingValueWith": None,
367 | "injective": True,
368 | }
369 |
370 | assert actual == expected
371 |
--------------------------------------------------------------------------------
/tests/db/test_cursor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 | from collections import namedtuple
5 | from io import BytesIO
6 | from unittest.mock import ANY, patch
7 |
8 | import requests
9 | from requests.models import Response
10 | from requests.auth import HTTPBasicAuth
11 |
12 | from pydruid.db.api import BearerAuth, apply_parameters, Cursor, connect
13 |
14 |
15 | class CursorTestSuite(unittest.TestCase):
16 | @patch("requests.post")
17 | def test_execute(self, requests_post_mock):
18 | response = Response()
19 | response.status_code = 200
20 | response.raw = BytesIO(
21 | b'[{"name": "alice"}, {"name": "bob"}, {"name": "charlie"}]'
22 | )
23 | requests_post_mock.return_value = response
24 | Row = namedtuple("Row", ["name"])
25 |
26 | cursor = Cursor("http://example.com/")
27 | cursor.execute("SELECT * FROM table")
28 | result = cursor.fetchall()
29 | expected = [Row(name="alice"), Row(name="bob"), Row(name="charlie")]
30 | self.assertEqual(result, expected)
31 |
32 | @patch("requests.post")
33 | def test_execute_empty_result(self, requests_post_mock):
34 | response = Response()
35 | response.status_code = 200
36 | response.raw = BytesIO(b"[]")
37 | requests_post_mock.return_value = response
38 |
39 | cursor = Cursor("http://example.com/")
40 | cursor.execute("SELECT * FROM table")
41 | result = cursor.fetchall()
42 | expected = []
43 | self.assertEqual(result, expected)
44 |
45 | @patch("requests.post")
46 | def test_context(self, requests_post_mock):
47 | response = Response()
48 | response.status_code = 200
49 | response.raw = BytesIO(b"[]")
50 | requests_post_mock.return_value = response
51 |
52 | url = "http://example.com/"
53 | query = "SELECT * FROM table"
54 | context = {"source": "unittest"}
55 |
56 | cursor = Cursor(url, user=None, password=None, context=context)
57 | cursor.execute(query)
58 |
59 | requests_post_mock.assert_called_with(
60 | "http://example.com/",
61 | auth=None,
62 | stream=True,
63 | headers={"Content-Type": "application/json"},
64 | json={"query": query, "context": context, "header": False},
65 | verify=True,
66 | cert=None,
67 | proxies=None,
68 | )
69 |
70 | @patch("requests.post")
71 | def test_header_false(self, requests_post_mock):
72 | response = Response()
73 | response.status_code = 200
74 | response.raw = BytesIO(b'[{"name": "alice"}]')
75 | requests_post_mock.return_value = response
76 | Row = namedtuple("Row", ["name"])
77 |
78 | url = "http://example.com/"
79 | query = "SELECT * FROM table"
80 |
81 | cursor = Cursor(url, header=False)
82 | cursor.execute(query)
83 | result = cursor.fetchall()
84 | self.assertEqual(result, [Row(name="alice")])
85 |
86 | self.assertEqual(
87 | cursor.description, [("name", 1, None, None, None, None, True)]
88 | )
89 |
90 | @patch("requests.post")
91 | def test_header_true(self, requests_post_mock):
92 | response = Response()
93 | response.status_code = 200
94 | response.raw = BytesIO(b'[{"name": null}, {"name": "alice"}]')
95 | requests_post_mock.return_value = response
96 | Row = namedtuple("Row", ["name"])
97 |
98 | url = "http://example.com/"
99 | query = "SELECT * FROM table"
100 |
101 | cursor = Cursor(url, header=True)
102 | cursor.execute(query)
103 | result = cursor.fetchall()
104 | self.assertEqual(result, [Row(name="alice")])
105 | self.assertEqual(cursor.description, [("name", None)])
106 |
107 | @patch("requests.post")
108 | def test_names_with_underscores(self, requests_post_mock):
109 | response = Response()
110 | response.status_code = 200
111 | response.raw = BytesIO(b'[{"_name": null}, {"_name": "alice"}]')
112 | requests_post_mock.return_value = response
113 | Row = namedtuple("Row", ["_name"], rename=True)
114 |
115 | url = "http://example.com/"
116 | query = "SELECT * FROM table"
117 |
118 | cursor = Cursor(url, header=True)
119 | cursor.execute(query)
120 | result = cursor.fetchall()
121 | self.assertEqual(result, [Row(_0="alice")])
122 | self.assertEqual(cursor.description, [("_name", None)])
123 |
124 | def test_apply_parameters(self):
125 | self.assertEqual(
126 | apply_parameters('SELECT 100 AS "100%"', None), 'SELECT 100 AS "100%"'
127 | )
128 |
129 | self.assertEqual(
130 | apply_parameters('SELECT 100 AS "100%"', {}), 'SELECT 100 AS "100%"'
131 | )
132 |
133 | self.assertEqual(
134 | apply_parameters('SELECT %(key)s AS "100%%"', {"key": 100}),
135 | 'SELECT 100 AS "100%"',
136 | )
137 |
138 | self.assertEqual(apply_parameters("SELECT %(key)s", {"key": "*"}), "SELECT *")
139 |
140 | self.assertEqual(
141 | apply_parameters("SELECT %(key)s", {"key": "bar"}), "SELECT 'bar'"
142 | )
143 |
144 | self.assertEqual(
145 | apply_parameters("SELECT %(key)s", {"key": True}), "SELECT TRUE"
146 | )
147 |
148 | self.assertEqual(
149 | apply_parameters("SELECT %(key)s", {"key": False}), "SELECT FALSE"
150 | )
151 |
152 | # Generated by CodiumAI
153 | # When `user` is not None, `HTTPBasicAuth` is used for authentication.
154 | @patch("requests.post")
155 | def test_user_not_none_http_basic_auth(self, mock_post):
156 | from unittest.mock import patch
157 |
158 | response = Response()
159 | response.raw = BytesIO(b"[]")
160 | response.status_code = 200
161 | mock_post.return_value = response
162 |
163 | user = "test_user"
164 | password = "test_password"
165 | url = "http://example.com/"
166 | query = "SELECT * FROM table"
167 |
168 | cursor = Cursor(url, user=user, password=password)
169 | cursor.execute(query)
170 |
171 | mock_post.assert_called_with(
172 | url,
173 | stream=True,
174 | headers={"Content-Type": "application/json"},
175 | json={"query": query, "context": cursor.context, "header": cursor.header,},
176 | auth=requests.auth.HTTPBasicAuth(user, password),
177 | verify=cursor.ssl_verify_cert,
178 | cert=cursor.ssl_client_cert,
179 | proxies=cursor.proxies,
180 | )
181 |
182 | # When `user` is None and `jwt` is not None, `auth` is not None.
183 | @patch("requests.post")
184 | def test_user_none_jwt_not_none_auth_not_none(self, mock_post):
185 | response = Response()
186 | response.raw = BytesIO(b"[]")
187 | response.status_code = 200
188 | mock_post.return_value = response
189 |
190 | jwt = "test_jwt"
191 | url = "http://example.com/"
192 | query = "SELECT * FROM table"
193 |
194 | cursor = Cursor(url, jwt=jwt)
195 | cursor.execute(query)
196 |
197 | mock_post.assert_called_with(
198 | url,
199 | stream=True,
200 | headers={"Content-Type": "application/json"},
201 | json={"query": query, "context": cursor.context, "header": cursor.header,},
202 | auth=ANY,
203 | verify=cursor.ssl_verify_cert,
204 | cert=cursor.ssl_client_cert,
205 | proxies=cursor.proxies,
206 | )
207 |
208 | last_call = mock_post.call_args
209 | auth_arg = last_call.kwargs["auth"]
210 |
211 | self.assertIsInstance(auth_arg, BearerAuth)
212 | self.assertEqual(auth_arg.token, jwt)
213 |
214 | # Test that no authentication is used when both `user` and `jwt` are None.
215 | @patch("requests.post")
216 | def test_no_authentication_used(self, requests_post_mock):
217 | response = Response()
218 | response.status_code = 200
219 | response.raw = BytesIO(b'{"result": "success"}')
220 | requests_post_mock.return_value = response
221 |
222 | conn = connect(user=None, jwt=None)
223 | curs = conn.cursor()
224 |
225 |         # Perform a query with no credentials provided
226 | curs.execute("SELECT * FROM table")
227 |
228 | # Assert that no authentication was used
229 | requests_post_mock.assert_called_with(
230 | ANY,
231 | stream=True,
232 | headers=ANY,
233 | json=ANY,
234 | auth=None,
235 | verify=ANY,
236 | cert=ANY,
237 | proxies=ANY,
238 | )
239 |
240 |     # The test verifies that when `user` is not None and `jwt` is not None, `HTTPBasicAuth` is used for authentication.
241 | @patch("requests.post")
242 | def test_basic_auth_used_for_authentication_when_both_provided(
243 | self, requests_post_mock
244 | ):
245 | response = Response()
246 | response.status_code = 200
247 | response.raw = BytesIO(b'{"result": "success"}')
248 | requests_post_mock.return_value = response
249 |
250 | url = "http://example.com/"
251 | user = "test_user"
252 | password = "test_password"
253 | jwt = "test_jwt"
254 |
255 | cursor = Cursor(url, user=user, password=password, jwt=jwt)
256 | cursor.execute("SELECT * FROM table")
257 |
258 | requests_post_mock.assert_called_with(
259 | url,
260 | stream=True,
261 | headers={"Content-Type": "application/json"},
262 | json={"query": "SELECT * FROM table", "context": {}, "header": False},
263 | auth=ANY,
264 | verify=True,
265 | cert=None,
266 | proxies=None,
267 | )
268 |
269 | last_call = requests_post_mock.call_args
270 | auth_arg = last_call.kwargs["auth"]
271 |
272 | self.assertIsInstance(auth_arg, HTTPBasicAuth)
273 | self.assertEqual(auth_arg.username, user)
274 | self.assertEqual(auth_arg.password, password)
275 |
276 | # When `ssl_verify_cert` is False, SSL certificate is not verified.
277 | @patch("requests.post")
278 | def test_ssl_certificate_verification_disabled(self, requests_post_mock):
279 | response = Response()
280 | response.status_code = 200
281 | response.raw = BytesIO(b"[]")
282 | requests_post_mock.return_value = response
283 | user = "test_user"
284 | password = "test_password"
285 |
286 | url = "http://example.com/"
287 | query = "SELECT * FROM table"
288 |
289 | cursor = Cursor(
290 | url, user=user, password=password, header=True, ssl_verify_cert=False
291 | )
292 | cursor.execute(query)
293 |
294 | requests_post_mock.assert_called_with(
295 | url,
296 | stream=True,
297 | headers={"Content-Type": "application/json"},
298 | json={"query": "SELECT * FROM table", "context": {}, "header": True},
299 | auth=ANY,
300 | verify=False,
301 | cert=None,
302 | proxies=None,
303 | )
304 |
305 | # When `user` is not None and `password` is None, `HTTPBasicAuth` is used with empty password.
306 | @patch("requests.post")
307 | @patch("requests.auth.HTTPBasicAuth")
306 |     def test_http_basic_auth_with_empty_password(
309 | self, http_basic_auth_mock, requests_post_mock
310 | ):
311 | response = Response()
312 | response.status_code = 200
313 | response.raw = BytesIO(b'[{"_name": null}, {"_name": "alice"}]')
314 | requests_post_mock.return_value = response
315 |
316 | url = "http://example.com/"
317 | user = "user"
318 | password = None
319 | jwt = None
320 |
321 | conn = connect(user=user, password=password, jwt=jwt)
322 | cursor = conn.cursor()
323 | cursor.execute("SELECT * FROM table")
324 |
325 | http_basic_auth_mock.assert_called_with(user, None)
326 |
327 | requests_post_mock.assert_called_with(
328 | ANY,
329 | stream=True,
330 | headers={"Content-Type": "application/json"},
331 | json={"query": "SELECT * FROM table", "context": {}, "header": False},
332 | auth=http_basic_auth_mock.return_value,
333 | verify=True,
334 | cert=None,
335 | proxies=None,
336 | )
337 |
338 | # Test SSL client certificate authentication when `ssl_client_cert` is not None.
339 | @patch("requests.post")
340 | def test_ssl_client_cert_authentication_with_patch_imported(
341 | self, requests_post_mock
342 | ):
343 | response = Response()
344 | response.status_code = 200
345 | response.raw = BytesIO(b'[]')
346 | requests_post_mock.return_value = response
347 | Row = namedtuple("Row", ["_name"], rename=True)
348 |
349 | url = "http://example.com/"
350 | query = "SELECT * FROM table"
351 |
352 | cursor = Cursor(url, header=True, ssl_client_cert="path/to/cert")
353 | cursor.execute(query)
354 | requests_post_mock.assert_called_with(
355 | ANY,
356 | stream=True,
357 | headers={"Content-Type": "application/json"},
358 | json={"query": "SELECT * FROM table", "context": {}, "header": False},
359 | auth=ANY,
360 | verify=True,
361 | cert="path/to/cert",
362 | proxies=None,
363 | )
364 |
365 |
366 | if __name__ == "__main__":
367 | unittest.main()
368 |
--------------------------------------------------------------------------------
/pydruid/db/api.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import json
3 | from collections import namedtuple, OrderedDict
4 | from urllib import parse
5 |
6 | import requests
7 |
8 | from pydruid.db import exceptions
9 |
10 |
11 | class Type(object):
12 | STRING = 1
13 | NUMBER = 2
14 | BOOLEAN = 3
15 |
16 |
17 | class BearerAuth(requests.auth.AuthBase):
18 | def __init__(self, token) -> None:
19 | self.token = token
20 |
21 | def __call__(self, r):
22 | r.headers["Authorization"] = f"Bearer {self.token}"
23 | return r
24 |
25 | def connect(
26 | host="localhost",
27 | port=8082,
28 | path="/druid/v2/sql/",
29 | scheme="http",
30 | user=None,
31 | password=None,
32 | context=None,
33 | header=False,
34 | ssl_verify_cert=True,
35 | ssl_client_cert=None,
36 | proxies=None,
37 | jwt=None,
38 | ): # noqa: E125
39 | """
40 | Constructor for creating a connection to the database.
41 |
42 | >>> conn = connect('localhost', 8082)
43 | >>> curs = conn.cursor()
44 |
45 | """
46 | context = context or {}
47 |
48 | return Connection(
49 | host,
50 | port,
51 | path,
52 | scheme,
53 | user,
54 | password,
55 | context,
56 | header,
57 | ssl_verify_cert,
58 | ssl_client_cert,
59 | proxies,
60 | jwt,
61 | )
62 |
63 |
64 | def check_closed(f):
65 | """Decorator that checks if connection/cursor is closed."""
66 |
67 | def g(self, *args, **kwargs):
68 | if self.closed:
69 | raise exceptions.Error(
70 | "{klass} already closed".format(klass=self.__class__.__name__)
71 | )
72 | return f(self, *args, **kwargs)
73 |
74 | return g
75 |
76 |
77 | def check_result(f):
78 | """Decorator that checks if the cursor has results from `execute`."""
79 |
80 | def g(self, *args, **kwargs):
81 | if self._results is None:
82 | raise exceptions.Error("Called before `execute`")
83 | return f(self, *args, **kwargs)
84 |
85 | return g
86 |
87 |
88 | def get_description_from_row(row):
89 | """
90 | Return description from a single row.
91 |
92 | We only return the name, type (inferred from the data) and if the values
93 | can be NULL. String columns in Druid are NULLable. Numeric columns are NOT
94 | NULL.
95 | """
96 | return [
97 | (
98 | name, # name
99 | get_type(value), # type_code
100 | None, # [display_size]
101 | None, # [internal_size]
102 | None, # [precision]
103 | None, # [scale]
104 | get_type(value) == Type.STRING, # [null_ok]
105 | )
106 | for name, value in row.items()
107 | ]
108 |
109 |
110 | def get_type(value):
111 | """
112 | Infer type from value.
113 |
114 | Note that bool is a subclass of int so order of statements matter.
115 | """
116 |
117 | if isinstance(value, str) or value is None:
118 | return Type.STRING
119 | elif isinstance(value, bool):
120 | return Type.BOOLEAN
121 | elif isinstance(value, (int, float)):
122 | return Type.NUMBER
123 |
124 | raise exceptions.Error("Value of unknown type: {value}".format(value=value))
125 |
126 |
127 | class Connection(object):
128 | """Connection to a Druid database."""
129 |
130 | def __init__(
131 | self,
132 | host="localhost",
133 | port=8082,
134 | path="/druid/v2/sql/",
135 | scheme="http",
136 | user=None,
137 | password=None,
138 | context=None,
139 | header=False,
140 | ssl_verify_cert=True,
141 | ssl_client_cert=None,
142 | proxies=None,
143 | jwt=None,
144 | ):
145 | netloc = "{host}:{port}".format(host=host, port=port)
146 | self.url = parse.urlunparse((scheme, netloc, path, None, None, None))
147 | self.context = context or {}
148 | self.closed = False
149 | self.cursors = []
150 | self.header = header
151 | self.user = user
152 | self.password = password
153 | self.ssl_verify_cert = ssl_verify_cert
154 | self.ssl_client_cert = ssl_client_cert
155 | self.proxies = proxies
156 | self.jwt = jwt
157 |
158 | @check_closed
159 | def close(self):
160 | """Close the connection now."""
161 | self.closed = True
162 | for cursor in self.cursors:
163 | try:
164 | cursor.close()
165 | except exceptions.Error:
166 | pass # already closed
167 |
168 | @check_closed
169 | def commit(self):
170 | """
171 | Commit any pending transaction to the database.
172 |
173 | Not supported.
174 | """
175 | pass
176 |
177 | @check_closed
178 | def cursor(self):
179 | """Return a new Cursor Object using the connection."""
180 |
181 | cursor = Cursor(
182 | self.url,
183 | self.user,
184 | self.password,
185 | self.context,
186 | self.header,
187 | self.ssl_verify_cert,
188 | self.ssl_client_cert,
189 | self.proxies,
190 | self.jwt,
191 | )
192 |
193 | self.cursors.append(cursor)
194 |
195 | return cursor
196 |
197 | @check_closed
198 | def execute(self, operation, parameters=None):
199 | cursor = self.cursor()
200 | return cursor.execute(operation, parameters)
201 |
202 | def __enter__(self):
203 | return self.cursor()
204 |
205 | def __exit__(self, *exc):
206 | self.close()
207 |
208 |
209 | class Cursor(object):
210 | """Connection cursor."""
211 |
212 | def __init__(
213 | self,
214 | url,
215 | user=None,
216 | password=None,
217 | context=None,
218 | header=False,
219 | ssl_verify_cert=True,
220 | ssl_client_cert=None,
221 | proxies=None,
222 | jwt=None,
223 | ):
224 | self.url = url
225 | self.context = context or {}
226 | self.header = header
227 | self.user = user
228 | self.password = password
229 | self.ssl_verify_cert = ssl_verify_cert
230 | self.ssl_client_cert = ssl_client_cert
231 | self.proxies = proxies
232 | self.jwt = jwt
233 |
234 | # This read/write attribute specifies the number of rows to fetch at a
235 | # time with .fetchmany(). It defaults to 1 meaning to fetch a single
236 | # row at a time.
237 | self.arraysize = 1
238 |
239 | self.closed = False
240 |
241 | # this is updated only after a query
242 | self.description = None
243 |
244 |         # this is set to an iterator after a successful query
245 | self._results = None
246 |
247 | @property
248 | @check_result
249 | @check_closed
250 | def rowcount(self):
251 | # consume the iterator
252 | results = list(self._results)
253 | n = len(results)
254 | self._results = iter(results)
255 | return n
256 |
257 | @check_closed
258 | def close(self):
259 | """Close the cursor."""
260 | self.closed = True
261 |
262 | @check_closed
263 | def execute(self, operation, parameters=None):
264 | query = apply_parameters(operation, parameters)
265 | results = self._stream_query(query)
266 |
267 | # `_stream_query` returns a generator that produces the rows; we need to
268 | # consume the first row so that `description` is properly set, so let's
269 | # consume it and insert it back if it is not the header.
270 | try:
271 | first_row = next(results)
272 | self._results = (
273 | results if self.header else itertools.chain([first_row], results)
274 | )
275 | except StopIteration:
276 | self._results = iter([])
277 |
278 | return self
279 |
280 | @check_closed
281 | def executemany(self, operation, seq_of_parameters=None):
282 | raise exceptions.NotSupportedError(
283 | "`executemany` is not supported, use `execute` instead"
284 | )
285 |
286 | @check_result
287 | @check_closed
288 | def fetchone(self):
289 | """
290 | Fetch the next row of a query result set, returning a single sequence,
291 | or `None` when no more data is available.
292 | """
293 | try:
294 | return self.next()
295 | except StopIteration:
296 | return None
297 |
298 | @check_result
299 | @check_closed
300 | def fetchmany(self, size=None):
301 | """
302 | Fetch the next set of rows of a query result, returning a sequence of
303 | sequences (e.g. a list of tuples). An empty sequence is returned when
304 | no more rows are available.
305 | """
306 | size = size or self.arraysize
307 | return list(itertools.islice(self._results, size))
308 |
309 | @check_result
310 | @check_closed
311 | def fetchall(self):
312 | """
313 | Fetch all (remaining) rows of a query result, returning them as a
314 | sequence of sequences (e.g. a list of tuples). Note that the cursor's
315 | arraysize attribute can affect the performance of this operation.
316 | """
317 | return list(self._results)
318 |
319 | @check_closed
320 | def setinputsizes(self, sizes):
321 | # not supported
322 | pass
323 |
324 | @check_closed
325 | def setoutputsizes(self, sizes):
326 | # not supported
327 | pass
328 |
329 | @check_closed
330 | def __iter__(self):
331 | return self
332 |
333 | @check_closed
334 | def __next__(self):
335 | return next(self._results)
336 |
337 | next = __next__
338 |
339 | def _stream_query(self, query):
340 | """
341 | Stream rows from a query.
342 |
343 | This method will yield rows as the data is returned in chunks from the
344 | server.
345 | """
346 | self.description = None
347 |
348 | headers = {"Content-Type": "application/json"}
349 |
350 | payload = {"query": query, "context": self.context, "header": self.header}
351 |
352 | if self.user:
353 | auth = requests.auth.HTTPBasicAuth(self.user, self.password)
354 | elif self.jwt:
355 | auth = BearerAuth(self.jwt)
356 | else:
357 | auth = None
358 |
359 | r = requests.post(
360 | self.url,
361 | stream=True,
362 | headers=headers,
363 | json=payload,
364 | auth=auth,
365 | verify=self.ssl_verify_cert,
366 | cert=self.ssl_client_cert,
367 | proxies=self.proxies,
368 | )
369 | if r.encoding is None:
370 | r.encoding = "utf-8"
371 | # raise any error messages
372 | if r.status_code != 200:
373 | try:
374 | payload = r.json()
375 | except Exception:
376 | payload = {
377 | "error": "Unknown error",
378 | "errorClass": "Unknown",
379 | "errorMessage": r.text,
380 | }
381 |
382 | category = payload.pop("category", payload.pop("errorClass", "Unknown"))
383 | error = payload.get("error") or "Unknown"
384 | error_message = payload.get("errorMessage") or "Unknown"
385 | msg = f"{error} ({category}): {error_message}"
386 | raise exceptions.ProgrammingError(msg)
387 |
388 | # Druid will stream the data in chunks of 8k bytes, splitting the JSON
389 | # between them; setting `chunk_size` to `None` makes it use the server
390 | # size
391 | chunks = r.iter_content(chunk_size=None, decode_unicode=True)
392 | Row = None
393 | for row in rows_from_chunks(chunks):
394 | # update description
395 | if self.description is None:
396 | self.description = (
397 | list(row.items()) if self.header else get_description_from_row(row)
398 | )
399 |
400 | # return row in namedtuple
401 | if Row is None:
402 | Row = namedtuple("Row", row.keys(), rename=True)
403 | yield Row(*row.values())
404 |
405 |
406 | def rows_from_chunks(chunks):
407 | """
408 | A generator that yields rows from JSON chunks.
409 |
410 | Druid will return the data in chunks, but they are not aligned with the
411 | JSON objects. This function will parse all complete rows inside each chunk,
412 | yielding them as soon as possible.
413 | """
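414 |     # e.g. the chunks '[{"a": 1}, {"a":' and ' 2}]' yield {"a": 1}, then {"a": 2}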
414 | body = ""
415 | for chunk in chunks:
416 | if chunk:
417 | body = "".join((body, chunk))
418 |
419 | # find last complete row
420 | boundary = 0
421 | brackets = 0
422 | in_string = False
423 | for i, char in enumerate(body):
424 | if char == '"':
425 | if not in_string:
426 | in_string = True
427 | elif body[i - 1] != "\\":
428 | in_string = False
429 |
430 | if in_string:
431 | continue
432 |
433 | if char == "{":
434 | brackets += 1
435 | elif char == "}":
436 | brackets -= 1
437 | if brackets == 0 and i > boundary:
438 | boundary = i + 1
439 |
440 | rows = body[:boundary].lstrip("[,")
441 | body = body[boundary:]
442 |
443 | for row in json.loads(
444 | "[{rows}]".format(rows=rows), object_pairs_hook=OrderedDict
445 | ):
446 | yield row
447 |
448 |
449 | def apply_parameters(operation, parameters):
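450 |     # Interpolate pyformat-style parameters into the SQL text, escaping each value,
450 |     # e.g. apply_parameters("SELECT %(key)s", {"key": "bar"}) -> "SELECT 'bar'"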
450 | if not parameters:
451 | return operation
452 |
453 | escaped_parameters = {key: escape(value) for key, value in parameters.items()}
454 | return operation % escaped_parameters
455 |
456 |
457 | def escape(value):
458 | """
459 | Escape the parameter value.
460 |
461 | Note that bool is a subclass of int so order of statements matter.
462 | """
463 |
464 | if value == "*":
465 | return value
466 | elif isinstance(value, str):
467 | return "'{}'".format(value.replace("'", "''"))
468 | elif isinstance(value, bool):
469 | return "TRUE" if value else "FALSE"
470 | elif isinstance(value, (int, float)):
471 | return value
472 | elif isinstance(value, (list, tuple)):
473 | return ", ".join(escape(element) for element in value)
474 |
--------------------------------------------------------------------------------
/tests/utils/test_filters.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 | import pytest
4 |
5 | from pydruid.utils import dimensions, filters
6 |
7 |
8 | class TestDimension:
9 | def test_dimension(self):
10 | d = filters.Dimension("dim")
11 | actual = filters.Filter.build_filter(d == "val")
12 | expected = {"type": "selector", "dimension": "dim", "value": "val"}
13 | assert actual == expected
14 |
15 | def test_ne_dimension(self):
16 | d = filters.Dimension("dim")
17 | actual = filters.Filter.build_filter(d != "val")
18 | expected = {
19 | "field": {"dimension": "dim", "type": "selector", "value": "val"},
20 | "type": "not",
21 | }
22 | assert actual == expected
23 |
24 |
25 | class TestFilter:
26 | def test_selector_filter(self):
27 | actual = filters.Filter.build_filter(
28 | filters.Filter(dimension="dim", value="val")
29 | )
30 | expected = {"type": "selector", "dimension": "dim", "value": "val"}
31 | assert actual == expected
32 |
33 | def test_selector_filter_extraction_fn(self):
34 | extraction_fn = dimensions.RegexExtraction("([a-b])")
35 | f = filters.Filter(
36 | dimension="dim", value="v", extraction_function=extraction_fn
37 | )
38 | actual = filters.Filter.build_filter(f)
39 | expected = {
40 | "type": "selector",
41 | "dimension": "dim",
42 | "value": "v",
43 | "extractionFn": {"type": "regex", "expr": "([a-b])"},
44 | }
45 | assert actual == expected
46 |
47 | def test_extraction_filter(self):
48 | extraction_fn = dimensions.PartialExtraction("([a-b])")
49 | f = filters.Filter(
50 | type="extraction",
51 | dimension="dim",
52 | value="v",
53 | extraction_function=extraction_fn,
54 | )
55 | actual = filters.Filter.build_filter(f)
56 | expected = {
57 | "type": "extraction",
58 | "dimension": "dim",
59 | "value": "v",
60 | "extractionFn": {"type": "partial", "expr": "([a-b])"},
61 | }
62 | assert actual == expected
63 |
64 | def test_javascript_filter(self):
65 | actual = filters.Filter.build_filter(
66 | filters.Filter(
67 | type="javascript", dimension="dim", function="function(x){return true}"
68 | )
69 | )
70 | expected = {
71 | "type": "javascript",
72 | "dimension": "dim",
73 | "function": "function(x){return true}",
74 | }
75 | assert actual == expected
76 |
77 | def test_bound_filter(self):
78 | actual = filters.Filter.build_filter(
79 | filters.Bound(
80 | dimension="dim",
81 | lower="1",
82 | lowerStrict=True,
83 | upper="10",
84 | upperStrict=True,
85 | ordering="numeric",
86 | )
87 | )
88 | expected = {
89 | "type": "bound",
90 | "dimension": "dim",
91 | "lower": "1",
92 | "lowerStrict": True,
93 | "upper": "10",
94 | "upperStrict": True,
95 | "alphaNumeric": False,
96 | "ordering": "numeric",
97 | }
98 | assert actual == expected
99 |
100 | def test_bound_filter_with_extraction_function(self):
101 | f = filters.Bound(
102 | dimension="d",
103 | lower="1",
104 | upper="3",
105 | upperStrict=True,
106 | extraction_function=dimensions.RegexExtraction(".*([0-9]+)"),
107 | )
108 | actual = filters.Filter.build_filter(f)
109 | expected = {
110 | "type": "bound",
111 | "dimension": "d",
112 | "lower": "1",
113 | "lowerStrict": False,
114 | "upper": "3",
115 | "upperStrict": True,
116 | "ordering": "lexicographic",
117 | "alphaNumeric": False,
118 | "extractionFn": {"type": "regex", "expr": ".*([0-9]+)"},
119 | }
120 | assert actual == expected
121 |
122 | def test_bound_filter_alphanumeric(self):
123 | actual = filters.Filter.build_filter(
124 | filters.Bound(
125 | dimension="dim",
126 | lower="1",
127 | lowerStrict=True,
128 | upper="10",
129 | upperStrict=True,
130 | alphaNumeric=True,
131 | )
132 | )
133 | expected = {
134 | "type": "bound",
135 | "dimension": "dim",
136 | "lower": "1",
137 | "lowerStrict": True,
138 | "upper": "10",
139 | "upperStrict": True,
140 | "alphaNumeric": True,
141 | "ordering": "lexicographic",
142 | }
143 | assert actual == expected
144 |
145 | def test_bound_filter_lower_not_included(self):
146 | actual = filters.Filter.build_filter(
147 | filters.Bound(dimension="dim", upper="10", upperStrict=True)
148 | )
149 | expected = {
150 | "type": "bound",
151 | "dimension": "dim",
152 | "lower": None,
153 | "lowerStrict": False,
154 | "upper": "10",
155 | "upperStrict": True,
156 | "alphaNumeric": False,
157 | "ordering": "lexicographic",
158 | }
159 | assert actual == expected
160 |
161 | def test_spatial_filter_rectangle(self):
162 | actual = filters.Filter.build_filter(
163 | filters.Spatial(
164 | dimension="dim",
165 | bound_type="rectangle",
166 | minCoords=[100.0, 100.0],
167 | maxCoords=[100.1, 100.1],
168 |                 radius=10.0,  # ignored for rectangle bounds, as the expected dict below shows
169 | )
170 | )
171 | expected = {
172 | "type": "spatial",
173 | "dimension": "dim",
174 | "bound": {
175 | "type": "rectangle",
176 | "minCoords": [100.0, 100.0],
177 | "maxCoords": [100.1, 100.1],
178 | },
179 | }
180 | assert actual == expected
181 |
182 | def test_spatial_filter_radius(self):
183 | actual = filters.Filter.build_filter(
184 | filters.Spatial(
185 | dimension="dim",
186 | bound_type="radius",
187 | coords=[100.0, 100.0],
188 | radius=100.0,
189 | )
190 | )
191 | expected = {
192 | "type": "spatial",
193 | "dimension": "dim",
194 | "bound": {"type": "radius", "coords": [100.0, 100.0], "radius": 100.0},
195 | }
196 | assert actual == expected
197 |
198 | def test_spatial_filter_polygon(self):
199 | actual = filters.Filter.build_filter(
200 | filters.Spatial(
201 | dimension="dim",
202 | bound_type="polygon",
203 | abscissa=[2.0, 3.0, 7.0, 8.0],
204 | ordinate=[4.0, 9.0, 8.0, 1.0],
205 | )
206 | )
207 | expected = {
208 | "type": "spatial",
209 | "dimension": "dim",
210 | "bound": {
211 | "type": "polygon",
212 | "abscissa": [2.0, 3.0, 7.0, 8.0],
213 | "ordinate": [4.0, 9.0, 8.0, 1.0],
214 | },
215 | }
216 | assert actual == expected
217 |
218 | def test_interval_filter(self):
219 | actual = filters.Filter.build_filter(
220 | filters.Interval(
221 | dimension="dim",
222 | intervals=["2014-10-01T00:00:00.000Z/2014-10-07T00:00:00.000Z"],
223 | )
224 | )
225 | expected = {
226 | "type": "interval",
227 | "dimension": "dim",
228 | "intervals": ["2014-10-01T00:00:00.000Z/2014-10-07T00:00:00.000Z"],
229 | }
230 | assert actual == expected
231 |
232 | def test_interval_with_extraction_function(self):
233 | f = filters.Interval(
234 | dimension="dim",
235 | intervals=["2014-10-01T00:00:00.000Z/2014-10-07T00:00:00.000Z"],
236 | extraction_function=dimensions.RegexExtraction(".*([0-9]+)"),
237 | )
238 | actual = filters.Filter.build_filter(f)
239 | expected = {
240 | "type": "interval",
241 | "dimension": "dim",
242 | "intervals": ["2014-10-01T00:00:00.000Z/2014-10-07T00:00:00.000Z"],
243 | "extractionFn": {"type": "regex", "expr": ".*([0-9]+)"},
244 | }
245 | assert actual == expected
246 |
247 | def test_and_filter(self):
248 | f1 = filters.Filter(dimension="dim1", value="val1")
249 | f2 = filters.Filter(dimension="dim2", value="val2")
250 | actual = filters.Filter.build_filter(f1 & f2)
251 | expected = {
252 | "type": "and",
253 | "fields": [
254 | {"type": "selector", "dimension": "dim1", "value": "val1"},
255 | {"type": "selector", "dimension": "dim2", "value": "val2"},
256 | ],
257 | }
258 | assert actual == expected
259 |
260 | def test_and_filter_multiple(self):
261 | f1 = filters.Filter(dimension="dim1", value="val1")
262 | f2 = filters.Filter(dimension="dim2", value="val2")
263 | f3 = filters.Filter(dimension="dim3", value="val3")
264 |         and_filter = filters.Filter(type="and", fields=[f1, f2, f3])
265 |         actual = filters.Filter.build_filter(and_filter)
266 | expected = {
267 | "type": "and",
268 | "fields": [
269 | {"type": "selector", "dimension": "dim1", "value": "val1"},
270 | {"type": "selector", "dimension": "dim2", "value": "val2"},
271 | {"type": "selector", "dimension": "dim3", "value": "val3"},
272 | ],
273 | }
274 | assert actual == expected
275 |
276 | def test_or_filter(self):
277 | f1 = filters.Filter(dimension="dim1", value="val1")
278 | f2 = filters.Filter(dimension="dim2", value="val2")
279 | actual = filters.Filter.build_filter(f1 | f2)
280 | expected = {
281 | "type": "or",
282 | "fields": [
283 | {"type": "selector", "dimension": "dim1", "value": "val1"},
284 | {"type": "selector", "dimension": "dim2", "value": "val2"},
285 | ],
286 | }
287 | assert actual == expected
288 |
289 | def test_nested_mix_filter(self):
290 | f1 = filters.Filter(dimension="dim1", value="val1")
291 | f2 = filters.Filter(dimension="dim2", value="val2")
292 | f3 = filters.Filter(dimension="dim3", value="val3")
293 | f4 = filters.Filter(dimension="dim4", value="val4")
294 | f5 = filters.Filter(dimension="dim5", value="val5")
295 | f6 = filters.Filter(dimension="dim6", value="val6")
296 | f7 = filters.Filter(dimension="dim7", value="val7")
297 | f8 = filters.Filter(dimension="dim8", value="val8")
298 | actual = filters.Filter.build_filter(
299 | f1 & ~f2 & f3 & (f4 | ~f5 | f6 | (f7 & ~f8))
300 | )
301 | expected = {
302 | "fields": [
303 | {"dimension": "dim1", "type": "selector", "value": "val1"},
304 | {
305 | "field": {"dimension": "dim2", "type": "selector", "value": "val2"},
306 | "type": "not",
307 | },
308 | {"dimension": "dim3", "type": "selector", "value": "val3"},
309 | {
310 | "fields": [
311 | {"dimension": "dim4", "type": "selector", "value": "val4"},
312 | {
313 | "field": {
314 | "dimension": "dim5",
315 | "type": "selector",
316 | "value": "val5",
317 | },
318 | "type": "not",
319 | },
320 | {"dimension": "dim6", "type": "selector", "value": "val6"},
321 | {
322 | "fields": [
323 | {
324 | "dimension": "dim7",
325 | "type": "selector",
326 | "value": "val7",
327 | },
328 | {
329 | "field": {
330 | "dimension": "dim8",
331 | "type": "selector",
332 | "value": "val8",
333 | },
334 | "type": "not",
335 | },
336 | ],
337 | "type": "and",
338 | },
339 | ],
340 | "type": "or",
341 | },
342 | ],
343 | "type": "and",
344 | }
345 | assert actual == expected
346 |
347 | def test_or_filter_multiple(self):
348 | f1 = filters.Filter(dimension="dim1", value="val1")
349 | f2 = filters.Filter(dimension="dim2", value="val2")
350 | f3 = filters.Filter(dimension="dim3", value="val3")
351 |         or_filter = filters.Filter(type="or", fields=[f1, f2, f3])
352 |         actual = filters.Filter.build_filter(or_filter)
353 | expected = {
354 | "type": "or",
355 | "fields": [
356 | {"type": "selector", "dimension": "dim1", "value": "val1"},
357 | {"type": "selector", "dimension": "dim2", "value": "val2"},
358 | {"type": "selector", "dimension": "dim3", "value": "val3"},
359 | ],
360 | }
361 | assert actual == expected
362 |
363 | def test_not_filter(self):
364 | f = ~filters.Filter(dimension="dim", value="val")
365 | actual = filters.Filter.build_filter(f)
366 |         # Call `build_filter` twice to make sure it does not
367 |         # mutate the filter object `f` passed in as its argument.
368 | actual = filters.Filter.build_filter(f)
369 | expected = {
370 | "type": "not",
371 | "field": {"type": "selector", "dimension": "dim", "value": "val"},
372 | }
373 | assert actual == expected
374 |
375 | def test_nested_not_or_filter(self):
376 | f1 = filters.Filter(dimension="dim1", value="val1")
377 | f2 = filters.Filter(dimension="dim2", value="val2")
378 | actual = filters.Filter.build_filter(~(f1 | f2))
379 | expected = {
380 | "type": "not",
381 | "field": {
382 | "type": "or",
383 | "fields": [
384 | {"type": "selector", "dimension": "dim1", "value": "val1"},
385 | {"type": "selector", "dimension": "dim2", "value": "val2"},
386 | ],
387 | },
388 | }
389 | assert actual == expected
390 |
391 | def test_in_filter(self):
392 | actual = filters.Filter.build_filter(
393 | filters.Filter(type="in", dimension="dim", values=["val1", "val2", "val3"])
394 | )
395 | expected = {
396 | "type": "in",
397 | "dimension": "dim",
398 | "values": ["val1", "val2", "val3"],
399 | }
400 | assert actual == expected
401 |
402 | def test_not_in_filter(self):
403 | actual = filters.Filter.build_filter(
404 | ~filters.Filter(type="in", dimension="dim", values=["val1", "val2", "val3"])
405 | )
406 | expected = {
407 | "type": "not",
408 | "field": {
409 | "type": "in",
410 | "dimension": "dim",
411 | "values": ["val1", "val2", "val3"],
412 | },
413 | }
414 | assert actual == expected
415 |
416 | def test_invalid_filter(self):
417 | with pytest.raises(NotImplementedError):
418 | filters.Filter(type="invalid", dimension="dim", value="val")
419 |
420 | def test_columnComparison_filter(self):
421 | actual = filters.Filter.build_filter(
422 | filters.Filter(
423 | type="columnComparison",
424 | dimensions=["dim1", dimensions.DimensionSpec("dim2", "dim2")],
425 | )
426 | )
427 | expected = {
428 | "type": "columnComparison",
429 | "dimensions": [
430 | "dim1",
431 | {"type": "default", "dimension": "dim2", "outputName": "dim2"},
432 | ],
433 | }
434 | assert actual == expected
435 |
436 | def test_search_filter(self):
437 |         # Without the caseSensitive param; it defaults to "false"
438 | actual = filters.Filter.build_filter(
439 | filters.Filter(type="search", dimension="dim", value="val")
440 | )
441 | expected = {
442 | "type": "search",
443 | "dimension": "dim",
444 | "query": {"type": "contains", "caseSensitive": "false", "value": "val"},
445 | }
446 | assert actual == expected
447 |
448 | # With caseSensitive param
449 | actual = filters.Filter.build_filter(
450 | filters.Filter(
451 | type="search", dimension="dim", value="val", caseSensitive="true"
452 | )
453 | )
454 | expected = {
455 | "type": "search",
456 | "dimension": "dim",
457 | "query": {"type": "contains", "caseSensitive": "true", "value": "val"},
458 | }
459 | assert actual == expected
460 |
461 | def test_like_filter(self):
462 | actual = filters.Filter.build_filter(
463 | filters.Filter(type="like", dimension="dim", pattern="%val%")
464 | )
465 | expected = {"type": "like", "dimension": "dim", "pattern": "%val%"}
466 | assert actual == expected
467 |
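These tests exercise the whole filter DSL: `Dimension` comparisons and the `&`, `|` and `~` operators compose `Filter` objects, and `Filter.build_filter` serialises the composition into the Druid-native dict. A small end-to-end sketch using only the API shown above (the dimension names and values are made up):

    from pydruid.utils import filters

    country = filters.Dimension("country")
    city = filters.Dimension("city")
    combined = (country == "US") & ~(city == "NYC")
    # -> {"type": "and", "fields": [<selector>, {"type": "not", "field": <selector>}]}
    print(filters.Filter.build_filter(combined))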
--------------------------------------------------------------------------------