├── .gitignore
├── README.md
├── example.py
├── format.py
└── sql_parser.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | env/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | A validating SQL parser that can lint SELECT statements.
2 | 
3 | Lots left to do here:
4 | 1. Finish the formatting logic (only partially implemented)
5 | 2. Cross-database support (Snowflake, Redshift, BigQuery, Postgres, etc.)
6 | 3. Handle more Jinja blocks (config, ref, source, etc.)
7 | 4. Add validation to the formatting code (make sure we don't drop tokens)
8 | 5. Make this distributable (editor plugin as a POC?)
9 | 6. Lots and lots of testing!
10 | 
11 | ### Requirements
12 | ```
13 | pip install pyparsing
14 | ```
15 | 
16 | 
17 | ### Example
18 | ```
19 | import sql_parser
20 | from format import Formatter
21 | 
22 | parser = sql_parser.BigQueryViewParser()
23 | sql = """
24 | with my_cte as (select sum(case when a=1 then 1 else 0 end) as pivoted from table) select * from my_cte
25 | """
26 | 
27 | ast = parser._parse(sql)
28 | f = Formatter()
29 | f.format(ast)
30 | f.document.pprint()
31 | ```
32 | 
33 | Output:
34 | 
35 | ```
36 | with my_cte as (
37 | 
38 |     select
39 |         sum(case
40 |             when a = 1 then 1
41 |             else 0
42 |         end) as pivoted
43 | 
44 |     from table
45 | 
46 | )
47 | 
48 | select
49 |     *
50 | 
51 | from my_cte
52 | ```
53 | 
54 | ### Thanks
55 | 
56 | Heavily inspired by (and partially copied from) code in:
57 | - https://github.com/mozilla/moz-sql-parser
58 | - https://github.com/pyparsing/pyparsing/blob/master/examples/bigquery_view_parser.py
59 | 
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | 
2 | import sql_parser
3 | from format import Formatter
4 | 
5 | parser = sql_parser.BigQueryViewParser()
6 | sql = """
7 | with my_cte as (select sum(case when a=1 then 1 else 0 end) as pivoted from table) select * from my_cte
8 | """
9 | 
10 | ast = parser._parse(sql)
11 | f = Formatter()
12 | f.format(ast)
13 | f.document.pprint()
14 | 
--------------------------------------------------------------------------------
/format.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | #
3 | # This Source Code Form is subject to the terms of the Mozilla Public
4 | # License, v. 2.0. If a copy of the MPL was not distributed with this file,
5 | # You can obtain one at http://mozilla.org/MPL/2.0/.
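# --- Hedged addition (not part of the original file). Further down, format.py
# refers to `string_types`, `text`, and `join_keywords` (in dispatch(),
# _literal(), and _on()) without importing or defining them; they come from the
# moz-sql-parser code this module borrows from. A minimal sketch of Python 3
# shims that would make those names resolve (the exact join_keywords set here
# is an assumption):
#
#     string_types = (str,)    # stand-in for mo_future.string_types
#     text = str               # stand-in for mo_future.text_type
#     join_keywords = {
#         "join", "inner join", "cross join", "left join", "left outer join",
#         "right join", "right outer join", "full join", "full outer join",
#     }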
6 | # 7 | # Author: Beto Dealmeida (beto@dealmeida.net) 8 | # 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import unicode_literals 13 | 14 | import re 15 | 16 | from contextlib import contextmanager 17 | 18 | VALID = re.compile(r'[a-zA-Z_]\w*') 19 | 20 | 21 | class Token(object): 22 | def __init__(self, token): 23 | self.token = token 24 | 25 | class Whitespace(object): 26 | def __init__(self, count, char): 27 | self.count = count 28 | self.char = char 29 | 30 | class SQLDocument(object): 31 | def __init__(self): 32 | self.tokens = [] 33 | self.indent = 0 34 | self.indent_char = ' ' 35 | 36 | self._newline = False 37 | self.commas = 'back' 38 | 39 | def add(self, token): 40 | if hasattr(token, 'asList'): 41 | token = token.asList() 42 | if type(token) not in (list, tuple): 43 | token = [token] 44 | if self._newline: 45 | self.tokens.append(self.indent_char * self.indent) 46 | self._newline = False 47 | self.tokens.extend(list(token)) 48 | 49 | def newline(self): 50 | self.tokens.append('\n') 51 | self._newline = True 52 | 53 | @contextmanager 54 | def indented(self, count=4): 55 | self.indent += count 56 | yield self 57 | self.indent -= count 58 | 59 | def pprint(self): 60 | print("".join(self.tokens)) 61 | 62 | def Operator(op, parentheses=False): 63 | op = ' {0} '.format(op) 64 | def func(self, json): 65 | out = op.join(self.dispatch(v) for v in json) 66 | if parentheses: 67 | out = '({0})'.format(out) 68 | return out 69 | return func 70 | 71 | 72 | class Formatter: 73 | 74 | clauses = [ 75 | 'ctes', 76 | 'columns', 77 | 'from_', 78 | 'where', 79 | 'group_by_terms', 80 | 'having_expr', 81 | 'order_by_terms', 82 | 'limit', 83 | # 'offset', # not supported TODO 84 | ] 85 | 86 | # simple operators 87 | _concat = Operator('||') 88 | _mul = Operator('*') 89 | _div = Operator('/', parentheses=True) 90 | _add = Operator('+') 91 | _sub = Operator('-', parentheses=True) 92 | _neq = Operator('<>') 93 | _gt = Operator('>') 94 | _lt = Operator('<') 95 | _gte = Operator('>=') 96 | _lte = Operator('<=') 97 | _eq = Operator('=') 98 | _or = Operator('OR') 99 | _and = Operator('AND') 100 | 101 | def __init__(self): 102 | self.document = SQLDocument() 103 | 104 | def format(self, json): 105 | if 'union' in json: 106 | return self.union(json['union']) 107 | else: 108 | return self.query(json) 109 | 110 | def dispatch(self, json): 111 | if isinstance(json, list): 112 | return self.delimited_list(json) 113 | if isinstance(json, dict): 114 | if len(json) == 0: 115 | return '' 116 | elif 'value' in json: 117 | return self.value(json) 118 | elif 'from' in json: 119 | # Nested queries 120 | return '({})'.format(self.format(json)) 121 | elif 'select' in json: 122 | # Nested queries 123 | return '({})'.format(self.format(json)) 124 | else: 125 | return self.op(json) 126 | if isinstance(json, string_types): 127 | return json 128 | 129 | return text(json) 130 | 131 | def delimited_list(self, json): 132 | return ', '.join(self.dispatch(element) for element in json) 133 | 134 | def value(self, json): 135 | parts = [self.dispatch(json['value'])] 136 | if 'name' in json: 137 | parts.extend(['AS', self.dispatch(json['name'])]) 138 | return ' '.join(parts) 139 | 140 | def op(self, json): 141 | if 'on' in json: 142 | return self._on(json) 143 | 144 | if len(json) > 1: 145 | raise Exception('Operators should have only one key!') 146 | key, value = list(json.items())[0] 147 | 148 | # check if the attribute exists, and call the corresponding method; 149 | # note 
that we disallow keys that start with `_` to avoid giving access 150 | # to magic methods 151 | attr = '_{0}'.format(key) 152 | if hasattr(self, attr) and not key.startswith('_'): 153 | method = getattr(self, attr) 154 | return method(value) 155 | 156 | # treat as regular function call 157 | if isinstance(value, dict) and len(value) == 0: 158 | return key.upper() + "()" # NOT SURE IF AN EMPTY dict SHOULD BE DELT WITH HERE, OR IN self.dispatch() 159 | else: 160 | return '{0}({1})'.format(key.upper(), self.dispatch(value)) 161 | 162 | def _exists(self, value): 163 | return '{0} IS NOT NULL'.format(self.dispatch(value)) 164 | 165 | def _missing(self, value): 166 | return '{0} IS NULL'.format(self.dispatch(value)) 167 | 168 | def _like(self, pair): 169 | return '{0} LIKE {1}'.format(self.dispatch(pair[0]), self.dispatch(pair[1])) 170 | 171 | def _nlike(self, pair): 172 | return '{0} NOT LIKE {1}'.format(self.dispatch(pair[0]), self.dispatch(pair[1])) 173 | 174 | def _is(self, pair): 175 | return '{0} IS {1}'.format(self.dispatch(pair[0]), self.dispatch(pair[1])) 176 | 177 | def _in(self, json): 178 | valid = self.dispatch(json[1]) 179 | # `(10, 11, 12)` does not get parsed as literal, so it's formatted as 180 | # `10, 11, 12`. This fixes it. 181 | if not valid.startswith('('): 182 | valid = '({0})'.format(valid) 183 | 184 | return '{0} IN {1}'.format(json[0], valid) 185 | 186 | def _nin(self, json): 187 | valid = self.dispatch(json[1]) 188 | # `(10, 11, 12)` does not get parsed as literal, so it's formatted as 189 | # `10, 11, 12`. This fixes it. 190 | if not valid.startswith('('): 191 | valid = '({0})'.format(valid) 192 | 193 | return '{0} NOT IN {1}'.format(json[0], valid) 194 | 195 | def _case(self, checks): 196 | parts = ['CASE'] 197 | for check in checks: 198 | if isinstance(check, dict): 199 | parts.extend(['WHEN', self.dispatch(check['when'])]) 200 | parts.extend(['THEN', self.dispatch(check['then'])]) 201 | else: 202 | parts.extend(['ELSE', self.dispatch(check)]) 203 | parts.append('END') 204 | return ' '.join(parts) 205 | 206 | def _literal(self, json): 207 | if isinstance(json, list): 208 | return '({0})'.format(', '.join(self._literal(v) for v in json)) 209 | elif isinstance(json, string_types): 210 | return "'{0}'".format(json.replace("'", "''")) 211 | else: 212 | return str(json) 213 | 214 | def _on(self, json): 215 | detected_join = join_keywords & set(json.keys()) 216 | if len(detected_join) == 0: 217 | raise Exception( 218 | 'Fail to detect join type! 
Detected: "{}" Except one of: "{}"'.format( 219 | [on_keyword for on_keyword in json if on_keyword != 'on'][0], 220 | '", "'.join(join_keywords) 221 | ) 222 | ) 223 | 224 | join_keyword = detected_join.pop() 225 | 226 | return '{0} {1} ON {2}'.format( 227 | join_keyword.upper(), self.dispatch(json[join_keyword]), self.dispatch(json['on']) 228 | ) 229 | 230 | def union(self, json): 231 | return ' UNION '.join(self.query(query) for query in json) 232 | 233 | def query(self, json): 234 | for clause in self.clauses: 235 | func = getattr(self, clause, None) 236 | if func: 237 | func(json) 238 | 239 | def add_expr_list(self, expr): 240 | if isinstance(expr, (str, int, float)): 241 | self.add_expr(expr) 242 | else: 243 | for i, field in enumerate(expr): 244 | self.add_expr(field) 245 | if i != len(expr) - 1: 246 | self.document.add(', ') 247 | 248 | def add_expr(self, expr): 249 | doc = self.document 250 | 251 | if hasattr(expr, 'getName') and expr.getName() == 'operator': 252 | if expr.assoc == 'unary': 253 | doc.add(str(expr.op.match)) 254 | if len(expr.op.match) > 1: 255 | doc.add(' ') 256 | self.add_expr(expr.tokens) 257 | elif expr.assoc == 'binary': 258 | self.add_expr(expr.tokens[0]) 259 | doc.add(' ') 260 | doc.add(str(expr.op.match)) 261 | doc.add(' ') 262 | self.add_expr(expr.tokens[1]) 263 | elif expr.assoc == 'ternary': 264 | import ipdb; ipdb.set_trace() 265 | else: 266 | import ipdb; ipdb.set_trace() 267 | elif hasattr(expr, 'getName') and expr.getName() == 'function': 268 | doc.add(str(expr.func)) 269 | doc.add('(') 270 | self.add_expr_list(expr.tokens[0]) 271 | doc.add(')') 272 | elif hasattr(expr, 'getName') and expr.getName() == 'window function': 273 | doc.add(str(expr.func)) 274 | doc.add('(') 275 | self.add_expr_list(expr.func_args) 276 | doc.add(')') 277 | doc.add(' ') 278 | doc.add('over') 279 | doc.add(' ') 280 | doc.add('(') 281 | if expr.partition_args: 282 | # TODO: Method for printing list 283 | doc.add('partition by ') 284 | self.add_expr_list(expr.partition_args) 285 | if expr.order_args: 286 | if expr.partition_args: 287 | doc.add(' ') 288 | doc.add('order by ') 289 | self.add_expr_list(expr.order_args) 290 | for arg in expr.window_args: 291 | doc.add(' ') 292 | doc.add(arg) 293 | doc.add(')') 294 | elif hasattr(expr, 'getName') and expr.getName() == 'case': 295 | doc.add('case') 296 | doc.newline() 297 | with doc.indented(): 298 | for when in expr.whens: 299 | doc.add('when') 300 | doc.add(' ') 301 | self.add_expr(when['when']) 302 | doc.add(' ') 303 | doc.add('then') 304 | doc.add(' ') 305 | self.add_expr(when['then']) 306 | doc.newline() 307 | if expr._else: 308 | doc.add('else') 309 | doc.add(' ') 310 | self.add_expr(expr._else) 311 | doc.newline() 312 | doc.add('end') 313 | elif hasattr(expr, 'getName') and expr.getName() == 'select': 314 | import ipdb; ipdb.set_trace() 315 | #elif hasattr(expr, 'getName') and expr.getName() != 'quoted_expr': 316 | # import ipdb; ipdb.set_trace() 317 | 318 | elif isinstance(expr, (str, int)): 319 | doc.add(expr) 320 | else: 321 | for el in expr: 322 | self.add_expr(el) 323 | 324 | def add_column(self, column): 325 | if type(column) == str: 326 | self.document.add(column) 327 | elif column.select: 328 | self.document.add('(') 329 | self.document.newline() 330 | with self.document.indented(): 331 | self.query(column) 332 | self.document.newline() 333 | self.document.add(')') 334 | 335 | else: 336 | self.add_expr(column.quoted_expr) 337 | 338 | if type(column) != str and column.alias: 339 | self.document.add(' ') 340 | 
self.document.add('as') 341 | self.document.add(' ') 342 | self.document.add(column.alias) 343 | 344 | def add_cte(self, cte): 345 | self.document.add(cte.cte_name) 346 | self.document.add(' ') 347 | self.document.add('as') 348 | self.document.add(' ') 349 | self.document.add('(') 350 | self.document.newline() 351 | self.document.newline() 352 | with self.document.indented(): 353 | self.query(cte) 354 | self.document.newline() 355 | self.document.newline() 356 | self.document.add(')') 357 | 358 | def ctes(self, json): 359 | if len(json.ctes) == 0: 360 | return 361 | 362 | self.document.add('with ') 363 | for i, cte in enumerate(json.ctes): 364 | self.add_cte(cte) 365 | if i != len(json.ctes) - 1: 366 | self.document.add(',') 367 | self.document.newline() 368 | self.document.newline() 369 | 370 | 371 | def columns(self, json): 372 | self.document.add('select') 373 | self.document.newline() 374 | with self.document.indented(): 375 | for i, column in enumerate(json.columns): 376 | if self.document.commas == 'front' and i != 0: 377 | self.document.add(', ') 378 | 379 | self.add_column(column) 380 | 381 | if self.document.commas == 'back' and i != len(json.columns)-1: 382 | self.document.add(',') 383 | self.document.newline() 384 | 385 | # This ain't it :/ 386 | def add_from(self, from_): 387 | self.document.newline() 388 | self.document.add('from') 389 | self.document.add(' ') 390 | if type(from_) == str: 391 | self.document.add(from_) 392 | else: 393 | self.document.add(from_.table.asList()) # TODO: SBQ 394 | 395 | for join in from_.joins: 396 | if join.join_op[0] == ',': 397 | self.document.add(',') 398 | self.document.newline() 399 | self.document.add(join.table) 400 | else: 401 | self.document.newline() 402 | self.add_expr(join.asList()) 403 | 404 | def from_(self, json): 405 | if 'from' not in json: 406 | return 407 | from_ = json['from'] 408 | if 'union' in from_: 409 | return self.union(from_['union']) 410 | 411 | self.add_from(from_[0]) 412 | 413 | def where(self, json): 414 | if 'where' not in json: 415 | return 416 | 417 | self.document.newline() 418 | self.document.add('where ') 419 | self.add_expr(json['where']) 420 | 421 | def group_by_terms(self, json): 422 | if 'group_by_terms' not in json: 423 | return 424 | 425 | def having(self, json): 426 | import ipdb; ipdb.set_trace() 427 | if 'having' in json: 428 | return 'HAVING {0}'.format(self.dispatch(json['having'])) 429 | 430 | def orderby(self, json): 431 | import ipdb; ipdb.set_trace() 432 | if 'orderby' in json: 433 | orderby = json['orderby'] 434 | if isinstance(orderby, dict): 435 | orderby = [orderby] 436 | return 'ORDER BY {0}'.format(','.join([ 437 | '{0} {1}'.format(self.dispatch(o), o.get('sort', '').upper()).strip() 438 | for o in orderby 439 | ])) 440 | 441 | def limit(self, json): 442 | if 'limit' in json: 443 | if json['limit']: 444 | return 'LIMIT {0}'.format(self.dispatch(json['limit'])) 445 | 446 | def offset(self, json): 447 | if 'offset' in json: 448 | return 'OFFSET {0}'.format(self.dispatch(json['offset'])) 449 | 450 | -------------------------------------------------------------------------------- /sql_parser.py: -------------------------------------------------------------------------------- 1 | # bigquery_view_parser.py 2 | # 3 | # A parser to extract table names from BigQuery view definitions. 
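# --- Hedged usage note (not part of the original file). The parse actions in
# this module record every table reference and WITH alias on the class-level
# sets `_table_identifiers` and `_with_aliases`, padding each identifier to a
# (project, dataset, table) 3-tuple with None. Roughly:
#
#     parser = BigQueryViewParser()
#     parser._parse("with a as (select 1 from dataset.tbl) select * from a")
#     BigQueryViewParser._table_identifiers
#     # expected: {(None, 'dataset', 'tbl'), (None, None, 'a')}
#     BigQueryViewParser._with_aliases
#     # expected: {(None, None, 'a')}
#
# (`get_table_names` unpacks the value returned by `_parse`, but `_parse` now
# returns the raw pyparsing ParseResults rather than these two sets, so the
# class attributes are the more reliable place to look.)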
4 | # This is based on the `select_parser.py` sample in pyparsing: 5 | # https://github.com/pyparsing/pyparsing/blob/master/examples/select_parser.py 6 | # 7 | # Michael Smedberg 8 | # 9 | 10 | from pyparsing import ParserElement, Suppress, Forward, CaselessKeyword 11 | from pyparsing import MatchFirst, alphas, alphanums, Combine, Word, Literal, White, Empty 12 | from pyparsing import QuotedString, CharsNotIn, Optional, Group, ZeroOrMore, NoMatch 13 | from pyparsing import oneOf, delimitedList, restOfLine, cStyleComment 14 | from pyparsing import infixNotation, opAssoc, OneOrMore, Regex, nums 15 | 16 | def debug(s, i, toks): 17 | if len(toks) > 0: 18 | #import ipdb; ipdb.set_trace() 19 | pass 20 | 21 | 22 | class SemanticToken(object): 23 | def __iter__(self): 24 | return (i for i in self.tokens) 25 | 26 | def __len__(self): 27 | return len(self.tokens) 28 | 29 | 30 | 31 | 32 | class BigQueryViewParser: 33 | """Parser to extract table info from BigQuery view definitions""" 34 | 35 | _parser = None 36 | _table_identifiers = set() 37 | _with_aliases = set() 38 | 39 | def get_table_names(self, sql_stmt): 40 | table_identifiers, with_aliases = self._parse(sql_stmt) 41 | 42 | # Table names and alias names might differ by case, but that's not 43 | # relevant- aliases are not case sensitive 44 | lower_aliases = BigQueryViewParser.lowercase_set_of_tuples(with_aliases) 45 | tables = { 46 | x 47 | for x in table_identifiers 48 | if not BigQueryViewParser.lowercase_of_tuple(x) in lower_aliases 49 | } 50 | 51 | # Table names ARE case sensitive as described at 52 | # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 53 | # return tables 54 | return table_identifiers, with_aliases 55 | 56 | def _parse(self, sql_stmt): 57 | BigQueryViewParser._table_identifiers.clear() 58 | BigQueryViewParser._with_aliases.clear() 59 | res = BigQueryViewParser._get_parser().parseString(sql_stmt, parseAll=True) 60 | 61 | def kk(v): 62 | print(list(v.keys())) 63 | 64 | return res 65 | 66 | @classmethod 67 | def lowercase_of_tuple(cls, tuple_to_lowercase): 68 | return tuple(x.lower() if x else None for x in tuple_to_lowercase) 69 | 70 | @classmethod 71 | def lowercase_set_of_tuples(cls, set_of_tuples): 72 | return {BigQueryViewParser.lowercase_of_tuple(x) for x in set_of_tuples} 73 | 74 | @classmethod 75 | def _get_parser(cls): 76 | if cls._parser is not None: 77 | return cls._parser 78 | 79 | ParserElement.enablePackrat() 80 | 81 | LPAR, RPAR, COMMA, LBRACKET, RBRACKET, LT, GT = map(Literal, "(),[]<>") 82 | ungrouped_select_stmt = Forward().setName("select statement") 83 | 84 | # keywords 85 | ( 86 | UNION, 87 | ALL, 88 | AND, 89 | INTERSECT, 90 | EXCEPT, 91 | COLLATE, 92 | ASC, 93 | DESC, 94 | ON, 95 | USING, 96 | NATURAL, 97 | INNER, 98 | CROSS, 99 | LEFT, 100 | RIGHT, 101 | OUTER, 102 | FULL, 103 | JOIN, 104 | AS, 105 | INDEXED, 106 | NOT, 107 | SELECT, 108 | DISTINCT, 109 | FROM, 110 | WHERE, 111 | GROUP, 112 | BY, 113 | HAVING, 114 | ORDER, 115 | BY, 116 | LIMIT, 117 | OFFSET, 118 | OR, 119 | CAST, 120 | ISNULL, 121 | NOTNULL, 122 | NULL, 123 | IS, 124 | BETWEEN, 125 | ELSE, 126 | END, 127 | CASE, 128 | WHEN, 129 | THEN, 130 | EXISTS, 131 | COLLATE, 132 | IN, 133 | LIKE, 134 | GLOB, 135 | REGEXP, 136 | MATCH, 137 | ESCAPE, 138 | CURRENT_TIME, 139 | CURRENT_DATE, 140 | CURRENT_TIMESTAMP, 141 | WITH, 142 | EXTRACT, 143 | PARTITION, 144 | ROWS, 145 | RANGE, 146 | UNBOUNDED, 147 | PRECEDING, 148 | CURRENT, 149 | ROW, 150 | FOLLOWING, 151 | OVER, 152 | INTERVAL, 153 | DATE_ADD, 154 | 
DATE_SUB, 155 | ADDDATE, 156 | SUBDATE, 157 | REGEXP_EXTRACT, 158 | SPLIT, 159 | ORDINAL, 160 | FIRST_VALUE, 161 | LAST_VALUE, 162 | NTH_VALUE, 163 | LEAD, 164 | LAG, 165 | PERCENTILE_CONT, 166 | PRECENTILE_DISC, 167 | RANK, 168 | DENSE_RANK, 169 | PERCENT_RANK, 170 | CUME_DIST, 171 | NTILE, 172 | ROW_NUMBER, 173 | DATE, 174 | TIME, 175 | DATETIME, 176 | TIMESTAMP, 177 | UNNEST, 178 | INT64, 179 | NUMERIC, 180 | FLOAT64, 181 | BOOL, 182 | BYTES, 183 | GEOGRAPHY, 184 | ARRAY, 185 | STRUCT, 186 | SAFE_CAST, 187 | ANY_VALUE, 188 | ARRAY_AGG, 189 | ARRAY_CONCAT_AGG, 190 | AVG, 191 | BIT_AND, 192 | BIT_OR, 193 | BIT_XOR, 194 | COUNT, 195 | COUNTIF, 196 | LOGICAL_AND, 197 | LOGICAL_OR, 198 | MAX, 199 | MIN, 200 | STRING_AGG, 201 | SUM, 202 | CORR, 203 | COVAR_POP, 204 | COVAR_SAMP, 205 | STDDEV_POP, 206 | STDDEV_SAMP, 207 | STDDEV, 208 | VAR_POP, 209 | VAR_SAMP, 210 | VARIANCE, 211 | TIMESTAMP_ADD, 212 | TIMESTAMP_SUB, 213 | GENERATE_ARRAY, 214 | GENERATE_DATE_ARRAY, 215 | GENERATE_TIMESTAMP_ARRAY, 216 | FOR, 217 | SYSTEMTIME, 218 | AS, 219 | OF, 220 | WINDOW, 221 | RESPECT, 222 | IGNORE, 223 | NULLS, 224 | ) = map( 225 | CaselessKeyword, 226 | """ 227 | UNION, ALL, AND, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING, 228 | NATURAL, INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED, 229 | NOT, SELECT, DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY, 230 | LIMIT, OFFSET, OR, CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE, 231 | END, CASE, WHEN, THEN, EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP, 232 | MATCH, ESCAPE, CURRENT_TIME, CURRENT_DATE, CURRENT_TIMESTAMP, WITH, 233 | EXTRACT, PARTITION, ROWS, RANGE, UNBOUNDED, PRECEDING, CURRENT, 234 | ROW, FOLLOWING, OVER, INTERVAL, DATE_ADD, DATE_SUB, ADDDATE, 235 | SUBDATE, REGEXP_EXTRACT, SPLIT, ORDINAL, FIRST_VALUE, LAST_VALUE, 236 | NTH_VALUE, LEAD, LAG, PERCENTILE_CONT, PRECENTILE_DISC, RANK, 237 | DENSE_RANK, PERCENT_RANK, CUME_DIST, NTILE, ROW_NUMBER, DATE, TIME, 238 | DATETIME, TIMESTAMP, UNNEST, INT64, NUMERIC, FLOAT64, BOOL, BYTES, 239 | GEOGRAPHY, ARRAY, STRUCT, SAFE_CAST, ANY_VALUE, ARRAY_AGG, 240 | ARRAY_CONCAT_AGG, AVG, BIT_AND, BIT_OR, BIT_XOR, COUNT, COUNTIF, 241 | LOGICAL_AND, LOGICAL_OR, MAX, MIN, STRING_AGG, SUM, CORR, 242 | COVAR_POP, COVAR_SAMP, STDDEV_POP, STDDEV_SAMP, STDDEV, VAR_POP, 243 | VAR_SAMP, VARIANCE, TIMESTAMP_ADD, TIMESTAMP_SUB, GENERATE_ARRAY, 244 | GENERATE_DATE_ARRAY, GENERATE_TIMESTAMP_ARRAY, FOR, SYSTEMTIME, AS, 245 | OF, WINDOW, RESPECT, IGNORE, NULLS 246 | """.replace( 247 | ",", "" 248 | ).split(), 249 | ) 250 | 251 | keyword_nonfunctions = MatchFirst( 252 | ( 253 | UNION, 254 | ALL, 255 | INTERSECT, 256 | EXCEPT, 257 | COLLATE, 258 | ASC, 259 | DESC, 260 | ON, 261 | USING, 262 | NATURAL, 263 | INNER, 264 | CROSS, 265 | LEFT, 266 | RIGHT, 267 | OUTER, 268 | FULL, 269 | JOIN, 270 | AS, 271 | INDEXED, 272 | NOT, 273 | SELECT, 274 | DISTINCT, 275 | FROM, 276 | WHERE, 277 | GROUP, 278 | BY, 279 | HAVING, 280 | ORDER, 281 | BY, 282 | LIMIT, 283 | OFFSET, 284 | CAST, 285 | ISNULL, 286 | NOTNULL, 287 | NULL, 288 | IS, 289 | BETWEEN, 290 | ELSE, 291 | END, 292 | CASE, 293 | WHEN, 294 | THEN, 295 | EXISTS, 296 | COLLATE, 297 | IN, 298 | LIKE, 299 | GLOB, 300 | REGEXP, 301 | MATCH, 302 | STRUCT, 303 | WINDOW, 304 | ) 305 | ) 306 | 307 | keyword = keyword_nonfunctions | MatchFirst( 308 | ( 309 | ESCAPE, 310 | CURRENT_TIME, 311 | CURRENT_DATE, 312 | CURRENT_TIMESTAMP, 313 | DATE_ADD, 314 | DATE_SUB, 315 | ADDDATE, 316 | SUBDATE, 317 | INTERVAL, 318 | STRING_AGG, 319 | REGEXP_EXTRACT, 320 | SPLIT, 321 | ORDINAL, 322 
| UNNEST, 323 | SAFE_CAST, 324 | PARTITION, 325 | TIMESTAMP_ADD, 326 | TIMESTAMP_SUB, 327 | ARRAY, 328 | GENERATE_ARRAY, 329 | GENERATE_DATE_ARRAY, 330 | GENERATE_TIMESTAMP_ARRAY, 331 | ) 332 | ) 333 | 334 | identifier_word = Word(alphas + "_@#", alphanums + "@$#_") 335 | identifier = ~keyword + identifier_word.copy() 336 | collation_name = identifier.copy() 337 | # NOTE: Column names can be keywords. Doc says they cannot, but in practice it seems to work. 338 | column_name = identifier.copy() 339 | cast_to = identifier.copy() 340 | qualified_column_name = Group( 341 | delimitedList(column_name, delim=".") 342 | + Optional( 343 | Suppress("::") 344 | + delimitedList(cast_to("cast"), delim="::") 345 | ) 346 | ) 347 | # NOTE: As with column names, column aliases can be keywords, e.g. functions like `current_time`. Other 348 | # keywords, e.g. `from` make parsing pretty difficult (e.g. "SELECT a from from b" is confusing.) 349 | column_alias = ~keyword_nonfunctions + column_name.copy() 350 | table_name = identifier.copy() 351 | table_alias = identifier.copy() 352 | index_name = identifier.copy() 353 | function_name = identifier.copy() 354 | parameter_name = identifier.copy() 355 | # NOTE: The expression in a CASE statement can be an integer. E.g. this is valid SQL: 356 | # select CASE 1 WHEN 1 THEN -1 ELSE -2 END from test_table 357 | unquoted_case_identifier = ~keyword + Word(alphanums + "$_") 358 | quoted_case_identifier = ~keyword + ( 359 | QuotedString('"') ^ Suppress("`") + CharsNotIn("`") + Suppress("`") 360 | ) 361 | case_identifier = quoted_case_identifier | unquoted_case_identifier 362 | case_expr = ( 363 | Optional(case_identifier + Suppress(".")) 364 | + Optional(case_identifier + Suppress(".")) 365 | + case_identifier 366 | ) 367 | 368 | # expression 369 | expr = Forward().setName("expression") 370 | 371 | integer = Regex(r"[+-]?\d+") 372 | numeric_literal = Regex(r"[+-]?\d*\.?\d+([eE][+-]?\d+)?") 373 | string_literal = QuotedString("'") | QuotedString('"') | QuotedString("`") 374 | regex_literal = "r" + string_literal 375 | blob_literal = Regex(r"[xX]'[0-9A-Fa-f]+'") 376 | date_or_time_literal = (DATE | TIME | DATETIME | TIMESTAMP) + string_literal 377 | literal_value = ( 378 | numeric_literal 379 | | string_literal 380 | | regex_literal 381 | | blob_literal 382 | | date_or_time_literal 383 | | NULL 384 | | CURRENT_TIME + Optional(LPAR + Optional(string_literal) + RPAR) 385 | | CURRENT_DATE + Optional(LPAR + Optional(string_literal) + RPAR) 386 | | CURRENT_TIMESTAMP + Optional(LPAR + Optional(string_literal) + RPAR) 387 | ) 388 | bind_parameter = Word("?", nums) | Combine(oneOf(": @ $") + parameter_name) 389 | type_name = oneOf( 390 | """TEXT REAL INTEGER BLOB NULL TIMESTAMP STRING DATE 391 | INT64 NUMERIC FLOAT64 BOOL BYTES DATETIME GEOGRAPHY TIME ARRAY 392 | STRUCT""", 393 | caseless=True, 394 | ) 395 | date_part = oneOf( 396 | """DAY DAY_HOUR DAY_MICROSECOND DAY_MINUTE DAY_SECOND 397 | HOUR HOUR_MICROSECOND HOUR_MINUTE HOUR_SECOND MICROSECOND MINUTE 398 | MINUTE_MICROSECOND MINUTE_SECOND MONTH QUARTER SECOND 399 | SECOND_MICROSECOND WEEK YEAR YEAR_MONTH""", 400 | caseless=True, 401 | ) 402 | datetime_operators = ( 403 | DATE_ADD | DATE_SUB | ADDDATE | SUBDATE | TIMESTAMP_ADD | TIMESTAMP_SUB 404 | ) 405 | 406 | def invalid_date_add(s, loc, tokens): 407 | prev_newline = s[:loc].rfind('\n') 408 | prev_prev_newline = s[:prev_newline].rfind('\n') 409 | if '--ignore' in s[prev_prev_newline:prev_newline]: 410 | pass 411 | else: 412 | raise RuntimeError("{} is not valid, did you mean 
'date_add'".format(tokens[0])) 413 | 414 | #bad_datetime_operators = ( 415 | # CaselessKeyword('dateadd').setParseAction(invalid_date_add) 416 | #) 417 | 418 | grouping_term = expr.copy() 419 | ordering_term = Group( 420 | expr("order_key") 421 | + Optional(COLLATE + collation_name("collate")) 422 | + Optional(ASC | DESC)("direction") 423 | )("ordering_term") 424 | 425 | function_arg = expr.copy()("function_arg") 426 | function_args = Optional( 427 | "*" 428 | | Optional(DISTINCT) 429 | + delimitedList(function_arg) 430 | + Optional((RESPECT | IGNORE) + NULLS) 431 | )("function_args") 432 | function_call = ( 433 | (function_name | keyword)("function_name") 434 | + LPAR 435 | + Group(function_args)("function_args_group") 436 | + RPAR 437 | )('function') 438 | 439 | navigation_function_name = ( 440 | FIRST_VALUE 441 | | LAST_VALUE 442 | | NTH_VALUE 443 | | LEAD 444 | | LAG 445 | | PERCENTILE_CONT 446 | | PRECENTILE_DISC 447 | ) 448 | aggregate_function_name = ( 449 | ANY_VALUE 450 | | ARRAY_AGG 451 | | ARRAY_CONCAT_AGG 452 | | AVG 453 | | BIT_AND 454 | | BIT_OR 455 | | BIT_XOR 456 | | COUNT 457 | | COUNTIF 458 | | LOGICAL_AND 459 | | LOGICAL_OR 460 | | MAX 461 | | MIN 462 | | STRING_AGG 463 | | SUM 464 | ) 465 | statistical_aggregate_function_name = ( 466 | CORR 467 | | COVAR_POP 468 | | COVAR_SAMP 469 | | STDDEV_POP 470 | | STDDEV_SAMP 471 | | STDDEV 472 | | VAR_POP 473 | | VAR_SAMP 474 | | VARIANCE 475 | ) 476 | numbering_function_name = ( 477 | RANK | DENSE_RANK | PERCENT_RANK | CUME_DIST | NTILE | ROW_NUMBER 478 | ) 479 | analytic_function_name = ( 480 | navigation_function_name 481 | | aggregate_function_name 482 | | statistical_aggregate_function_name 483 | | numbering_function_name 484 | )("analytic_function_name") 485 | partition_expression_list = delimitedList(grouping_term)( 486 | "partition_expression_list" 487 | ) 488 | window_frame_boundary_start = ( 489 | UNBOUNDED + PRECEDING 490 | | numeric_literal + (PRECEDING | FOLLOWING) 491 | | CURRENT + ROW 492 | ) 493 | window_frame_boundary_end = ( 494 | UNBOUNDED + FOLLOWING 495 | | numeric_literal + (PRECEDING | FOLLOWING) 496 | | CURRENT + ROW 497 | ) 498 | window_frame_clause = (ROWS | RANGE) + ( 499 | ((UNBOUNDED + PRECEDING) | (numeric_literal + PRECEDING) | (CURRENT + ROW)) 500 | | (BETWEEN + window_frame_boundary_start + AND + window_frame_boundary_end) 501 | ) 502 | window_name = identifier.copy()("window_name") 503 | window_specification = ( 504 | Optional(window_name) 505 | + Optional(PARTITION + BY + partition_expression_list) 506 | + Optional(ORDER + BY + delimitedList(ordering_term)) 507 | + Optional(window_frame_clause)("window_specification") 508 | ) 509 | analytic_function = ( 510 | analytic_function_name 511 | + LPAR 512 | + function_args.setParseAction(debug) 513 | + RPAR 514 | + OVER 515 | + (window_name | LPAR + Optional(window_specification)('window') + RPAR) 516 | )("analytic_function") 517 | 518 | string_agg_term = ( 519 | STRING_AGG 520 | + LPAR 521 | + Optional(DISTINCT)('has_distinct') 522 | + expr('string_agg_expr') 523 | + Optional(COMMA + string_literal('delimiter')) 524 | + Optional( 525 | ORDER + BY + expr + Optional(ASC | DESC) + Optional(LIMIT + integer) 526 | ) 527 | + RPAR 528 | )("string_agg") 529 | array_literal = ( 530 | Optional(ARRAY + Optional(LT + delimitedList(type_name) + GT)) 531 | + LBRACKET 532 | + delimitedList(expr) 533 | + RBRACKET 534 | ) 535 | interval = INTERVAL + expr + date_part 536 | array_generator = ( 537 | GENERATE_ARRAY 538 | + LPAR 539 | + numeric_literal 540 | + COMMA 541 | 
+ numeric_literal 542 | + COMMA 543 | + numeric_literal 544 | + RPAR 545 | ) 546 | date_array_generator = ( 547 | (GENERATE_DATE_ARRAY | GENERATE_TIMESTAMP_ARRAY) 548 | + LPAR 549 | + expr("start_date") 550 | + COMMA 551 | + expr("end_date") 552 | + Optional(COMMA + interval) 553 | + RPAR 554 | ) 555 | 556 | explicit_struct = ( 557 | STRUCT 558 | + Optional(LT + delimitedList(type_name) + GT) 559 | + LPAR 560 | + Optional(delimitedList(expr + Optional(AS + identifier))) 561 | + RPAR 562 | ) 563 | 564 | case_when = WHEN + expr.copy()("when") 565 | case_then = THEN + expr.copy()("then") 566 | case_clauses = Group(ZeroOrMore(case_when + case_then)) 567 | case_else = ELSE + expr.copy()("_else") 568 | case_stmt = ( 569 | CASE 570 | + Optional(case_expr.copy()) 571 | + case_clauses("case_clauses") 572 | + Optional(case_else) 573 | + END 574 | )("case") 575 | 576 | class SelectStatement(SemanticToken): 577 | def __init__(self, tokens): 578 | self.tokens = tokens 579 | 580 | def getName(self): 581 | return 'select' 582 | 583 | @classmethod 584 | def parse(cls, tokens): 585 | return SelectStatement(tokens) 586 | 587 | class Function(SemanticToken): 588 | def __init__(self, func, tokens): 589 | self.func = func 590 | self.tokens = tokens 591 | 592 | def getName(self): 593 | return 'function' 594 | 595 | @classmethod 596 | def parse(cls, tokens): 597 | method = tokens[0] 598 | args = tokens[2:-1] 599 | return Function(method, args) 600 | 601 | def __repr__(self): 602 | return "func:{}({})".format(self.func, self.tokens) 603 | 604 | 605 | class WindowFunction(Function): 606 | def __init__(self, func, tokens, func_args, partition_args, order_args, window_args): 607 | self.func = func 608 | self.tokens = tokens 609 | self.func_args = func_args 610 | self.partition_args = partition_args 611 | self.order_args = order_args 612 | self.window_args = window_args 613 | 614 | def getName(self): 615 | return 'window function' 616 | 617 | @classmethod 618 | def parse(cls, tokens): 619 | return WindowFunction( 620 | tokens.analytic_function_name, 621 | tokens, 622 | tokens.function_args, 623 | tokens.partition_expression_list, 624 | tokens.ordering_term, 625 | tokens.window_specification 626 | ) 627 | 628 | def __repr__(self): 629 | return "window:{}({})over({}, {}, {})".format(self.func, self.func_args, self.partition_args, self.order_args, self.window_args) 630 | 631 | class CaseStatement(SemanticToken): 632 | def __init__(self, tokens, whens, _else): 633 | self.tokens = tokens 634 | self.whens = whens 635 | self._else = _else 636 | 637 | def getName(self): 638 | return 'case' 639 | 640 | @classmethod 641 | def parse_whens(self, tokens): 642 | whens = [] 643 | while len(tokens) > 0: 644 | _, when, _, then, *tokens = tokens 645 | whens.append({"when": when, "then": then}) 646 | return whens 647 | 648 | @classmethod 649 | def parse(cls, tokens): 650 | whens = tokens[1] 651 | _else = tokens[3] 652 | return CaseStatement( 653 | tokens, 654 | cls.parse_whens(whens), 655 | _else 656 | ) 657 | 658 | def __repr__(self): 659 | return "".format(len(self.whens), self._else) 660 | 661 | expr_term = ( 662 | (analytic_function)("analytic_function").setParseAction(WindowFunction.parse) 663 | | (CAST + LPAR + expr + AS + type_name + RPAR)("cast") 664 | | (SAFE_CAST + LPAR + expr + AS + type_name + RPAR)("safe_cast") 665 | | (Optional(EXISTS) + LPAR + ungrouped_select_stmt + RPAR)("subselect") 666 | | (literal_value)("literal") 667 | | (bind_parameter)("bind_parameter") 668 | | (EXTRACT + LPAR + expr + FROM + expr + 
RPAR)("extract") 669 | | case_stmt.setParseAction(CaseStatement.parse) 670 | | (datetime_operators + LPAR + expr + COMMA + interval + RPAR)( 671 | "date_operation" 672 | ) 673 | #| (bad_datetime_operators + LPAR + expr + COMMA + interval + RPAR) 674 | | string_agg_term("string_agg_term") 675 | | array_literal("array_literal") 676 | | array_generator("array_generator") 677 | | date_array_generator("date_array_generator") 678 | | explicit_struct("explicit_struct") 679 | | function_call("function_call").setParseAction(Function.parse) 680 | | qualified_column_name("column").setParseAction(lambda x: ".".join([str(i) for i in x[0]])) 681 | ).setParseAction(debug) + Optional(LBRACKET + (OFFSET | ORDINAL) + LPAR + expr + RPAR + RBRACKET)( 682 | "offset_ordinal" 683 | ) 684 | 685 | struct_term = (LPAR + delimitedList(expr_term) + RPAR) 686 | 687 | KNOWN_OPS = [ 688 | (BETWEEN, AND), 689 | Literal("||").setName("concat"), 690 | Literal("*").setName("mul"), 691 | Literal("/").setName("div"), 692 | Literal("+").setName("add"), 693 | Literal("-").setName("sub"), 694 | Literal("<>").setName("neq"), 695 | Literal(">").setName("gt"), 696 | Literal("<").setName("lt"), 697 | Literal(">=").setName("gte"), 698 | Literal("<=").setName("lte"), 699 | Literal("=").setName("eq"), 700 | Literal("==").setName("eq"), 701 | Literal("!=").setName("neq"), 702 | IN.setName("in"), 703 | IS.setName("is"), 704 | LIKE.setName("like"), 705 | OR.setName("or"), 706 | AND.setName("and"), 707 | 708 | NOT.setName('not') 709 | ] 710 | 711 | class Operator(SemanticToken): 712 | def __init__(self, op, assoc, name, tokens): 713 | self.op = op 714 | self.assoc = assoc 715 | self.name = name 716 | self.tokens = tokens 717 | 718 | def getName(self): 719 | return 'operator' 720 | 721 | @classmethod 722 | def parse(cls, tokens): 723 | # ARRANGE INTO {op: params} FORMAT 724 | toks = tokens[0] 725 | if toks[1] in KNOWN_OPS: 726 | op = KNOWN_OPS[KNOWN_OPS.index(toks[1])] 727 | if toks.subselect: 728 | import ipdb; ipdb.set_trace() 729 | return Operator(op, 'binary', op.name, [toks[0], toks[2:]]) 730 | else: 731 | import ipdb; ipdb.set_trace() 732 | return tokens 733 | 734 | @classmethod 735 | def parse_unary(cls, tokens): 736 | toks = tokens[0] 737 | if toks[0] in KNOWN_OPS: 738 | op = KNOWN_OPS[KNOWN_OPS.index(toks[0])] 739 | else: 740 | import ipdb; ipdb.set_trace() 741 | return Operator(op, 'unary', op.name, [toks[1:]]) 742 | 743 | @classmethod 744 | def parse_ternary(cls, tokens): 745 | import ipdb; ipdb.set_trace() 746 | 747 | def __repr__(self): 748 | return "".format(self.op, self.assoc, self.tokens) 749 | 750 | UNARY, BINARY, TERNARY = 1, 2, 3 751 | expr << infixNotation( 752 | (expr_term | struct_term), 753 | [ 754 | (oneOf("- + ~") | NOT, UNARY, opAssoc.RIGHT, Operator.parse_unary), 755 | (ISNULL | NOTNULL | NOT + NULL, UNARY, opAssoc.LEFT, Operator.parse_unary), 756 | ("||", BINARY, opAssoc.LEFT, Operator.parse), 757 | (oneOf("* / %"), BINARY, opAssoc.LEFT, Operator.parse), 758 | (oneOf("+ -"), BINARY, opAssoc.LEFT, Operator.parse), 759 | (oneOf("<< >> & |"), BINARY, opAssoc.LEFT, Operator.parse), 760 | (oneOf("= > < >= <= <> != !< !>"), BINARY, opAssoc.LEFT, Operator.parse), 761 | ( 762 | IS + Optional(NOT) 763 | | Optional(NOT) + IN 764 | | Optional(NOT) + LIKE 765 | | GLOB 766 | | MATCH 767 | | REGEXP, 768 | BINARY, 769 | opAssoc.LEFT, 770 | Operator.parse 771 | ), 772 | ((BETWEEN, AND), TERNARY, opAssoc.LEFT, Operator.parse_ternary), 773 | ( 774 | Optional(NOT) 775 | + IN 776 | + LPAR 777 | + Group(ungrouped_select_stmt | 
delimitedList(expr)) 778 | + RPAR, 779 | UNARY, 780 | opAssoc.LEFT, 781 | Operator.parse_unary 782 | ), 783 | (AND, BINARY, opAssoc.LEFT, Operator.parse), 784 | (OR, BINARY, opAssoc.LEFT, Operator.parse), 785 | ], 786 | lpar=Literal('('), 787 | rpar=Literal(')'), 788 | ) 789 | quoted_expr = ( 790 | expr 791 | ^ Suppress('"') + expr + Suppress('"') 792 | ^ Suppress("'") + expr + Suppress("'") 793 | ^ Suppress("`") + expr + Suppress("`") 794 | )("quoted_expr") 795 | 796 | compound_operator = ( 797 | UNION + Optional(ALL | DISTINCT) 798 | | INTERSECT + DISTINCT 799 | | EXCEPT + DISTINCT 800 | | INTERSECT 801 | | EXCEPT 802 | )("compound_operator") 803 | 804 | join_constraint = Group( 805 | Optional( 806 | ON + expr 807 | | USING + LPAR + Group(delimitedList(qualified_column_name)) + RPAR 808 | ) 809 | )("join_constraint") 810 | 811 | join_op = ( 812 | COMMA 813 | | Group( 814 | Optional(NATURAL) 815 | + Optional( 816 | INNER 817 | | CROSS 818 | | LEFT + OUTER 819 | | LEFT 820 | | RIGHT + OUTER 821 | | RIGHT 822 | | FULL + OUTER 823 | | OUTER 824 | | FULL 825 | ) 826 | + JOIN 827 | ) 828 | )("join_op") 829 | 830 | join_source = Forward() 831 | 832 | # We support three kinds of table identifiers. 833 | # 834 | # First, dot delimited info like project.dataset.table, where 835 | # each component follows the rules described in the BigQuery 836 | # docs, namely: 837 | # Contain letters (upper or lower case), numbers, and underscores 838 | # 839 | # Second, a dot delimited quoted string. Since it's quoted, we'll be 840 | # liberal w.r.t. what characters we allow. E.g.: 841 | # `project.dataset.name-with-dashes` 842 | # 843 | # Third, a series of quoted strings, delimited by dots, e.g.: 844 | # `project`.`dataset`.`name-with-dashes` 845 | # 846 | # We won't attempt to support combinations, like: 847 | # project.dataset.`name-with-dashes` 848 | # `project`.`dataset.name-with-dashes` 849 | 850 | def record_table_identifier(t): 851 | identifier_list = t.asList() 852 | padded_list = [None] * (3 - len(identifier_list)) + identifier_list 853 | cls._table_identifiers.add(tuple(padded_list)) 854 | 855 | standard_table_part = ~keyword + Word(alphanums + "_") 856 | standard_table_identifier = ( 857 | Optional(standard_table_part("project") + Suppress(".")) 858 | + Optional(standard_table_part("dataset") + Suppress(".")) 859 | + standard_table_part("table") 860 | ).setParseAction(lambda t: record_table_identifier(t)) 861 | 862 | quoted_project_part = ( 863 | Suppress('"') + CharsNotIn('"') + Suppress('"') 864 | | Suppress("'") + CharsNotIn("'") + Suppress("'") 865 | | Suppress("`") + CharsNotIn("`") + Suppress("`") 866 | ) 867 | quoted_table_part = ( 868 | Suppress('"') + CharsNotIn('".') + Suppress('"') 869 | | Suppress("'") + CharsNotIn("'.") + Suppress("'") 870 | | Suppress("`") + CharsNotIn("`.") + Suppress("`") 871 | ) 872 | quoted_table_parts_identifier = ( 873 | Optional(quoted_project_part("project") + Suppress(".")) 874 | + Optional(quoted_table_part("dataset") + Suppress(".")) 875 | + quoted_table_part("table") 876 | ).setParseAction(lambda t: record_table_identifier(t)) 877 | 878 | def record_quoted_table_identifier(t): 879 | identifier_list = t.asList()[0].split(".") 880 | first = ".".join(identifier_list[0:-2]) or None 881 | second = identifier_list[-2] 882 | third = identifier_list[-1] 883 | identifier_list = [first, second, third] 884 | padded_list = [None] * (3 - len(identifier_list)) + identifier_list 885 | cls._table_identifiers.add(tuple(padded_list)) 886 | 887 | 
quotable_table_parts_identifier = ( 888 | Suppress('"') + CharsNotIn('"') + Suppress('"') 889 | | Suppress("'") + CharsNotIn("'") + Suppress("'") 890 | | Suppress("`") + CharsNotIn("`") + Suppress("`") 891 | ).setParseAction(lambda t: record_quoted_table_identifier(t)) 892 | 893 | table_identifier = ( 894 | standard_table_identifier 895 | | quoted_table_parts_identifier 896 | | quotable_table_parts_identifier 897 | ) 898 | 899 | def record_ref(t): 900 | lol = [t.op] + t.ref_target.asList() 901 | cls._with_aliases.add(tuple(lol)) 902 | cls._table_identifiers.add(tuple(lol)) 903 | 904 | ref_target = identifier.copy() 905 | single_source = ( 906 | # ref + source statements 907 | ( 908 | ( 909 | Suppress('{{') 910 | + (CaselessKeyword('ref') | CaselessKeyword("source"))("op") 911 | + LPAR 912 | + delimitedList( 913 | (Suppress("'") | Suppress('"')) 914 | + ref_target 915 | + (Suppress("'") | Suppress('"')) 916 | )("ref_target") 917 | + RPAR 918 | + Suppress("}}") 919 | ).setParseAction(record_ref) 920 | | table_identifier 921 | ) 922 | + Optional(Optional(AS) + table_alias("table_alias*")) 923 | + Optional(FOR + SYSTEMTIME + AS + OF + string_literal) 924 | + Optional(INDEXED + BY + index_name("name") | NOT + INDEXED)("index") 925 | | ( 926 | LPAR 927 | + ungrouped_select_stmt 928 | + RPAR 929 | + Optional(Optional(AS) + table_alias) 930 | )('subquery') 931 | | (LPAR + join_source + RPAR) 932 | | (UNNEST + LPAR + expr + RPAR) + Optional(Optional(AS) + column_alias) 933 | ) 934 | 935 | join_source << ( 936 | Group(single_source + OneOrMore(Group(join_op + single_source + join_constraint)('joins*'))) 937 | | single_source 938 | )('sources*') 939 | 940 | over_partition = (PARTITION + BY + delimitedList(partition_expression_list))( 941 | "over_partition" 942 | ) 943 | over_order = ORDER + BY + delimitedList(ordering_term) 944 | over_unsigned_value_specification = expr 945 | over_window_frame_preceding = ( 946 | UNBOUNDED + PRECEDING 947 | | over_unsigned_value_specification + PRECEDING 948 | | CURRENT + ROW 949 | ) 950 | over_window_frame_following = ( 951 | UNBOUNDED + FOLLOWING 952 | | over_unsigned_value_specification + FOLLOWING 953 | | CURRENT + ROW 954 | ) 955 | over_window_frame_bound = ( 956 | over_window_frame_preceding | over_window_frame_following 957 | ) 958 | over_window_frame_between = ( 959 | BETWEEN + over_window_frame_bound + AND + over_window_frame_bound 960 | ) 961 | over_window_frame_extent = ( 962 | over_window_frame_preceding | over_window_frame_between 963 | ) 964 | over_row_or_range = (ROWS | RANGE) + over_window_frame_extent 965 | over = ( 966 | OVER 967 | + LPAR 968 | + Optional(over_partition) 969 | + Optional(over_order) 970 | + Optional(over_row_or_range) 971 | + RPAR 972 | )("over") 973 | 974 | 975 | result_column = ( 976 | Optional(table_name + ".") 977 | + "*" 978 | + Optional( 979 | EXCEPT 980 | + LPAR 981 | + delimitedList(column_name) 982 | + RPAR 983 | ) | Group(quoted_expr + Optional(over) + Optional(Optional(AS) + column_alias('alias'))) 984 | ) 985 | 986 | window_select_clause = ( 987 | WINDOW + identifier + AS + LPAR + window_specification + RPAR 988 | ) 989 | 990 | select_core = ( 991 | SELECT 992 | + Optional(DISTINCT | ALL) 993 | + Group(delimitedList(result_column))("columns") 994 | + Optional(FROM - join_source("from*")) 995 | + Optional(WHERE + expr('where')) 996 | + Optional( 997 | GROUP + BY + Group(delimitedList(grouping_term))("group_by_terms") 998 | ) 999 | + Optional(HAVING + expr("having_expr")) 1000 | + Optional( 1001 | ORDER + BY + 
Group(delimitedList(ordering_term))("order_by_terms") 1002 | ) 1003 | + Optional(delimitedList(window_select_clause)) 1004 | ) 1005 | grouped_select_core = select_core | (LPAR + select_core + RPAR) 1006 | 1007 | ungrouped_select_stmt << ( 1008 | grouped_select_core 1009 | + ZeroOrMore(compound_operator + grouped_select_core) 1010 | + Optional( 1011 | LIMIT 1012 | + (Group(expr + OFFSET + expr) | Group(expr + COMMA + expr) | expr)( 1013 | "limit" 1014 | ) 1015 | ) 1016 | )("select") 1017 | select_stmt = ungrouped_select_stmt | (LPAR + ungrouped_select_stmt + RPAR) 1018 | 1019 | # define comment format, and ignore them 1020 | sql_comment = oneOf("-- #") + restOfLine | cStyleComment 1021 | select_stmt.ignore(sql_comment) 1022 | 1023 | def record_with_alias(t): 1024 | identifier_list = t.asList() 1025 | padded_list = [None] * (3 - len(identifier_list)) + identifier_list 1026 | cls._with_aliases.add(tuple(padded_list)) 1027 | 1028 | with_stmt = Forward().setName("with statement") 1029 | with_clause = Group( 1030 | identifier.setParseAction(lambda t: record_with_alias(t))('cte_name') 1031 | - AS 1032 | - LPAR 1033 | + (select_stmt | with_stmt) 1034 | - RPAR 1035 | ) 1036 | with_core = WITH + delimitedList(with_clause)('ctes') 1037 | with_stmt << (with_core - ~Literal(',') + ungrouped_select_stmt) 1038 | with_stmt.ignore(sql_comment) 1039 | 1040 | select_or_with = select_stmt | with_stmt 1041 | select_or_with_parens = LPAR + select_or_with - RPAR 1042 | 1043 | cls._parser = select_or_with | select_or_with_parens 1044 | return cls._parser 1045 | 1046 | 1047 | --------------------------------------------------------------------------------
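One feature of the grammar above worth calling out: `single_source` accepts dbt-style
`{{ ref('...') }}` and `{{ source('...') }}` blocks in FROM clauses and records them via
`record_ref`, which stores an `(op, target, ...)` tuple on both class-level sets. A rough,
hedged sketch of exercising that path (the expected contents are inferred from the parse
actions, not from a test run):

```
import sql_parser

parser = sql_parser.BigQueryViewParser()
parser._parse("select id from {{ ref('my_model') }} where id > 0")

# record_ref should have added ('ref', 'my_model') to both sets.
print(sql_parser.BigQueryViewParser._table_identifiers)
print(sql_parser.BigQueryViewParser._with_aliases)
```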