├── .gitignore
├── README.md
├── example.py
├── format.py
└── sql_parser.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | env/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | A validating SQL parser that can lint SELECT statements.
2 | 
3 | Lots left to do here:
4 | 1. Finish the formatting logic (only partially implemented)
5 | 2. Cross-database support (Snowflake, Redshift, BigQuery, Postgres, etc.)
6 | 3. Handle more Jinja blocks (config, ref, source, etc.)
7 | 4. Add validation to the formatting code (make sure we don't drop tokens)
8 | 5. Make this distributable (editor plugin as a POC?)
9 | 6. Lots and lots of testing!
10 | 
11 | ### Requirements
12 | ```
13 | pip install pyparsing
14 | ```
15 | 
16 | 
17 | ### Example
18 | ```
19 | import sql_parser
20 | from format import Formatter
21 | 
22 | parser = sql_parser.BigQueryViewParser()
23 | sql = """
24 | with my_cte as (select sum(case when a=1 then 1 else 0 end) as pivoted from table) select * from my_cte
25 | """
26 | 
27 | ast = parser._parse(sql)
28 | f = Formatter()
29 | f.format(ast)
30 | f.document.pprint()
31 | ```
32 | 
33 | Output:
34 | 
35 | ```
36 | with my_cte as (
37 | 
38 |     select
39 |         sum(case
40 |             when a = 1 then 1
41 |             else 0
42 |         end) as pivoted
43 | 
44 |     from table
45 | 
46 | )
47 | 
48 | select
49 |     *
50 | 
51 | from my_cte
52 | ```
53 | 
54 | ### Thanks
55 | 
56 | Heavily inspired by (and partially copied from) code in:
57 | - https://github.com/mozilla/moz-sql-parser
58 | - https://github.com/pyparsing/pyparsing/blob/master/examples/bigquery_view_parser.py
59 | 
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | 
2 | import sql_parser
3 | from format import Formatter
4 | 
5 | parser = sql_parser.BigQueryViewParser()
6 | sql = """
7 | with my_cte as (select sum(case when a=1 then 1 else 0 end) as pivoted from table) select * from my_cte
8 | """
9 | 
10 | ast = parser._parse(sql)
11 | f = Formatter()
12 | f.format(ast)
13 | f.document.pprint()
14 | 
--------------------------------------------------------------------------------
/format.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | #
3 | # This Source Code Form is subject to the terms of the Mozilla Public
4 | # License, v. 2.0. If a copy of the MPL was not distributed with this file,
5 | # You can obtain one at http://mozilla.org/MPL/2.0/.
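# --- Hedged addition (not part of the original file). Further down, format.py
# refers to `string_types`, `text`, and `join_keywords` (in dispatch(),
# _literal(), and _on()) without importing or defining them; they come from the
# moz-sql-parser code this module borrows from. A minimal sketch of Python 3
# shims that would make those names resolve (the exact join_keywords set here
# is an assumption):
#
#     string_types = (str,)    # stand-in for mo_future.string_types
#     text = str               # stand-in for mo_future.text_type
#     join_keywords = {
#         "join", "inner join", "cross join", "left join", "left outer join",
#         "right join", "right outer join", "full join", "full outer join",
#     }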
6 | # 7 | # Author: Beto Dealmeida (beto@dealmeida.net) 8 | # 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import unicode_literals 13 | 14 | import re 15 | 16 | from contextlib import contextmanager 17 | 18 | VALID = re.compile(r'[a-zA-Z_]\w*') 19 | 20 | 21 | class Token(object): 22 | def __init__(self, token): 23 | self.token = token 24 | 25 | class Whitespace(object): 26 | def __init__(self, count, char): 27 | self.count = count 28 | self.char = char 29 | 30 | class SQLDocument(object): 31 | def __init__(self): 32 | self.tokens = [] 33 | self.indent = 0 34 | self.indent_char = ' ' 35 | 36 | self._newline = False 37 | self.commas = 'back' 38 | 39 | def add(self, token): 40 | if hasattr(token, 'asList'): 41 | token = token.asList() 42 | if type(token) not in (list, tuple): 43 | token = [token] 44 | if self._newline: 45 | self.tokens.append(self.indent_char * self.indent) 46 | self._newline = False 47 | self.tokens.extend(list(token)) 48 | 49 | def newline(self): 50 | self.tokens.append('\n') 51 | self._newline = True 52 | 53 | @contextmanager 54 | def indented(self, count=4): 55 | self.indent += count 56 | yield self 57 | self.indent -= count 58 | 59 | def pprint(self): 60 | print("".join(self.tokens)) 61 | 62 | def Operator(op, parentheses=False): 63 | op = ' {0} '.format(op) 64 | def func(self, json): 65 | out = op.join(self.dispatch(v) for v in json) 66 | if parentheses: 67 | out = '({0})'.format(out) 68 | return out 69 | return func 70 | 71 | 72 | class Formatter: 73 | 74 | clauses = [ 75 | 'ctes', 76 | 'columns', 77 | 'from_', 78 | 'where', 79 | 'group_by_terms', 80 | 'having_expr', 81 | 'order_by_terms', 82 | 'limit', 83 | # 'offset', # not supported TODO 84 | ] 85 | 86 | # simple operators 87 | _concat = Operator('||') 88 | _mul = Operator('*') 89 | _div = Operator('/', parentheses=True) 90 | _add = Operator('+') 91 | _sub = Operator('-', parentheses=True) 92 | _neq = Operator('<>') 93 | _gt = Operator('>') 94 | _lt = Operator('<') 95 | _gte = Operator('>=') 96 | _lte = Operator('<=') 97 | _eq = Operator('=') 98 | _or = Operator('OR') 99 | _and = Operator('AND') 100 | 101 | def __init__(self): 102 | self.document = SQLDocument() 103 | 104 | def format(self, json): 105 | if 'union' in json: 106 | return self.union(json['union']) 107 | else: 108 | return self.query(json) 109 | 110 | def dispatch(self, json): 111 | if isinstance(json, list): 112 | return self.delimited_list(json) 113 | if isinstance(json, dict): 114 | if len(json) == 0: 115 | return '' 116 | elif 'value' in json: 117 | return self.value(json) 118 | elif 'from' in json: 119 | # Nested queries 120 | return '({})'.format(self.format(json)) 121 | elif 'select' in json: 122 | # Nested queries 123 | return '({})'.format(self.format(json)) 124 | else: 125 | return self.op(json) 126 | if isinstance(json, string_types): 127 | return json 128 | 129 | return text(json) 130 | 131 | def delimited_list(self, json): 132 | return ', '.join(self.dispatch(element) for element in json) 133 | 134 | def value(self, json): 135 | parts = [self.dispatch(json['value'])] 136 | if 'name' in json: 137 | parts.extend(['AS', self.dispatch(json['name'])]) 138 | return ' '.join(parts) 139 | 140 | def op(self, json): 141 | if 'on' in json: 142 | return self._on(json) 143 | 144 | if len(json) > 1: 145 | raise Exception('Operators should have only one key!') 146 | key, value = list(json.items())[0] 147 | 148 | # check if the attribute exists, and call the corresponding method; 149 | # note 
that we disallow keys that start with `_` to avoid giving access 150 | # to magic methods 151 | attr = '_{0}'.format(key) 152 | if hasattr(self, attr) and not key.startswith('_'): 153 | method = getattr(self, attr) 154 | return method(value) 155 | 156 | # treat as regular function call 157 | if isinstance(value, dict) and len(value) == 0: 158 | return key.upper() + "()" # NOT SURE IF AN EMPTY dict SHOULD BE DELT WITH HERE, OR IN self.dispatch() 159 | else: 160 | return '{0}({1})'.format(key.upper(), self.dispatch(value)) 161 | 162 | def _exists(self, value): 163 | return '{0} IS NOT NULL'.format(self.dispatch(value)) 164 | 165 | def _missing(self, value): 166 | return '{0} IS NULL'.format(self.dispatch(value)) 167 | 168 | def _like(self, pair): 169 | return '{0} LIKE {1}'.format(self.dispatch(pair[0]), self.dispatch(pair[1])) 170 | 171 | def _nlike(self, pair): 172 | return '{0} NOT LIKE {1}'.format(self.dispatch(pair[0]), self.dispatch(pair[1])) 173 | 174 | def _is(self, pair): 175 | return '{0} IS {1}'.format(self.dispatch(pair[0]), self.dispatch(pair[1])) 176 | 177 | def _in(self, json): 178 | valid = self.dispatch(json[1]) 179 | # `(10, 11, 12)` does not get parsed as literal, so it's formatted as 180 | # `10, 11, 12`. This fixes it. 181 | if not valid.startswith('('): 182 | valid = '({0})'.format(valid) 183 | 184 | return '{0} IN {1}'.format(json[0], valid) 185 | 186 | def _nin(self, json): 187 | valid = self.dispatch(json[1]) 188 | # `(10, 11, 12)` does not get parsed as literal, so it's formatted as 189 | # `10, 11, 12`. This fixes it. 190 | if not valid.startswith('('): 191 | valid = '({0})'.format(valid) 192 | 193 | return '{0} NOT IN {1}'.format(json[0], valid) 194 | 195 | def _case(self, checks): 196 | parts = ['CASE'] 197 | for check in checks: 198 | if isinstance(check, dict): 199 | parts.extend(['WHEN', self.dispatch(check['when'])]) 200 | parts.extend(['THEN', self.dispatch(check['then'])]) 201 | else: 202 | parts.extend(['ELSE', self.dispatch(check)]) 203 | parts.append('END') 204 | return ' '.join(parts) 205 | 206 | def _literal(self, json): 207 | if isinstance(json, list): 208 | return '({0})'.format(', '.join(self._literal(v) for v in json)) 209 | elif isinstance(json, string_types): 210 | return "'{0}'".format(json.replace("'", "''")) 211 | else: 212 | return str(json) 213 | 214 | def _on(self, json): 215 | detected_join = join_keywords & set(json.keys()) 216 | if len(detected_join) == 0: 217 | raise Exception( 218 | 'Fail to detect join type! 
Detected: "{}" Except one of: "{}"'.format( 219 | [on_keyword for on_keyword in json if on_keyword != 'on'][0], 220 | '", "'.join(join_keywords) 221 | ) 222 | ) 223 | 224 | join_keyword = detected_join.pop() 225 | 226 | return '{0} {1} ON {2}'.format( 227 | join_keyword.upper(), self.dispatch(json[join_keyword]), self.dispatch(json['on']) 228 | ) 229 | 230 | def union(self, json): 231 | return ' UNION '.join(self.query(query) for query in json) 232 | 233 | def query(self, json): 234 | for clause in self.clauses: 235 | func = getattr(self, clause, None) 236 | if func: 237 | func(json) 238 | 239 | def add_expr_list(self, expr): 240 | if isinstance(expr, (str, int, float)): 241 | self.add_expr(expr) 242 | else: 243 | for i, field in enumerate(expr): 244 | self.add_expr(field) 245 | if i != len(expr) - 1: 246 | self.document.add(', ') 247 | 248 | def add_expr(self, expr): 249 | doc = self.document 250 | 251 | if hasattr(expr, 'getName') and expr.getName() == 'operator': 252 | if expr.assoc == 'unary': 253 | doc.add(str(expr.op.match)) 254 | if len(expr.op.match) > 1: 255 | doc.add(' ') 256 | self.add_expr(expr.tokens) 257 | elif expr.assoc == 'binary': 258 | self.add_expr(expr.tokens[0]) 259 | doc.add(' ') 260 | doc.add(str(expr.op.match)) 261 | doc.add(' ') 262 | self.add_expr(expr.tokens[1]) 263 | elif expr.assoc == 'ternary': 264 | import ipdb; ipdb.set_trace() 265 | else: 266 | import ipdb; ipdb.set_trace() 267 | elif hasattr(expr, 'getName') and expr.getName() == 'function': 268 | doc.add(str(expr.func)) 269 | doc.add('(') 270 | self.add_expr_list(expr.tokens[0]) 271 | doc.add(')') 272 | elif hasattr(expr, 'getName') and expr.getName() == 'window function': 273 | doc.add(str(expr.func)) 274 | doc.add('(') 275 | self.add_expr_list(expr.func_args) 276 | doc.add(')') 277 | doc.add(' ') 278 | doc.add('over') 279 | doc.add(' ') 280 | doc.add('(') 281 | if expr.partition_args: 282 | # TODO: Method for printing list 283 | doc.add('partition by ') 284 | self.add_expr_list(expr.partition_args) 285 | if expr.order_args: 286 | if expr.partition_args: 287 | doc.add(' ') 288 | doc.add('order by ') 289 | self.add_expr_list(expr.order_args) 290 | for arg in expr.window_args: 291 | doc.add(' ') 292 | doc.add(arg) 293 | doc.add(')') 294 | elif hasattr(expr, 'getName') and expr.getName() == 'case': 295 | doc.add('case') 296 | doc.newline() 297 | with doc.indented(): 298 | for when in expr.whens: 299 | doc.add('when') 300 | doc.add(' ') 301 | self.add_expr(when['when']) 302 | doc.add(' ') 303 | doc.add('then') 304 | doc.add(' ') 305 | self.add_expr(when['then']) 306 | doc.newline() 307 | if expr._else: 308 | doc.add('else') 309 | doc.add(' ') 310 | self.add_expr(expr._else) 311 | doc.newline() 312 | doc.add('end') 313 | elif hasattr(expr, 'getName') and expr.getName() == 'select': 314 | import ipdb; ipdb.set_trace() 315 | #elif hasattr(expr, 'getName') and expr.getName() != 'quoted_expr': 316 | # import ipdb; ipdb.set_trace() 317 | 318 | elif isinstance(expr, (str, int)): 319 | doc.add(expr) 320 | else: 321 | for el in expr: 322 | self.add_expr(el) 323 | 324 | def add_column(self, column): 325 | if type(column) == str: 326 | self.document.add(column) 327 | elif column.select: 328 | self.document.add('(') 329 | self.document.newline() 330 | with self.document.indented(): 331 | self.query(column) 332 | self.document.newline() 333 | self.document.add(')') 334 | 335 | else: 336 | self.add_expr(column.quoted_expr) 337 | 338 | if type(column) != str and column.alias: 339 | self.document.add(' ') 340 | 
self.document.add('as') 341 | self.document.add(' ') 342 | self.document.add(column.alias) 343 | 344 | def add_cte(self, cte): 345 | self.document.add(cte.cte_name) 346 | self.document.add(' ') 347 | self.document.add('as') 348 | self.document.add(' ') 349 | self.document.add('(') 350 | self.document.newline() 351 | self.document.newline() 352 | with self.document.indented(): 353 | self.query(cte) 354 | self.document.newline() 355 | self.document.newline() 356 | self.document.add(')') 357 | 358 | def ctes(self, json): 359 | if len(json.ctes) == 0: 360 | return 361 | 362 | self.document.add('with ') 363 | for i, cte in enumerate(json.ctes): 364 | self.add_cte(cte) 365 | if i != len(json.ctes) - 1: 366 | self.document.add(',') 367 | self.document.newline() 368 | self.document.newline() 369 | 370 | 371 | def columns(self, json): 372 | self.document.add('select') 373 | self.document.newline() 374 | with self.document.indented(): 375 | for i, column in enumerate(json.columns): 376 | if self.document.commas == 'front' and i != 0: 377 | self.document.add(', ') 378 | 379 | self.add_column(column) 380 | 381 | if self.document.commas == 'back' and i != len(json.columns)-1: 382 | self.document.add(',') 383 | self.document.newline() 384 | 385 | # This ain't it :/ 386 | def add_from(self, from_): 387 | self.document.newline() 388 | self.document.add('from') 389 | self.document.add(' ') 390 | if type(from_) == str: 391 | self.document.add(from_) 392 | else: 393 | self.document.add(from_.table.asList()) # TODO: SBQ 394 | 395 | for join in from_.joins: 396 | if join.join_op[0] == ',': 397 | self.document.add(',') 398 | self.document.newline() 399 | self.document.add(join.table) 400 | else: 401 | self.document.newline() 402 | self.add_expr(join.asList()) 403 | 404 | def from_(self, json): 405 | if 'from' not in json: 406 | return 407 | from_ = json['from'] 408 | if 'union' in from_: 409 | return self.union(from_['union']) 410 | 411 | self.add_from(from_[0]) 412 | 413 | def where(self, json): 414 | if 'where' not in json: 415 | return 416 | 417 | self.document.newline() 418 | self.document.add('where ') 419 | self.add_expr(json['where']) 420 | 421 | def group_by_terms(self, json): 422 | if 'group_by_terms' not in json: 423 | return 424 | 425 | def having(self, json): 426 | import ipdb; ipdb.set_trace() 427 | if 'having' in json: 428 | return 'HAVING {0}'.format(self.dispatch(json['having'])) 429 | 430 | def orderby(self, json): 431 | import ipdb; ipdb.set_trace() 432 | if 'orderby' in json: 433 | orderby = json['orderby'] 434 | if isinstance(orderby, dict): 435 | orderby = [orderby] 436 | return 'ORDER BY {0}'.format(','.join([ 437 | '{0} {1}'.format(self.dispatch(o), o.get('sort', '').upper()).strip() 438 | for o in orderby 439 | ])) 440 | 441 | def limit(self, json): 442 | if 'limit' in json: 443 | if json['limit']: 444 | return 'LIMIT {0}'.format(self.dispatch(json['limit'])) 445 | 446 | def offset(self, json): 447 | if 'offset' in json: 448 | return 'OFFSET {0}'.format(self.dispatch(json['offset'])) 449 | 450 | -------------------------------------------------------------------------------- /sql_parser.py: -------------------------------------------------------------------------------- 1 | # bigquery_view_parser.py 2 | # 3 | # A parser to extract table names from BigQuery view definitions. 
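# --- Hedged usage note (not part of the original file). The parse actions in
# this module record every table reference and WITH alias on the class-level
# sets `_table_identifiers` and `_with_aliases`, padding each identifier to a
# (project, dataset, table) 3-tuple with None. Roughly:
#
#     parser = BigQueryViewParser()
#     parser._parse("with a as (select 1 from dataset.tbl) select * from a")
#     BigQueryViewParser._table_identifiers
#     # expected: {(None, 'dataset', 'tbl'), (None, None, 'a')}
#     BigQueryViewParser._with_aliases
#     # expected: {(None, None, 'a')}
#
# (`get_table_names` unpacks the value returned by `_parse`, but `_parse` now
# returns the raw pyparsing ParseResults rather than these two sets, so the
# class attributes are the more reliable place to look.)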
4 | # This is based on the `select_parser.py` sample in pyparsing: 5 | # https://github.com/pyparsing/pyparsing/blob/master/examples/select_parser.py 6 | # 7 | # Michael Smedberg 8 | # 9 | 10 | from pyparsing import ParserElement, Suppress, Forward, CaselessKeyword 11 | from pyparsing import MatchFirst, alphas, alphanums, Combine, Word, Literal, White, Empty 12 | from pyparsing import QuotedString, CharsNotIn, Optional, Group, ZeroOrMore, NoMatch 13 | from pyparsing import oneOf, delimitedList, restOfLine, cStyleComment 14 | from pyparsing import infixNotation, opAssoc, OneOrMore, Regex, nums 15 | 16 | def debug(s, i, toks): 17 | if len(toks) > 0: 18 | #import ipdb; ipdb.set_trace() 19 | pass 20 | 21 | 22 | class SemanticToken(object): 23 | def __iter__(self): 24 | return (i for i in self.tokens) 25 | 26 | def __len__(self): 27 | return len(self.tokens) 28 | 29 | 30 | 31 | 32 | class BigQueryViewParser: 33 | """Parser to extract table info from BigQuery view definitions""" 34 | 35 | _parser = None 36 | _table_identifiers = set() 37 | _with_aliases = set() 38 | 39 | def get_table_names(self, sql_stmt): 40 | table_identifiers, with_aliases = self._parse(sql_stmt) 41 | 42 | # Table names and alias names might differ by case, but that's not 43 | # relevant- aliases are not case sensitive 44 | lower_aliases = BigQueryViewParser.lowercase_set_of_tuples(with_aliases) 45 | tables = { 46 | x 47 | for x in table_identifiers 48 | if not BigQueryViewParser.lowercase_of_tuple(x) in lower_aliases 49 | } 50 | 51 | # Table names ARE case sensitive as described at 52 | # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 53 | # return tables 54 | return table_identifiers, with_aliases 55 | 56 | def _parse(self, sql_stmt): 57 | BigQueryViewParser._table_identifiers.clear() 58 | BigQueryViewParser._with_aliases.clear() 59 | res = BigQueryViewParser._get_parser().parseString(sql_stmt, parseAll=True) 60 | 61 | def kk(v): 62 | print(list(v.keys())) 63 | 64 | return res 65 | 66 | @classmethod 67 | def lowercase_of_tuple(cls, tuple_to_lowercase): 68 | return tuple(x.lower() if x else None for x in tuple_to_lowercase) 69 | 70 | @classmethod 71 | def lowercase_set_of_tuples(cls, set_of_tuples): 72 | return {BigQueryViewParser.lowercase_of_tuple(x) for x in set_of_tuples} 73 | 74 | @classmethod 75 | def _get_parser(cls): 76 | if cls._parser is not None: 77 | return cls._parser 78 | 79 | ParserElement.enablePackrat() 80 | 81 | LPAR, RPAR, COMMA, LBRACKET, RBRACKET, LT, GT = map(Literal, "(),[]<>") 82 | ungrouped_select_stmt = Forward().setName("select statement") 83 | 84 | # keywords 85 | ( 86 | UNION, 87 | ALL, 88 | AND, 89 | INTERSECT, 90 | EXCEPT, 91 | COLLATE, 92 | ASC, 93 | DESC, 94 | ON, 95 | USING, 96 | NATURAL, 97 | INNER, 98 | CROSS, 99 | LEFT, 100 | RIGHT, 101 | OUTER, 102 | FULL, 103 | JOIN, 104 | AS, 105 | INDEXED, 106 | NOT, 107 | SELECT, 108 | DISTINCT, 109 | FROM, 110 | WHERE, 111 | GROUP, 112 | BY, 113 | HAVING, 114 | ORDER, 115 | BY, 116 | LIMIT, 117 | OFFSET, 118 | OR, 119 | CAST, 120 | ISNULL, 121 | NOTNULL, 122 | NULL, 123 | IS, 124 | BETWEEN, 125 | ELSE, 126 | END, 127 | CASE, 128 | WHEN, 129 | THEN, 130 | EXISTS, 131 | COLLATE, 132 | IN, 133 | LIKE, 134 | GLOB, 135 | REGEXP, 136 | MATCH, 137 | ESCAPE, 138 | CURRENT_TIME, 139 | CURRENT_DATE, 140 | CURRENT_TIMESTAMP, 141 | WITH, 142 | EXTRACT, 143 | PARTITION, 144 | ROWS, 145 | RANGE, 146 | UNBOUNDED, 147 | PRECEDING, 148 | CURRENT, 149 | ROW, 150 | FOLLOWING, 151 | OVER, 152 | INTERVAL, 153 | DATE_ADD, 154 | 
DATE_SUB, 155 | ADDDATE, 156 | SUBDATE, 157 | REGEXP_EXTRACT, 158 | SPLIT, 159 | ORDINAL, 160 | FIRST_VALUE, 161 | LAST_VALUE, 162 | NTH_VALUE, 163 | LEAD, 164 | LAG, 165 | PERCENTILE_CONT, 166 | PRECENTILE_DISC, 167 | RANK, 168 | DENSE_RANK, 169 | PERCENT_RANK, 170 | CUME_DIST, 171 | NTILE, 172 | ROW_NUMBER, 173 | DATE, 174 | TIME, 175 | DATETIME, 176 | TIMESTAMP, 177 | UNNEST, 178 | INT64, 179 | NUMERIC, 180 | FLOAT64, 181 | BOOL, 182 | BYTES, 183 | GEOGRAPHY, 184 | ARRAY, 185 | STRUCT, 186 | SAFE_CAST, 187 | ANY_VALUE, 188 | ARRAY_AGG, 189 | ARRAY_CONCAT_AGG, 190 | AVG, 191 | BIT_AND, 192 | BIT_OR, 193 | BIT_XOR, 194 | COUNT, 195 | COUNTIF, 196 | LOGICAL_AND, 197 | LOGICAL_OR, 198 | MAX, 199 | MIN, 200 | STRING_AGG, 201 | SUM, 202 | CORR, 203 | COVAR_POP, 204 | COVAR_SAMP, 205 | STDDEV_POP, 206 | STDDEV_SAMP, 207 | STDDEV, 208 | VAR_POP, 209 | VAR_SAMP, 210 | VARIANCE, 211 | TIMESTAMP_ADD, 212 | TIMESTAMP_SUB, 213 | GENERATE_ARRAY, 214 | GENERATE_DATE_ARRAY, 215 | GENERATE_TIMESTAMP_ARRAY, 216 | FOR, 217 | SYSTEMTIME, 218 | AS, 219 | OF, 220 | WINDOW, 221 | RESPECT, 222 | IGNORE, 223 | NULLS, 224 | ) = map( 225 | CaselessKeyword, 226 | """ 227 | UNION, ALL, AND, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING, 228 | NATURAL, INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED, 229 | NOT, SELECT, DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY, 230 | LIMIT, OFFSET, OR, CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE, 231 | END, CASE, WHEN, THEN, EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP, 232 | MATCH, ESCAPE, CURRENT_TIME, CURRENT_DATE, CURRENT_TIMESTAMP, WITH, 233 | EXTRACT, PARTITION, ROWS, RANGE, UNBOUNDED, PRECEDING, CURRENT, 234 | ROW, FOLLOWING, OVER, INTERVAL, DATE_ADD, DATE_SUB, ADDDATE, 235 | SUBDATE, REGEXP_EXTRACT, SPLIT, ORDINAL, FIRST_VALUE, LAST_VALUE, 236 | NTH_VALUE, LEAD, LAG, PERCENTILE_CONT, PRECENTILE_DISC, RANK, 237 | DENSE_RANK, PERCENT_RANK, CUME_DIST, NTILE, ROW_NUMBER, DATE, TIME, 238 | DATETIME, TIMESTAMP, UNNEST, INT64, NUMERIC, FLOAT64, BOOL, BYTES, 239 | GEOGRAPHY, ARRAY, STRUCT, SAFE_CAST, ANY_VALUE, ARRAY_AGG, 240 | ARRAY_CONCAT_AGG, AVG, BIT_AND, BIT_OR, BIT_XOR, COUNT, COUNTIF, 241 | LOGICAL_AND, LOGICAL_OR, MAX, MIN, STRING_AGG, SUM, CORR, 242 | COVAR_POP, COVAR_SAMP, STDDEV_POP, STDDEV_SAMP, STDDEV, VAR_POP, 243 | VAR_SAMP, VARIANCE, TIMESTAMP_ADD, TIMESTAMP_SUB, GENERATE_ARRAY, 244 | GENERATE_DATE_ARRAY, GENERATE_TIMESTAMP_ARRAY, FOR, SYSTEMTIME, AS, 245 | OF, WINDOW, RESPECT, IGNORE, NULLS 246 | """.replace( 247 | ",", "" 248 | ).split(), 249 | ) 250 | 251 | keyword_nonfunctions = MatchFirst( 252 | ( 253 | UNION, 254 | ALL, 255 | INTERSECT, 256 | EXCEPT, 257 | COLLATE, 258 | ASC, 259 | DESC, 260 | ON, 261 | USING, 262 | NATURAL, 263 | INNER, 264 | CROSS, 265 | LEFT, 266 | RIGHT, 267 | OUTER, 268 | FULL, 269 | JOIN, 270 | AS, 271 | INDEXED, 272 | NOT, 273 | SELECT, 274 | DISTINCT, 275 | FROM, 276 | WHERE, 277 | GROUP, 278 | BY, 279 | HAVING, 280 | ORDER, 281 | BY, 282 | LIMIT, 283 | OFFSET, 284 | CAST, 285 | ISNULL, 286 | NOTNULL, 287 | NULL, 288 | IS, 289 | BETWEEN, 290 | ELSE, 291 | END, 292 | CASE, 293 | WHEN, 294 | THEN, 295 | EXISTS, 296 | COLLATE, 297 | IN, 298 | LIKE, 299 | GLOB, 300 | REGEXP, 301 | MATCH, 302 | STRUCT, 303 | WINDOW, 304 | ) 305 | ) 306 | 307 | keyword = keyword_nonfunctions | MatchFirst( 308 | ( 309 | ESCAPE, 310 | CURRENT_TIME, 311 | CURRENT_DATE, 312 | CURRENT_TIMESTAMP, 313 | DATE_ADD, 314 | DATE_SUB, 315 | ADDDATE, 316 | SUBDATE, 317 | INTERVAL, 318 | STRING_AGG, 319 | REGEXP_EXTRACT, 320 | SPLIT, 321 | ORDINAL, 322 
| UNNEST, 323 | SAFE_CAST, 324 | PARTITION, 325 | TIMESTAMP_ADD, 326 | TIMESTAMP_SUB, 327 | ARRAY, 328 | GENERATE_ARRAY, 329 | GENERATE_DATE_ARRAY, 330 | GENERATE_TIMESTAMP_ARRAY, 331 | ) 332 | ) 333 | 334 | identifier_word = Word(alphas + "_@#", alphanums + "@$#_") 335 | identifier = ~keyword + identifier_word.copy() 336 | collation_name = identifier.copy() 337 | # NOTE: Column names can be keywords. Doc says they cannot, but in practice it seems to work. 338 | column_name = identifier.copy() 339 | cast_to = identifier.copy() 340 | qualified_column_name = Group( 341 | delimitedList(column_name, delim=".") 342 | + Optional( 343 | Suppress("::") 344 | + delimitedList(cast_to("cast"), delim="::") 345 | ) 346 | ) 347 | # NOTE: As with column names, column aliases can be keywords, e.g. functions like `current_time`. Other 348 | # keywords, e.g. `from` make parsing pretty difficult (e.g. "SELECT a from from b" is confusing.) 349 | column_alias = ~keyword_nonfunctions + column_name.copy() 350 | table_name = identifier.copy() 351 | table_alias = identifier.copy() 352 | index_name = identifier.copy() 353 | function_name = identifier.copy() 354 | parameter_name = identifier.copy() 355 | # NOTE: The expression in a CASE statement can be an integer. E.g. this is valid SQL: 356 | # select CASE 1 WHEN 1 THEN -1 ELSE -2 END from test_table 357 | unquoted_case_identifier = ~keyword + Word(alphanums + "$_") 358 | quoted_case_identifier = ~keyword + ( 359 | QuotedString('"') ^ Suppress("`") + CharsNotIn("`") + Suppress("`") 360 | ) 361 | case_identifier = quoted_case_identifier | unquoted_case_identifier 362 | case_expr = ( 363 | Optional(case_identifier + Suppress(".")) 364 | + Optional(case_identifier + Suppress(".")) 365 | + case_identifier 366 | ) 367 | 368 | # expression 369 | expr = Forward().setName("expression") 370 | 371 | integer = Regex(r"[+-]?\d+") 372 | numeric_literal = Regex(r"[+-]?\d*\.?\d+([eE][+-]?\d+)?") 373 | string_literal = QuotedString("'") | QuotedString('"') | QuotedString("`") 374 | regex_literal = "r" + string_literal 375 | blob_literal = Regex(r"[xX]'[0-9A-Fa-f]+'") 376 | date_or_time_literal = (DATE | TIME | DATETIME | TIMESTAMP) + string_literal 377 | literal_value = ( 378 | numeric_literal 379 | | string_literal 380 | | regex_literal 381 | | blob_literal 382 | | date_or_time_literal 383 | | NULL 384 | | CURRENT_TIME + Optional(LPAR + Optional(string_literal) + RPAR) 385 | | CURRENT_DATE + Optional(LPAR + Optional(string_literal) + RPAR) 386 | | CURRENT_TIMESTAMP + Optional(LPAR + Optional(string_literal) + RPAR) 387 | ) 388 | bind_parameter = Word("?", nums) | Combine(oneOf(": @ $") + parameter_name) 389 | type_name = oneOf( 390 | """TEXT REAL INTEGER BLOB NULL TIMESTAMP STRING DATE 391 | INT64 NUMERIC FLOAT64 BOOL BYTES DATETIME GEOGRAPHY TIME ARRAY 392 | STRUCT""", 393 | caseless=True, 394 | ) 395 | date_part = oneOf( 396 | """DAY DAY_HOUR DAY_MICROSECOND DAY_MINUTE DAY_SECOND 397 | HOUR HOUR_MICROSECOND HOUR_MINUTE HOUR_SECOND MICROSECOND MINUTE 398 | MINUTE_MICROSECOND MINUTE_SECOND MONTH QUARTER SECOND 399 | SECOND_MICROSECOND WEEK YEAR YEAR_MONTH""", 400 | caseless=True, 401 | ) 402 | datetime_operators = ( 403 | DATE_ADD | DATE_SUB | ADDDATE | SUBDATE | TIMESTAMP_ADD | TIMESTAMP_SUB 404 | ) 405 | 406 | def invalid_date_add(s, loc, tokens): 407 | prev_newline = s[:loc].rfind('\n') 408 | prev_prev_newline = s[:prev_newline].rfind('\n') 409 | if '--ignore' in s[prev_prev_newline:prev_newline]: 410 | pass 411 | else: 412 | raise RuntimeError("{} is not valid, did you mean 
'date_add'".format(tokens[0])) 413 | 414 | #bad_datetime_operators = ( 415 | # CaselessKeyword('dateadd').setParseAction(invalid_date_add) 416 | #) 417 | 418 | grouping_term = expr.copy() 419 | ordering_term = Group( 420 | expr("order_key") 421 | + Optional(COLLATE + collation_name("collate")) 422 | + Optional(ASC | DESC)("direction") 423 | )("ordering_term") 424 | 425 | function_arg = expr.copy()("function_arg") 426 | function_args = Optional( 427 | "*" 428 | | Optional(DISTINCT) 429 | + delimitedList(function_arg) 430 | + Optional((RESPECT | IGNORE) + NULLS) 431 | )("function_args") 432 | function_call = ( 433 | (function_name | keyword)("function_name") 434 | + LPAR 435 | + Group(function_args)("function_args_group") 436 | + RPAR 437 | )('function') 438 | 439 | navigation_function_name = ( 440 | FIRST_VALUE 441 | | LAST_VALUE 442 | | NTH_VALUE 443 | | LEAD 444 | | LAG 445 | | PERCENTILE_CONT 446 | | PRECENTILE_DISC 447 | ) 448 | aggregate_function_name = ( 449 | ANY_VALUE 450 | | ARRAY_AGG 451 | | ARRAY_CONCAT_AGG 452 | | AVG 453 | | BIT_AND 454 | | BIT_OR 455 | | BIT_XOR 456 | | COUNT 457 | | COUNTIF 458 | | LOGICAL_AND 459 | | LOGICAL_OR 460 | | MAX 461 | | MIN 462 | | STRING_AGG 463 | | SUM 464 | ) 465 | statistical_aggregate_function_name = ( 466 | CORR 467 | | COVAR_POP 468 | | COVAR_SAMP 469 | | STDDEV_POP 470 | | STDDEV_SAMP 471 | | STDDEV 472 | | VAR_POP 473 | | VAR_SAMP 474 | | VARIANCE 475 | ) 476 | numbering_function_name = ( 477 | RANK | DENSE_RANK | PERCENT_RANK | CUME_DIST | NTILE | ROW_NUMBER 478 | ) 479 | analytic_function_name = ( 480 | navigation_function_name 481 | | aggregate_function_name 482 | | statistical_aggregate_function_name 483 | | numbering_function_name 484 | )("analytic_function_name") 485 | partition_expression_list = delimitedList(grouping_term)( 486 | "partition_expression_list" 487 | ) 488 | window_frame_boundary_start = ( 489 | UNBOUNDED + PRECEDING 490 | | numeric_literal + (PRECEDING | FOLLOWING) 491 | | CURRENT + ROW 492 | ) 493 | window_frame_boundary_end = ( 494 | UNBOUNDED + FOLLOWING 495 | | numeric_literal + (PRECEDING | FOLLOWING) 496 | | CURRENT + ROW 497 | ) 498 | window_frame_clause = (ROWS | RANGE) + ( 499 | ((UNBOUNDED + PRECEDING) | (numeric_literal + PRECEDING) | (CURRENT + ROW)) 500 | | (BETWEEN + window_frame_boundary_start + AND + window_frame_boundary_end) 501 | ) 502 | window_name = identifier.copy()("window_name") 503 | window_specification = ( 504 | Optional(window_name) 505 | + Optional(PARTITION + BY + partition_expression_list) 506 | + Optional(ORDER + BY + delimitedList(ordering_term)) 507 | + Optional(window_frame_clause)("window_specification") 508 | ) 509 | analytic_function = ( 510 | analytic_function_name 511 | + LPAR 512 | + function_args.setParseAction(debug) 513 | + RPAR 514 | + OVER 515 | + (window_name | LPAR + Optional(window_specification)('window') + RPAR) 516 | )("analytic_function") 517 | 518 | string_agg_term = ( 519 | STRING_AGG 520 | + LPAR 521 | + Optional(DISTINCT)('has_distinct') 522 | + expr('string_agg_expr') 523 | + Optional(COMMA + string_literal('delimiter')) 524 | + Optional( 525 | ORDER + BY + expr + Optional(ASC | DESC) + Optional(LIMIT + integer) 526 | ) 527 | + RPAR 528 | )("string_agg") 529 | array_literal = ( 530 | Optional(ARRAY + Optional(LT + delimitedList(type_name) + GT)) 531 | + LBRACKET 532 | + delimitedList(expr) 533 | + RBRACKET 534 | ) 535 | interval = INTERVAL + expr + date_part 536 | array_generator = ( 537 | GENERATE_ARRAY 538 | + LPAR 539 | + numeric_literal 540 | + COMMA 541 | 
+ numeric_literal 542 | + COMMA 543 | + numeric_literal 544 | + RPAR 545 | ) 546 | date_array_generator = ( 547 | (GENERATE_DATE_ARRAY | GENERATE_TIMESTAMP_ARRAY) 548 | + LPAR 549 | + expr("start_date") 550 | + COMMA 551 | + expr("end_date") 552 | + Optional(COMMA + interval) 553 | + RPAR 554 | ) 555 | 556 | explicit_struct = ( 557 | STRUCT 558 | + Optional(LT + delimitedList(type_name) + GT) 559 | + LPAR 560 | + Optional(delimitedList(expr + Optional(AS + identifier))) 561 | + RPAR 562 | ) 563 | 564 | case_when = WHEN + expr.copy()("when") 565 | case_then = THEN + expr.copy()("then") 566 | case_clauses = Group(ZeroOrMore(case_when + case_then)) 567 | case_else = ELSE + expr.copy()("_else") 568 | case_stmt = ( 569 | CASE 570 | + Optional(case_expr.copy()) 571 | + case_clauses("case_clauses") 572 | + Optional(case_else) 573 | + END 574 | )("case") 575 | 576 | class SelectStatement(SemanticToken): 577 | def __init__(self, tokens): 578 | self.tokens = tokens 579 | 580 | def getName(self): 581 | return 'select' 582 | 583 | @classmethod 584 | def parse(cls, tokens): 585 | return SelectStatement(tokens) 586 | 587 | class Function(SemanticToken): 588 | def __init__(self, func, tokens): 589 | self.func = func 590 | self.tokens = tokens 591 | 592 | def getName(self): 593 | return 'function' 594 | 595 | @classmethod 596 | def parse(cls, tokens): 597 | method = tokens[0] 598 | args = tokens[2:-1] 599 | return Function(method, args) 600 | 601 | def __repr__(self): 602 | return "func:{}({})".format(self.func, self.tokens) 603 | 604 | 605 | class WindowFunction(Function): 606 | def __init__(self, func, tokens, func_args, partition_args, order_args, window_args): 607 | self.func = func 608 | self.tokens = tokens 609 | self.func_args = func_args 610 | self.partition_args = partition_args 611 | self.order_args = order_args 612 | self.window_args = window_args 613 | 614 | def getName(self): 615 | return 'window function' 616 | 617 | @classmethod 618 | def parse(cls, tokens): 619 | return WindowFunction( 620 | tokens.analytic_function_name, 621 | tokens, 622 | tokens.function_args, 623 | tokens.partition_expression_list, 624 | tokens.ordering_term, 625 | tokens.window_specification 626 | ) 627 | 628 | def __repr__(self): 629 | return "window:{}({})over({}, {}, {})".format(self.func, self.func_args, self.partition_args, self.order_args, self.window_args) 630 | 631 | class CaseStatement(SemanticToken): 632 | def __init__(self, tokens, whens, _else): 633 | self.tokens = tokens 634 | self.whens = whens 635 | self._else = _else 636 | 637 | def getName(self): 638 | return 'case' 639 | 640 | @classmethod 641 | def parse_whens(self, tokens): 642 | whens = [] 643 | while len(tokens) > 0: 644 | _, when, _, then, *tokens = tokens 645 | whens.append({"when": when, "then": then}) 646 | return whens 647 | 648 | @classmethod 649 | def parse(cls, tokens): 650 | whens = tokens[1] 651 | _else = tokens[3] 652 | return CaseStatement( 653 | tokens, 654 | cls.parse_whens(whens), 655 | _else 656 | ) 657 | 658 | def __repr__(self): 659 | return "".format(len(self.whens), self._else) 660 | 661 | expr_term = ( 662 | (analytic_function)("analytic_function").setParseAction(WindowFunction.parse) 663 | | (CAST + LPAR + expr + AS + type_name + RPAR)("cast") 664 | | (SAFE_CAST + LPAR + expr + AS + type_name + RPAR)("safe_cast") 665 | | (Optional(EXISTS) + LPAR + ungrouped_select_stmt + RPAR)("subselect") 666 | | (literal_value)("literal") 667 | | (bind_parameter)("bind_parameter") 668 | | (EXTRACT + LPAR + expr + FROM + expr + 
RPAR)("extract") 669 | | case_stmt.setParseAction(CaseStatement.parse) 670 | | (datetime_operators + LPAR + expr + COMMA + interval + RPAR)( 671 | "date_operation" 672 | ) 673 | #| (bad_datetime_operators + LPAR + expr + COMMA + interval + RPAR) 674 | | string_agg_term("string_agg_term") 675 | | array_literal("array_literal") 676 | | array_generator("array_generator") 677 | | date_array_generator("date_array_generator") 678 | | explicit_struct("explicit_struct") 679 | | function_call("function_call").setParseAction(Function.parse) 680 | | qualified_column_name("column").setParseAction(lambda x: ".".join([str(i) for i in x[0]])) 681 | ).setParseAction(debug) + Optional(LBRACKET + (OFFSET | ORDINAL) + LPAR + expr + RPAR + RBRACKET)( 682 | "offset_ordinal" 683 | ) 684 | 685 | struct_term = (LPAR + delimitedList(expr_term) + RPAR) 686 | 687 | KNOWN_OPS = [ 688 | (BETWEEN, AND), 689 | Literal("||").setName("concat"), 690 | Literal("*").setName("mul"), 691 | Literal("/").setName("div"), 692 | Literal("+").setName("add"), 693 | Literal("-").setName("sub"), 694 | Literal("<>").setName("neq"), 695 | Literal(">").setName("gt"), 696 | Literal("<").setName("lt"), 697 | Literal(">=").setName("gte"), 698 | Literal("<=").setName("lte"), 699 | Literal("=").setName("eq"), 700 | Literal("==").setName("eq"), 701 | Literal("!=").setName("neq"), 702 | IN.setName("in"), 703 | IS.setName("is"), 704 | LIKE.setName("like"), 705 | OR.setName("or"), 706 | AND.setName("and"), 707 | 708 | NOT.setName('not') 709 | ] 710 | 711 | class Operator(SemanticToken): 712 | def __init__(self, op, assoc, name, tokens): 713 | self.op = op 714 | self.assoc = assoc 715 | self.name = name 716 | self.tokens = tokens 717 | 718 | def getName(self): 719 | return 'operator' 720 | 721 | @classmethod 722 | def parse(cls, tokens): 723 | # ARRANGE INTO {op: params} FORMAT 724 | toks = tokens[0] 725 | if toks[1] in KNOWN_OPS: 726 | op = KNOWN_OPS[KNOWN_OPS.index(toks[1])] 727 | if toks.subselect: 728 | import ipdb; ipdb.set_trace() 729 | return Operator(op, 'binary', op.name, [toks[0], toks[2:]]) 730 | else: 731 | import ipdb; ipdb.set_trace() 732 | return tokens 733 | 734 | @classmethod 735 | def parse_unary(cls, tokens): 736 | toks = tokens[0] 737 | if toks[0] in KNOWN_OPS: 738 | op = KNOWN_OPS[KNOWN_OPS.index(toks[0])] 739 | else: 740 | import ipdb; ipdb.set_trace() 741 | return Operator(op, 'unary', op.name, [toks[1:]]) 742 | 743 | @classmethod 744 | def parse_ternary(cls, tokens): 745 | import ipdb; ipdb.set_trace() 746 | 747 | def __repr__(self): 748 | return "".format(self.op, self.assoc, self.tokens) 749 | 750 | UNARY, BINARY, TERNARY = 1, 2, 3 751 | expr << infixNotation( 752 | (expr_term | struct_term), 753 | [ 754 | (oneOf("- + ~") | NOT, UNARY, opAssoc.RIGHT, Operator.parse_unary), 755 | (ISNULL | NOTNULL | NOT + NULL, UNARY, opAssoc.LEFT, Operator.parse_unary), 756 | ("||", BINARY, opAssoc.LEFT, Operator.parse), 757 | (oneOf("* / %"), BINARY, opAssoc.LEFT, Operator.parse), 758 | (oneOf("+ -"), BINARY, opAssoc.LEFT, Operator.parse), 759 | (oneOf("<< >> & |"), BINARY, opAssoc.LEFT, Operator.parse), 760 | (oneOf("= > < >= <= <> != !< !>"), BINARY, opAssoc.LEFT, Operator.parse), 761 | ( 762 | IS + Optional(NOT) 763 | | Optional(NOT) + IN 764 | | Optional(NOT) + LIKE 765 | | GLOB 766 | | MATCH 767 | | REGEXP, 768 | BINARY, 769 | opAssoc.LEFT, 770 | Operator.parse 771 | ), 772 | ((BETWEEN, AND), TERNARY, opAssoc.LEFT, Operator.parse_ternary), 773 | ( 774 | Optional(NOT) 775 | + IN 776 | + LPAR 777 | + Group(ungrouped_select_stmt | 
delimitedList(expr)) 778 | + RPAR, 779 | UNARY, 780 | opAssoc.LEFT, 781 | Operator.parse_unary 782 | ), 783 | (AND, BINARY, opAssoc.LEFT, Operator.parse), 784 | (OR, BINARY, opAssoc.LEFT, Operator.parse), 785 | ], 786 | lpar=Literal('('), 787 | rpar=Literal(')'), 788 | ) 789 | quoted_expr = ( 790 | expr 791 | ^ Suppress('"') + expr + Suppress('"') 792 | ^ Suppress("'") + expr + Suppress("'") 793 | ^ Suppress("`") + expr + Suppress("`") 794 | )("quoted_expr") 795 | 796 | compound_operator = ( 797 | UNION + Optional(ALL | DISTINCT) 798 | | INTERSECT + DISTINCT 799 | | EXCEPT + DISTINCT 800 | | INTERSECT 801 | | EXCEPT 802 | )("compound_operator") 803 | 804 | join_constraint = Group( 805 | Optional( 806 | ON + expr 807 | | USING + LPAR + Group(delimitedList(qualified_column_name)) + RPAR 808 | ) 809 | )("join_constraint") 810 | 811 | join_op = ( 812 | COMMA 813 | | Group( 814 | Optional(NATURAL) 815 | + Optional( 816 | INNER 817 | | CROSS 818 | | LEFT + OUTER 819 | | LEFT 820 | | RIGHT + OUTER 821 | | RIGHT 822 | | FULL + OUTER 823 | | OUTER 824 | | FULL 825 | ) 826 | + JOIN 827 | ) 828 | )("join_op") 829 | 830 | join_source = Forward() 831 | 832 | # We support three kinds of table identifiers. 833 | # 834 | # First, dot delimited info like project.dataset.table, where 835 | # each component follows the rules described in the BigQuery 836 | # docs, namely: 837 | # Contain letters (upper or lower case), numbers, and underscores 838 | # 839 | # Second, a dot delimited quoted string. Since it's quoted, we'll be 840 | # liberal w.r.t. what characters we allow. E.g.: 841 | # `project.dataset.name-with-dashes` 842 | # 843 | # Third, a series of quoted strings, delimited by dots, e.g.: 844 | # `project`.`dataset`.`name-with-dashes` 845 | # 846 | # We won't attempt to support combinations, like: 847 | # project.dataset.`name-with-dashes` 848 | # `project`.`dataset.name-with-dashes` 849 | 850 | def record_table_identifier(t): 851 | identifier_list = t.asList() 852 | padded_list = [None] * (3 - len(identifier_list)) + identifier_list 853 | cls._table_identifiers.add(tuple(padded_list)) 854 | 855 | standard_table_part = ~keyword + Word(alphanums + "_") 856 | standard_table_identifier = ( 857 | Optional(standard_table_part("project") + Suppress(".")) 858 | + Optional(standard_table_part("dataset") + Suppress(".")) 859 | + standard_table_part("table") 860 | ).setParseAction(lambda t: record_table_identifier(t)) 861 | 862 | quoted_project_part = ( 863 | Suppress('"') + CharsNotIn('"') + Suppress('"') 864 | | Suppress("'") + CharsNotIn("'") + Suppress("'") 865 | | Suppress("`") + CharsNotIn("`") + Suppress("`") 866 | ) 867 | quoted_table_part = ( 868 | Suppress('"') + CharsNotIn('".') + Suppress('"') 869 | | Suppress("'") + CharsNotIn("'.") + Suppress("'") 870 | | Suppress("`") + CharsNotIn("`.") + Suppress("`") 871 | ) 872 | quoted_table_parts_identifier = ( 873 | Optional(quoted_project_part("project") + Suppress(".")) 874 | + Optional(quoted_table_part("dataset") + Suppress(".")) 875 | + quoted_table_part("table") 876 | ).setParseAction(lambda t: record_table_identifier(t)) 877 | 878 | def record_quoted_table_identifier(t): 879 | identifier_list = t.asList()[0].split(".") 880 | first = ".".join(identifier_list[0:-2]) or None 881 | second = identifier_list[-2] 882 | third = identifier_list[-1] 883 | identifier_list = [first, second, third] 884 | padded_list = [None] * (3 - len(identifier_list)) + identifier_list 885 | cls._table_identifiers.add(tuple(padded_list)) 886 | 887 | 
quotable_table_parts_identifier = ( 888 | Suppress('"') + CharsNotIn('"') + Suppress('"') 889 | | Suppress("'") + CharsNotIn("'") + Suppress("'") 890 | | Suppress("`") + CharsNotIn("`") + Suppress("`") 891 | ).setParseAction(lambda t: record_quoted_table_identifier(t)) 892 | 893 | table_identifier = ( 894 | standard_table_identifier 895 | | quoted_table_parts_identifier 896 | | quotable_table_parts_identifier 897 | ) 898 | 899 | def record_ref(t): 900 | lol = [t.op] + t.ref_target.asList() 901 | cls._with_aliases.add(tuple(lol)) 902 | cls._table_identifiers.add(tuple(lol)) 903 | 904 | ref_target = identifier.copy() 905 | single_source = ( 906 | # ref + source statements 907 | ( 908 | ( 909 | Suppress('{{') 910 | + (CaselessKeyword('ref') | CaselessKeyword("source"))("op") 911 | + LPAR 912 | + delimitedList( 913 | (Suppress("'") | Suppress('"')) 914 | + ref_target 915 | + (Suppress("'") | Suppress('"')) 916 | )("ref_target") 917 | + RPAR 918 | + Suppress("}}") 919 | ).setParseAction(record_ref) 920 | | table_identifier 921 | ) 922 | + Optional(Optional(AS) + table_alias("table_alias*")) 923 | + Optional(FOR + SYSTEMTIME + AS + OF + string_literal) 924 | + Optional(INDEXED + BY + index_name("name") | NOT + INDEXED)("index") 925 | | ( 926 | LPAR 927 | + ungrouped_select_stmt 928 | + RPAR 929 | + Optional(Optional(AS) + table_alias) 930 | )('subquery') 931 | | (LPAR + join_source + RPAR) 932 | | (UNNEST + LPAR + expr + RPAR) + Optional(Optional(AS) + column_alias) 933 | ) 934 | 935 | join_source << ( 936 | Group(single_source + OneOrMore(Group(join_op + single_source + join_constraint)('joins*'))) 937 | | single_source 938 | )('sources*') 939 | 940 | over_partition = (PARTITION + BY + delimitedList(partition_expression_list))( 941 | "over_partition" 942 | ) 943 | over_order = ORDER + BY + delimitedList(ordering_term) 944 | over_unsigned_value_specification = expr 945 | over_window_frame_preceding = ( 946 | UNBOUNDED + PRECEDING 947 | | over_unsigned_value_specification + PRECEDING 948 | | CURRENT + ROW 949 | ) 950 | over_window_frame_following = ( 951 | UNBOUNDED + FOLLOWING 952 | | over_unsigned_value_specification + FOLLOWING 953 | | CURRENT + ROW 954 | ) 955 | over_window_frame_bound = ( 956 | over_window_frame_preceding | over_window_frame_following 957 | ) 958 | over_window_frame_between = ( 959 | BETWEEN + over_window_frame_bound + AND + over_window_frame_bound 960 | ) 961 | over_window_frame_extent = ( 962 | over_window_frame_preceding | over_window_frame_between 963 | ) 964 | over_row_or_range = (ROWS | RANGE) + over_window_frame_extent 965 | over = ( 966 | OVER 967 | + LPAR 968 | + Optional(over_partition) 969 | + Optional(over_order) 970 | + Optional(over_row_or_range) 971 | + RPAR 972 | )("over") 973 | 974 | 975 | result_column = ( 976 | Optional(table_name + ".") 977 | + "*" 978 | + Optional( 979 | EXCEPT 980 | + LPAR 981 | + delimitedList(column_name) 982 | + RPAR 983 | ) | Group(quoted_expr + Optional(over) + Optional(Optional(AS) + column_alias('alias'))) 984 | ) 985 | 986 | window_select_clause = ( 987 | WINDOW + identifier + AS + LPAR + window_specification + RPAR 988 | ) 989 | 990 | select_core = ( 991 | SELECT 992 | + Optional(DISTINCT | ALL) 993 | + Group(delimitedList(result_column))("columns") 994 | + Optional(FROM - join_source("from*")) 995 | + Optional(WHERE + expr('where')) 996 | + Optional( 997 | GROUP + BY + Group(delimitedList(grouping_term))("group_by_terms") 998 | ) 999 | + Optional(HAVING + expr("having_expr")) 1000 | + Optional( 1001 | ORDER + BY + 
Group(delimitedList(ordering_term))("order_by_terms") 1002 | ) 1003 | + Optional(delimitedList(window_select_clause)) 1004 | ) 1005 | grouped_select_core = select_core | (LPAR + select_core + RPAR) 1006 | 1007 | ungrouped_select_stmt << ( 1008 | grouped_select_core 1009 | + ZeroOrMore(compound_operator + grouped_select_core) 1010 | + Optional( 1011 | LIMIT 1012 | + (Group(expr + OFFSET + expr) | Group(expr + COMMA + expr) | expr)( 1013 | "limit" 1014 | ) 1015 | ) 1016 | )("select") 1017 | select_stmt = ungrouped_select_stmt | (LPAR + ungrouped_select_stmt + RPAR) 1018 | 1019 | # define comment format, and ignore them 1020 | sql_comment = oneOf("-- #") + restOfLine | cStyleComment 1021 | select_stmt.ignore(sql_comment) 1022 | 1023 | def record_with_alias(t): 1024 | identifier_list = t.asList() 1025 | padded_list = [None] * (3 - len(identifier_list)) + identifier_list 1026 | cls._with_aliases.add(tuple(padded_list)) 1027 | 1028 | with_stmt = Forward().setName("with statement") 1029 | with_clause = Group( 1030 | identifier.setParseAction(lambda t: record_with_alias(t))('cte_name') 1031 | - AS 1032 | - LPAR 1033 | + (select_stmt | with_stmt) 1034 | - RPAR 1035 | ) 1036 | with_core = WITH + delimitedList(with_clause)('ctes') 1037 | with_stmt << (with_core - ~Literal(',') + ungrouped_select_stmt) 1038 | with_stmt.ignore(sql_comment) 1039 | 1040 | select_or_with = select_stmt | with_stmt 1041 | select_or_with_parens = LPAR + select_or_with - RPAR 1042 | 1043 | cls._parser = select_or_with | select_or_with_parens 1044 | return cls._parser 1045 | 1046 | 1047 | --------------------------------------------------------------------------------
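One feature of the grammar above worth calling out: `single_source` accepts dbt-style
`{{ ref('...') }}` and `{{ source('...') }}` blocks in FROM clauses and records them via
`record_ref`, which stores an `(op, target, ...)` tuple on both class-level sets. A rough,
hedged sketch of exercising that path (the expected contents are inferred from the parse
actions, not from a test run):

```
import sql_parser

parser = sql_parser.BigQueryViewParser()
parser._parse("select id from {{ ref('my_model') }} where id > 0")

# record_ref should have added ('ref', 'my_model') to both sets.
print(sql_parser.BigQueryViewParser._table_identifiers)
print(sql_parser.BigQueryViewParser._with_aliases)
```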