├── .circleci └── config.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── databathing ├── __init__.py ├── pipeline.py ├── py_bathing.py └── v1.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── setuptools.json └── tests ├── README.md ├── __init__.py ├── requirements.txt ├── test_distinct.py ├── test_groupby_having.py ├── test_join_on.py ├── test_orderby.py ├── test_select_from_where.py ├── test_split.py ├── test_struct.py ├── test_windows.py └── test_with.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Use the latest 2.1 version of CircleCI pipeline process engine. 2 | # See: https://circleci.com/docs/2.0/configuration-reference 3 | version: 2.1 4 | 5 | # Define a job to be invoked later in a workflow. 6 | # See: https://circleci.com/docs/2.0/configuration-reference/#jobs 7 | jobs: 8 | build_test: 9 | # Specify the execution environment. You can specify an image from Dockerhub or use one of our Convenience Images from CircleCI's Developer Hub. 10 | # See: https://circleci.com/docs/2.0/configuration-reference/#docker-machine-macos-windows-executor 11 | docker: 12 | - image: circleci/python:3.7.9 13 | # Add steps to the job 14 | # See: https://circleci.com/docs/2.0/configuration-reference/#steps 15 | steps: 16 | - checkout 17 | - run: 18 | command: | 19 | sudo pip install pipenv 20 | pipenv install -r tests/requirements.txt 21 | pipenv install . 22 | pipenv run python -m unittest discover tests 23 | test_pypi_publish: 24 | docker: 25 | - image: circleci/python:3.7.9 26 | steps: 27 | - checkout # checkout source code to working directory 28 | - run: 29 | command: | # create whl, install twine and publish to Test PyPI 30 | python setup.py sdist bdist_wheel 31 | sudo pip install pipenv 32 | pipenv install twine 33 | pipenv run twine upload --repository testpypi dist/* 34 | pypi_publish: 35 | docker: 36 | - image: circleci/python:3.7.9 37 | steps: 38 | - checkout # checkout source code to working directory 39 | - run: 40 | command: | # create whl, install twine and publish to PyPI 41 | python setup.py sdist bdist_wheel 42 | sudo pip install pipenv 43 | pipenv install twine 44 | pipenv run twine upload dist/* 45 | 46 | 47 | # Invoke jobs via workflows 48 | # See: https://circleci.com/docs/2.0/configuration-reference/#workflows 49 | workflows: 50 | build_test_publish: 51 | jobs: 52 | - build_test 53 | - test_pypi_publish: 54 | requires: 55 | - build_test 56 | filters: 57 | branches: 58 | only: 59 | - develop 60 | - pypi_publish: 61 | requires: 62 | - build_test 63 | filters: 64 | branches: 65 | only: 66 | - main -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | databathing.egg-info 3 | dist 4 | *.log 5 | *.pyc 6 | *.tab 7 | out 8 | .idea 9 | .svn 10 | *.iml 11 | /build 12 | vendor/ 13 | ._* 14 | .DS_Store 15 | .eggs 16 | .pytest_cache 17 | Pipfile 18 | Pipfile.lock -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/) 6 | and this project adheres to [Semantic Versioning](http://semver.org/). 
7 | 8 | ## [0.1.2] - 2022-05-18 9 | ## [0.1.3] - 2022-05-19 10 | ## [0.2.0] - 2022-05-19 11 | ## [0.2.1] - 2022-05-24 12 | 13 | ### Added 14 | - databathing basic version 15 | - circleci 16 | - split and struct -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2022] [Jiazhen Zhu] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # More SQL Parsing! 2 | 3 | [![PyPI Latest Release](https://img.shields.io/pypi/v/databathing.svg)](https://pypi.org/project/databathing/) 4 | [![Build Status](https://circleci.com/gh/jason-jz-zhu/databathing/tree/main.svg?style=svg)](https://app.circleci.com/pipelines/github/jason-jz-zhu/databathing) 5 | 6 | 7 | Parse SQL into JSON so we can translate it for other datastores! 8 | 9 | [See changes](https://github.com/jason-jz-zhu/databathing#version-changes) 10 | 11 | 12 | ## Problem Statement 13 | 14 | After moving from SQL to Spark, data engineers have to hand-write Spark code for their ETL pipelines instead of expressing them in YAML (SQL). This improves the runtime performance of the ETL jobs, but it also makes ETL development slower than before. 15 | 16 | So the question is: can we have a solution that offers both good compute performance (Spark) and fast development (YAML - SQL)? 17 | 18 | YES, we can!!! 19 | 20 | ## Objectives 21 | 22 | We plan to combine the benefits of Spark and YAML (SQL) into a platform or library for developing ETL pipelines. 23 | 24 | 25 | ## Project Status 26 | 27 | May 2022 - There are [over 900 tests](https://app.circleci.com/pipelines/github/jason-jz-zhu/databathing). This parser is good enough for basic usage, including: 28 | * `SELECT` feature 29 | * `FROM` feature 30 | * `INNER JOIN` and `LEFT JOIN` feature 31 | * `ON` feature 32 | * `WHERE` feature 33 | * `GROUP BY` feature 34 | * `HAVING` feature 35 | * `ORDER BY` feature 36 | * `AGG` feature 37 | * WINDOW FUNCTION feature (`SUM`, `AVG`, `MAX`, `MIN`, `MEAN`, `COUNT`) 38 | * ALIAS NAME feature 39 | * `WITH` STATEMENT feature 40 | 41 | ## Install 42 | 43 | pip install databathing 44 | 45 | 46 | ## Generating Spark Code 47 | 48 | You may also generate PySpark code from a given SQL query. This is done by the Pipeline, which is in Version 1 state (May 2022). 49 | 50 | >>> from databathing import pipeline 51 | >>> pipeline = pipeline.Pipeline("SELECT * FROM Test WHERE info = 1") 52 | >>> ans = pipeline.parse() 53 | 'final_df = Test\\\n.filter("info = 1")\\\n.selectExpr("a","b","c")\n\n'
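A more involved sketch, lifted from the bundled test suite (`tests/test_groupby_having.py`), shows how grouping and aggregation are translated. The session below is illustrative doctest style; the emitted string is PySpark method chaining, so actually running it assumes a `SparkSession`, a `Test` DataFrame, and `col`/`count` imported from `pyspark.sql.functions`:

    >>> from databathing import pipeline
    >>> sql = """
    ... select
    ...     product_id,
    ...     count(*) cnt
    ... from Test
    ... group by product_id
    ... having cnt > 1
    ... """
    >>> print(pipeline.Pipeline(sql).parse())
    final_df = Test\
    .groupBy("product_id")\
    .agg(count(col("*")).alias("cnt"))\
    .filter("cnt > 1")\
    .selectExpr("product_id","cnt")

The `GROUP BY` clause becomes `.groupBy(...)` plus `.agg(...)`, and the `HAVING` clause becomes a `.filter(...)` applied after the aggregation.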
54 | 55 | ## Contributing 56 | 57 | In the event that databathing is not working for you, you can help make this better by simply pasting your SQL (or JSON) into a new issue. Extra points if you describe the problem. Even more points if you submit a PR with a test. If you also submit a fix, then you also have my gratitude. 58 | 59 | Please follow this blog post to update the version - https://circleci.com/blog/publishing-a-python-package/ 60 | 61 | 62 | ### Run Tests 63 | 64 | See [the tests directory](https://github.com/jason-jz-zhu/databathing/tree/develop/tests) for instructions on running tests, or writing new ones. 65 | 66 | ## Version Changes 67 | 68 | 69 | ### Version 1 70 | 71 | *May 2022* 72 | 73 | Features and Functionalities - PySpark Version 74 | * `SELECT` feature 75 | * `FROM` feature 76 | * `INNER JOIN` and `LEFT JOIN` feature 77 | * `ON` feature 78 | * `WHERE` feature 79 | * `GROUP BY` feature 80 | * `HAVING` feature 81 | * `ORDER BY` feature 82 | * `AGG` feature 83 | * WINDOW FUNCTION feature (`SUM`, `AVG`, `MAX`, `MIN`, `MEAN`, `COUNT`) 84 | * ALIAS NAME feature 85 | * `WITH` STATEMENT feature 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /databathing/__init__.py: -------------------------------------------------------------------------------- 1 | from databathing.pipeline import Pipeline 2 | from databathing.py_bathing import py_bathing 3 | 4 | 5 | 6 | __all__ = ["Pipeline", "py_bathing"] 7 | -------------------------------------------------------------------------------- /databathing/pipeline.py: -------------------------------------------------------------------------------- 1 | from curses import nonl 2 | # from mo_sql_parsing import parse 3 | from mo_sql_parsing import parse_bigquery as parse 4 | from mo_sql_parsing import format 5 | import json 6 | import copy 7 | 8 | from databathing.py_bathing import py_bathing 9 | 10 | 11 | class Pipeline: 12 | def __init__(self, query): 13 | # print(query) 14 | self.parsed_whole_query = parse(query) 15 | self.parsed_json_whole_query = json.loads(json.dumps(self.parsed_whole_query,indent=4)) 16 | self.parsed_json_whole_query = self.parsed_json_whole_query 17 | self.with_ans = "" 18 | self.last_ans = "" 19 | 20 | def gen_with_pipeline(self, query): 21 | if "with" in query: 22 | with_stmts = query["with"] 23 | if type(with_stmts) is dict: 24 | self.gen_with_pipeline(with_stmts) 25 | else: 26 | for with_stmt in with_stmts: 27 | self.gen_with_pipeline(with_stmt) 28 | else: 29 | dbing = py_bathing(query["value"]) 30 | self.with_ans += query["name"] + " = " + dbing.parse() + "\n\n" 31 | 32 | 33 | def gen_last_pipeline(self, query): 34 | tmp_query = copy.deepcopy(query) 35 | 36 | if "with" in query: 37 | del tmp_query["with"] 38 | 39 | dbing = py_bathing(tmp_query) 40 | self.last_ans = "final_df = " + dbing.parse() + "\n\n" 41 | 42 | def parse(self): 43 | final_ans = "" 44 | if "with" in self.parsed_json_whole_query: 45 | self.gen_with_pipeline(self.parsed_json_whole_query) 46 | final_ans += self.with_ans 47 | self.gen_last_pipeline(self.parsed_json_whole_query) 48 | final_ans += self.last_ans 49 | return final_ans 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | # query = """ 60 | 61 
| # select 62 | # df1.firstname, 63 | # count(*) cnt 64 | # from df as df1 65 | # inner join df as df2 66 | # on df1.firstname = df2.firstname 67 | # group by df1.firstname 68 | # having cnt > 0 69 | 70 | # """ 71 | 72 | # query = """ 73 | # with step1 as ( 74 | # select firstname, id from df 75 | # ), step2 as ( 76 | # select gender, salary, id from df 77 | # ), step3 as ( 78 | # select 79 | # s1.id, s1.firstname, s2.gender, s2.salary 80 | # from step1 as s1 81 | # inner join step2 as s2 82 | # on s1.id = s2.id 83 | # ) 84 | # select 85 | # *, 86 | # RANK() OVER (PARTITION BY id ORDER BY salary DESC) AS seq 87 | # from step3 88 | # """ 89 | 90 | # query = """ 91 | # SELECT b.id, b.title, a.last_name AS author, e.last_name AS editor, 92 | # t.last_name AS translator 93 | # FROM book b 94 | # LEFT JOIN author a 95 | # ON b.author_id = a.id 96 | # LEFT JOIN editor e 97 | # ON b.editor_id = e.id 98 | # LEFT JOIN translator t 99 | # ON b.translator_id = t.id 100 | # ORDER BY b.id, a.id desc 101 | # """ 102 | 103 | 104 | 105 | # query = """ 106 | # WITH namePreDF AS ( 107 | # SELECT 108 | # distinct glbl_ptnt_id, 109 | # patient_name, 110 | # struct(split(patient_name, ',')[0] as firstname, split(patient_name, ',')[1] as lastname) as patient_name_info 111 | # FROM overviewDF 112 | # WHERE patient_name != '' 113 | # ORDER BY filled_date desc 114 | # ) 115 | # SELECT 116 | # glbl_ptnt_id, 117 | # collect_set(patient_name_info) as patient_name_info 118 | # FROM namePreDF 119 | # GROUP BY glbl_ptnt_id 120 | # """ 121 | 122 | # query = """ 123 | # SELECT 124 | # struct(firstname as firstname, lastname as lastname) as name 125 | # FROM df 126 | # """ 127 | 128 | # pipeline = Pipeline(query) 129 | 130 | # ans = pipeline.parse() 131 | # print(ans) -------------------------------------------------------------------------------- /databathing/py_bathing.py: -------------------------------------------------------------------------------- 1 | from curses import nonl 2 | # from mo_sql_parsing import parse 3 | from mo_sql_parsing import parse_bigquery as parse 4 | from mo_sql_parsing import format 5 | import json 6 | 7 | 8 | class py_bathing: 9 | def __init__(self, parsed_json_whole_query): 10 | self.parsed_json_whole_query = parsed_json_whole_query 11 | 12 | self.distinct_flag = False 13 | self.from_ans = "" 14 | self.select_ans = "" 15 | self.level_select = 0 16 | self.where_ans = "" 17 | self.groupby_ans = "" 18 | self.limit_ans = "" 19 | self.agg_ans = "" 20 | self.having_ans = "" 21 | self.orderby_ans = "" 22 | self.agg_list = ["sum", "avg", "max", "min", "mean", "count", "collect_list", "collect_set"] 23 | 24 | 25 | def _from_analyze(self, from_stmt): 26 | if not from_stmt: 27 | return 28 | if type(from_stmt) is str: 29 | self.from_ans += format({ "from": from_stmt })[5:] 30 | elif type(from_stmt) is dict: 31 | if "name" in from_stmt.keys(): 32 | self.from_ans += from_stmt['value']+".alias(\""+from_stmt['name']+"\")." 
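            # Note on the join branches below: each one appends a chained PySpark call of the
            # form join(<right table>.alias("<name>"), col("<left key>")==col("<right key>"), "<how>")
            # followed by a trailing "." so that later clauses (filter, groupBy, agg, selectExpr, ...)
            # can keep chaining onto the generated string; the left-hand table itself comes from the
            # plain "name"/"value" entry handled above.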
33 | elif "left join" in from_stmt.keys(): 34 | self.from_ans += "join({}, {}, \"{}\").".format( 35 | from_stmt['left join']['value']+".alias(\""+from_stmt['left join']['name']+"\")", 36 | "col(\""+str(from_stmt['on']['eq'][0])+"\")" + "==" + "col(\""+str(from_stmt['on']['eq'][1])+"\")" , 37 | 'left') 38 | elif "inner join" in from_stmt.keys(): 39 | self.from_ans += "join({}, {}, \"{}\").".format( 40 | from_stmt['inner join']['value']+".alias(\""+from_stmt['inner join']['name']+"\")", 41 | "col(\""+str(from_stmt['on']['eq'][0])+"\")" + "==" + "col(\""+str(from_stmt['on']['eq'][1])+"\")" , 42 | 'inner') 43 | elif "right join" in from_stmt.keys(): 44 | self.from_ans += "join({}, {}, \"{}\").".format( 45 | from_stmt['right join']['value']+".alias(\""+from_stmt['right join']['name']+"\")", 46 | "col(\""+str(from_stmt['on']['eq'][0])+"\")" + "==" + "col(\""+str(from_stmt['on']['eq'][1])+"\")" , 47 | 'right') 48 | 49 | elif type(from_stmt) is list: 50 | for item_from in from_stmt: 51 | self._from_analyze(item_from) 52 | 53 | def _select_analyze(self, select_stmt): 54 | # print(select_stmt) 55 | 56 | if not select_stmt: 57 | return 58 | 59 | if type(select_stmt) is str: 60 | self.select_ans += "\"" + format({ "select": select_stmt })[7:] + "\"," 61 | return 62 | if type(select_stmt) is dict and type(select_stmt['value']) is str: 63 | self.select_ans += "\"" + format({ "select": select_stmt })[7:] + "\"," 64 | return 65 | if type(select_stmt) is dict: 66 | if list(select_stmt["value"].keys())[0].lower() in self.agg_list: 67 | self.select_ans += "\""+ select_stmt['name'] +"\"," 68 | elif list(select_stmt["value"].keys())[0].lower() == "create_struct": 69 | self.select_ans += "\"" + format({ "select": select_stmt })[14:] + "\"," 70 | else: 71 | self.select_ans += "\"" + format({ "select": select_stmt })[7:] + "\"," 72 | elif type(select_stmt) is list and (self.level_select == 0): 73 | self.level_select += 1 74 | for inner_item in select_stmt: 75 | self._select_analyze(inner_item) 76 | 77 | 78 | def _where_analyze(self, where_stmt): 79 | self.where_ans = format({ "where": where_stmt })[6:] 80 | 81 | def _groupby_analyze(self, groupby_stmt): 82 | self.groupby_ans = format({ "groupby": groupby_stmt })[9:] 83 | 84 | def _agg_analyze(self, agg_stmt): 85 | if type(agg_stmt) is dict: 86 | if type(agg_stmt["value"]) is dict and list(agg_stmt["value"].keys())[0].lower() in self.agg_list: 87 | for funct, alias in agg_stmt["value"].items(): 88 | self.agg_ans += "{}(col(\"{}\")).alias(\"{}\"),".format(funct, alias, agg_stmt["name"]) 89 | 90 | elif type(agg_stmt) is list: 91 | for item in agg_stmt: 92 | self._agg_analyze(item) 93 | 94 | # if type(item["value"]) is dict and list(item["value"].keys())[0].lower() in self.agg_list: 95 | # for funct, alias in item["value"].items(): 96 | # self.agg_ans += "{}(col(\"{}\")).alias(\"{}\"),".format(funct, alias, item["name"]) 97 | 98 | self.agg_ans = self.agg_ans.replace("\n", "") 99 | 100 | 101 | def _having_analyze(self, having_stmt): 102 | self.having_ans = format({ "having": having_stmt })[7:] 103 | 104 | def _orderby_analyze(self, order_stmt): 105 | # print(order_stmt) 106 | if type(order_stmt) is dict: 107 | odr = "desc()" if order_stmt.get("sort", "asc") == "desc" else "asc()" 108 | self.orderby_ans += "col(\"{}\").{},".format(str(order_stmt["value"]), odr) 109 | else: 110 | for item in order_stmt: 111 | self._orderby_analyze(item) 112 | # for item in order_stmt: 113 | # odr = "desc()" if item.get("sort", "asc") == "desc" else "asc()" 114 | # self.orderby_ans += 
"col(\"{}\").{},".format(str(item["value"]), odr) 115 | 116 | def _limit_analyze(self, limit_stmt): 117 | self.limit_ans = limit_stmt 118 | 119 | def parse(self): 120 | from_final_ans = where_final_ans = groupby_final_ans = agg_final_ans = select_final_ans = orderby_final_ans = limit_final_ans = having_final_ans = "" 121 | 122 | for method, stmt in self.parsed_json_whole_query.items(): 123 | # handle from 124 | if str(method).lower() == "from": 125 | self._from_analyze(stmt) 126 | from_final_ans = self.from_ans[:-1] if self.from_ans[-1] == '.' else self.from_ans 127 | 128 | #handle where 129 | elif str(method).lower() == "where": 130 | self._where_analyze(stmt) 131 | where_final_ans = self.where_ans 132 | 133 | #handle groupby and agg 134 | elif str(method).lower() == "groupby": 135 | # group by 136 | self._groupby_analyze(stmt) 137 | groupby_final_ans = self.groupby_ans 138 | # agg 139 | agg_stmt = self.parsed_json_whole_query["select"] \ 140 | if "select" in self.parsed_json_whole_query.keys() \ 141 | else self.parsed_json_whole_query["select_distinct"] 142 | self._agg_analyze(agg_stmt) 143 | agg_final_ans = self.agg_ans[:-1] 144 | 145 | #handle select 146 | elif str(method).lower() in ["select", "select_distinct"]: 147 | self._select_analyze(stmt) 148 | select_final_ans = self.select_ans[:-1] 149 | self.distinct_flag = True if str(method) == "select_distinct" else False 150 | 151 | # handle having 152 | elif str(method) =="having": 153 | self._having_analyze(stmt) 154 | having_final_ans = self.having_ans 155 | 156 | #handle sort 157 | elif str(method) =="orderby": 158 | self._orderby_analyze(stmt) 159 | orderby_final_ans = self.orderby_ans[:-1] 160 | 161 | #handle limit 162 | elif str(method).lower() =="limit": 163 | self._limit_analyze(stmt) 164 | limit_final_ans = self.limit_ans 165 | 166 | final_ans = "" 167 | if from_final_ans: 168 | final_ans += from_final_ans + "\\" 169 | if where_final_ans: 170 | final_ans += "\n.filter(\"{}\")\\".format(where_final_ans) 171 | if groupby_final_ans: 172 | final_ans += "\n.groupBy(\"{}\")\\".format(groupby_final_ans) 173 | if agg_final_ans: 174 | final_ans += "\n.agg({})\\".format(agg_final_ans) 175 | if having_final_ans: 176 | final_ans += "\n.filter(\"{}\")\\".format(having_final_ans) 177 | if select_final_ans: 178 | final_ans += "\n.selectExpr({})\\".format(select_final_ans) 179 | if self.distinct_flag: 180 | final_ans += "\n.distinct()\\" 181 | if orderby_final_ans: 182 | final_ans += "\n.orderBy("+orderby_final_ans+")\\" 183 | if limit_final_ans: 184 | final_ans += "\n.limit("+str(limit_final_ans)+")\\" 185 | 186 | return final_ans[:-1] 187 | 188 | 189 | 190 | 191 | # query = """ 192 | # SELECT 193 | # distinct glbl_ptnt_id, 194 | # patient_name, 195 | # split(patient_name, ',')[0] as first_name, 196 | # split(patient_name, ',')[1] as last_name, 197 | # struct(first_name as firstname, last_name as lastname) as patient_name_info 198 | # FROM overviewDF 199 | # WHERE patient_name != '' 200 | # ORDER BY filled_date desc 201 | # """ 202 | 203 | 204 | # parsed_whole_query = parse(query) 205 | # parsed_json_whole_query = json.loads(json.dumps(parsed_whole_query,indent=4)) 206 | 207 | # # print(parsed_json_whole_query) 208 | 209 | # dbing = py_bathing(parsed_json_whole_query) 210 | # ans = dbing.parse() 211 | # print(ans) -------------------------------------------------------------------------------- /databathing/v1.py: -------------------------------------------------------------------------------- 1 | from curses import nonl 2 | # from 
mo_sql_parsing import parse 3 | from mo_sql_parsing import parse_bigquery as parse 4 | 5 | from mo_sql_parsing import format 6 | import json 7 | 8 | # query = """ 9 | # SELECT product_id as new_product_id, 10 | # Count(star_rating) as total_rating, 11 | # Max(star_rating) AS best_rating, 12 | # Min(star_rating) AS worst_rating, 13 | # ROW_NUMBER() OVER (PARTITION BY firstname ORDER BY salary DESC) AS SEQUENCE 14 | # FROM tbl_books 15 | # WHERE verified_purchase = 'Y' 16 | # AND review_date BETWEEN '1995-07-22' AND '2015-08-31' 17 | # AND marketplace IN ( 'DE', 'US', 'UK', 'FR', 'JP' ) 18 | # GROUP BY product_id 19 | # ORDER BY total_rating asc,product_id desc,best_rating 20 | # LIMIT 10; 21 | # """ 22 | 23 | # query = """select distinct 24 | # firstname, 25 | # lastname, 26 | # case when gender == "M" then "m" else "f" end as new, 27 | # ROW_NUMBER() OVER (PARTITION BY firstname ORDER BY salary DESC) AS SEQUENCE 28 | # from test1 t1 29 | # where t1.cc = 'cc'""" 30 | 31 | 32 | query = """ 33 | select distinct t1 as tt2, t2 as tt2 34 | from test 35 | """ 36 | 37 | # query = """select 38 | # t1.a, t2.b 39 | # from test1 t1 40 | # left join test2 t2 41 | # on t1.a = t2.a 42 | # inner join test3 t3 43 | # on t2.b = t3.b 44 | # where t1.cc = 'cc'""" 45 | 46 | # query = """select 47 | # t1.a, t2.b 48 | # from test1 t1 49 | # where t1.cc = 'cc'""" 50 | 51 | # query = """ 52 | # with tmp as ( 53 | # SELECT product_id as new_product_id, 54 | # Count(star_rating) as total_rating, 55 | # Max(star_rating) AS best_rating, 56 | # Min(star_rating) AS worst_rating, 57 | # ROW_NUMBER() OVER (PARTITION BY firstname ORDER BY salary DESC) AS SEQUENCE 58 | # FROM tbl_books 59 | # WHERE verified_purchase = 'Y' 60 | # AND review_date BETWEEN '1995-07-22' AND '2015-08-31' 61 | # AND marketplace IN ( 'DE', 'US', 'UK', 'FR', 'JP' ) 62 | # GROUP BY product_id 63 | # ORDER BY total_rating asc,product_id desc,best_rating 64 | # LIMIT 10 65 | # ), aa as (select * from tmp) 66 | # select * from aa 67 | # """ 68 | 69 | 70 | # query = """ 71 | # with step1 as ( 72 | # select * from t1 73 | # ), step2 as ( 74 | # select * from t2 75 | # ), step3 as ( 76 | # select 77 | # s1.a, s2.b 78 | # from step1 as s1 79 | # inner join step2 as s2 80 | # on s1.a = s2.a 81 | # ) 82 | # select 83 | # a, b, 84 | # ROW_NUMBER() OVER (PARTITION BY a ORDER BY b DESC) AS seq 85 | # from step3 86 | # """ 87 | 88 | 89 | v_parse = parse(query) 90 | v_json = json.loads(json.dumps(v_parse,indent=4)) 91 | print(v_json) 92 | 93 | distinct_flag = False 94 | 95 | result_from="" 96 | def fn_from(value): 97 | global result_from 98 | if not value: 99 | return 100 | if type(value) is str: 101 | result_from += format({ "from": value })[5:] 102 | elif type(value) is dict: 103 | if "name" in value.keys(): 104 | result_from += value['value']+".alias(\""+value['name']+"\")." 
105 | elif "left join" in value.keys(): 106 | result_from += "join({}, {}, \"{}\").".format( 107 | value['left join']['value']+".alias(\""+value['left join']['name']+"\")", 108 | "col(\""+str(value['on']['eq'][0])+"\")" + "===" + "col(\""+str(value['on']['eq'][1])+"\")" , 109 | 'left') 110 | elif "inner join" in value.keys(): 111 | result_from += "join({}, {}, \"{}\").".format( 112 | value['inner join']['value']+".alias(\""+value['inner join']['name']+"\")", 113 | "col(\""+str(value['on']['eq'][0])+"\")" + "===" + "col(\""+str(value['on']['eq'][1])+"\")" , 114 | 'inner') 115 | elif "right join" in value.keys(): 116 | result_from += "join({}, {}, \"{}\").".format( 117 | value['right join']['value']+".alias(\""+value['right join']['name']+"\")", 118 | "col(\""+str(value['on']['eq'][0])+"\")" + "===" + "col(\""+str(value['on']['eq'][1])+"\")" , 119 | 'right') 120 | 121 | elif type(value) is list: 122 | for item_from in value: 123 | fn_from(item_from) 124 | 125 | 126 | 127 | 128 | # def fn_from(value): 129 | # print("------") 130 | # print(value) 131 | # print("------") 132 | # result_from="" 133 | # if type(value) is str: 134 | # result_from = format({ "from": value }) 135 | # result_from = result_from[5:] 136 | # # elif type(value) is dict: 137 | # # if "name" in value.keys(): 138 | # # result_from = result_from + value['value']+".alias(\""+value['name']+"\")" 139 | # # else: 140 | # # result_from = result_from + value['value']+"" 141 | # elif type(value) is list: 142 | # for item_from in value: 143 | # if type(item_from) is dict: 144 | # if "name" in item_from.keys(): 145 | # result_from = result_from + item_from['value']+".alias(\""+item_from['name']+"\")," 146 | # else: 147 | # result_from = result_from + item_from['value']+"," 148 | # elif type(item_from) is str: 149 | # result_from = result_from + item_from+"," 150 | # return result_from 151 | 152 | 153 | agg_list = ["sum", "avg", "max", "min", "mean", "count"] 154 | result_select = "" 155 | level_select =0 156 | def fn_select(value): 157 | global distinct_flag 158 | global result_select 159 | global level_select 160 | if not value: 161 | return 162 | 163 | if type(value) is str: 164 | result_select += "\"" + format({ "select": value })[7:] + "\"," 165 | return 166 | if type(value) is dict and type(value['value']) is str: 167 | result_select += "\"" + format({ "select": value })[7:] + "\"," 168 | return 169 | if type(value) is dict: 170 | if "distinct" in value["value"].keys(): 171 | distinct_flag = True 172 | level_select += 1 173 | fn_select(value["value"]["distinct"]) 174 | elif list(value["value"].keys())[0].lower() in agg_list: 175 | result_select += "\""+ value['name'] +"\"," 176 | else: 177 | result_select += "\"" + format({ "select": value })[7:] + "\"," 178 | elif type(value) is list and (level_select == 0 or (level_select == 1 and distinct_flag)): 179 | for inner_item in value: 180 | fn_select(inner_item) 181 | 182 | def fn_where(value): 183 | result_where="" 184 | result_where = format({ "where": value })[6:] 185 | return result_where 186 | 187 | 188 | def fn_groupby(value): 189 | result_groupby="" 190 | result_groupby = format({ "groupby": value })[9:] 191 | return result_groupby 192 | 193 | def fn_agg(query): 194 | v_parse = parse(query) 195 | v_agg = "" 196 | for i in v_parse["select"]: 197 | if type(i["value"]) is dict: 198 | for key,value in i["value"].items(): 199 | v_agg = v_agg + (key+"("+"col(\""+str(value)+"\")"+").alias('"+i["name"]+"')") +"," 200 | v_agg = v_agg.replace("\n", "") 201 | return v_agg[:-1] 202 | 203 
| 204 | def fn_orderby(query): 205 | v_parse = parse(query) 206 | v_orderby_collist="" 207 | v_orderby = v_parse["orderby"] 208 | for i in v_orderby: 209 | if i.get("sort", "asc") == "desc": 210 | v_sortorder = "desc()" 211 | else: 212 | v_sortorder = "asc()" 213 | v_orderby_collist = v_orderby_collist + "col(\""+str(i.get("value", ""))+"\")." +v_sortorder+"," 214 | return v_orderby_collist[:-1] 215 | 216 | 217 | def fn_limit(query): 218 | v_parse = parse(query) 219 | v_limit = v_parse["limit"] 220 | return v_limit 221 | 222 | 223 | def fn_genSQL(data): 224 | v_fn_from = v_fn_where = v_fn_groupby = v_fn_agg = v_fn_select = v_fn_orderby = v_fn_limit = "" 225 | for key,value in data.items(): 226 | # handle from 227 | if str(key)=="from": 228 | fn_from(value) 229 | v_fn_from = result_from[:-1] 230 | 231 | #handle where 232 | if str(key) =="where": 233 | v_fn_where = fn_where(value) 234 | 235 | #handle groupby 236 | if str(key) =="groupby": 237 | v_fn_groupby = fn_groupby(value) 238 | 239 | #handle agg 240 | if str(key) =="groupby": 241 | v_fn_agg = fn_agg(query) 242 | 243 | #handle select 244 | if str(key) =="select": 245 | fn_select(value) 246 | v_fn_select = result_select[:-1] 247 | 248 | 249 | #handle sort 250 | if str(key) =="orderby": 251 | v_fn_orderby = fn_orderby(query) 252 | 253 | #handle limit 254 | if str(key) =="limit": 255 | v_fn_limit = fn_limit(query) 256 | 257 | v_final_stmt = "" 258 | if v_fn_from: 259 | v_final_stmt = v_final_stmt + v_fn_from 260 | if v_fn_where: 261 | v_final_stmt = v_final_stmt + "\n.filter(\""+v_fn_where+"\")" 262 | if v_fn_groupby: 263 | v_final_stmt = v_final_stmt + "\n.groupBy(\""+v_fn_groupby+"\")" 264 | if v_fn_agg: 265 | v_final_stmt = v_final_stmt + "\n.agg("+v_fn_agg+"\")" 266 | if v_fn_select: 267 | v_final_stmt = v_final_stmt + "\n.selectExpr("+v_fn_select+")" 268 | if distinct_flag: 269 | v_final_stmt = v_final_stmt + "\n.distinct()" 270 | if v_fn_orderby: 271 | v_final_stmt = v_final_stmt + "\n.orderBy("+v_fn_orderby+")" 272 | if v_fn_limit: 273 | v_final_stmt = v_final_stmt + "\n.limit("+str(v_fn_limit)+")" 274 | 275 | return v_final_stmt 276 | 277 | 278 | print (fn_genSQL(v_json)) 279 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mo-sql-parsing -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file=README.md 3 | license_files=LICENSE -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name='databathing', 5 | version='0.2.1', 6 | description="build spark job based on query", 7 | author="Jiazhen Zhu", 8 | author_email="jason.jz.zhu@gmail.com", 9 | classifiers=[ 10 | "Development Status :: 3 - Alpha", 11 | "Topic :: Software Development :: Libraries", 12 | "Topic :: Software Development :: Libraries :: Python Modules", 13 | "Programming Language :: SQL","Programming Language :: Python :: 3.7", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Python :: 3.9", 16 | 'License :: OSI Approved :: MIT License'], 17 | license="MIT", 18 | packages=['databathing'], 19 | install_requires=[ 20 | 'mo-sql-parsing', 21 | ], 22 | long_description='# Convert SQL to Spark 
Code!\n\n[![PyPI Latest Release]', 23 | long_description_content_type='text/markdown' 24 | ) 25 | -------------------------------------------------------------------------------- /setuptools.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jason-jz-zhu/databathing/67674f81912b562b31f1cfa5bbed6602ecc2908f/setuptools.json -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # More Query Parsing Tests 2 | 3 | The test suite has over 400 tests. 4 | 5 | ## Running Tests 6 | 7 | For __Linux__: 8 | 9 | git clone https://github.com/jason-jz-zhu/databathing.git 10 | cd databathing 11 | pip install -r requirements.txt 12 | pip install -r tests/requirements.txt 13 | export PYTHONPATH=. 14 | python -m unittest discover tests -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jason-jz-zhu/databathing/67674f81912b562b31f1cfa5bbed6602ecc2908f/tests/__init__.py -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | mo-sql-parsing -------------------------------------------------------------------------------- /tests/test_distinct.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestDistinct(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT distinct a, b, c 17 | FROM Test 18 | """ 19 | pipeline = Pipeline(sql) 20 | ans = pipeline.parse() 21 | expected = """final_df = Test\\\n.selectExpr("a","b","c")\\\n.distinct()\n\n""" 22 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_groupby_having.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | class TestGroupbyHaving(TestCase): 10 | def test_decisive_equailty(self): 11 | 12 | sql = """ 13 | select 14 | product_id, 15 | count(*) cnt 16 | from Test 17 | group by product_id 18 | having cnt > 1 19 | """ 20 | 21 | pipeline = Pipeline(sql) 22 | ans = pipeline.parse() 23 | expected = """final_df = Test\\\n.groupBy("product_id")\\\n.agg(count(col("*")).alias("cnt"))\\\n.filter("cnt > 1")\\\n.selectExpr("product_id","cnt")\n\n""" 24 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_join_on.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | 
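# A reading of the expected string asserted below: databathing rewrites the SQL LEFT JOIN as a
# PySpark DataFrame chain - each aliased table becomes Test.alias("t1") / Test.alias("t2"), the ON
# clause becomes the join condition col("t1.id")==col("t2.id") with join type "left", and the
# column aliases are projected through .selectExpr(...).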
class TestJoinOn(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT 17 | t1.id as id, 18 | t1.val as t1_val, 19 | t2.val as t1_val 20 | FROM Test t1 21 | LEFT JOIN Test t2 22 | ON t1.id = t2.id 23 | """ 24 | pipeline = Pipeline(sql) 25 | ans = pipeline.parse() 26 | expected = """final_df = Test.alias("t1").join(Test.alias("t2"), col("t1.id")==col("t2.id"), "left")\\\n.selectExpr("t1.id AS id","t1.val AS t1_val","t2.val AS t1_val")\n\n""" 27 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_orderby.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestOrderby(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT id, name 17 | FROM Test 18 | ORDER BY id, name 19 | """ 20 | pipeline = Pipeline(sql) 21 | ans = pipeline.parse() 22 | expected = """final_df = Test\\\n.selectExpr("id","name")\\\n.orderBy(col("id").asc(),col("name").asc())\n\n""" 23 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_select_from_where.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestSelectFromWhere(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT a, b, c 17 | FROM Test 18 | WHERE info = 1 19 | """ 20 | pipeline = Pipeline(sql) 21 | ans = pipeline.parse() 22 | expected = """final_df = Test\\\n.filter("info = 1")\\\n.selectExpr("a","b","c")\n\n""" 23 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_split.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestDistinct(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT 17 | split(name, ",")[0] as first_name 18 | FROM df 19 | """ 20 | pipeline = Pipeline(sql) 21 | ans = pipeline.parse() 22 | expected = """final_df = df\\\n.selectExpr("SPLIT(name, ',')[0] AS first_name")\n\n""" 23 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_struct.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestDistinct(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT 17 | struct(firstname as firstname, 
lastname as lastname) as name 18 | FROM df 19 | """ 20 | pipeline = Pipeline(sql) 21 | ans = pipeline.parse() 22 | expected = """final_df = df\\\n.selectExpr("STRUCT(firstname AS firstname, lastname AS lastname) AS name")\n\n""" 23 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_windows.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestWindows(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT 17 | name, 18 | ROW_NUMBER() OVER (PARTITION BY firstname ORDER BY salary DESC) AS SEQUENCE 19 | FROM Test 20 | """ 21 | pipeline = Pipeline(sql) 22 | ans = pipeline.parse() 23 | expected = """final_df = Test\\\n.selectExpr("name","ROW_NUMBER() OVER (PARTITION BY firstname ORDER BY salary DESC) AS SEQUENCE")\n\n""" 24 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_with.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestWith(TestCase): 13 | maxDiff = None 14 | def test_decisive_equailty(self): 15 | 16 | sql = """ 17 | with step1 as ( 18 | select firstname, id from df 19 | ), step2 as ( 20 | select gender, salary, id from df 21 | ), step3 as ( 22 | select 23 | s1.id, s1.firstname, s2.gender, s2.salary 24 | from step1 as s1 25 | inner join step2 as s2 26 | on s1.id = s2.id 27 | ) 28 | select 29 | * 30 | from step3 31 | """ 32 | pipeline = Pipeline(sql) 33 | ans = pipeline.parse() 34 | expected = """step1 = df\\\n.selectExpr("firstname","id")\n\nstep2 = df\\\n.selectExpr("gender","salary","id")\n\nstep3 = step1.alias("s1").join(step2.alias("s2"), col("s1.id")==col("s2.id"), "inner")\\\n.selectExpr("s1.id","s1.firstname","s2.gender","s2.salary")\n\nfinal_df = step3\\\n.selectExpr("*")\n\n""" 35 | self.assertEqual(ans, expected) --------------------------------------------------------------------------------