├── .circleci └── config.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── databathing ├── __init__.py ├── pipeline.py ├── py_bathing.py └── v1.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── setuptools.json └── tests ├── README.md ├── __init__.py ├── requirements.txt ├── test_distinct.py ├── test_groupby_having.py ├── test_join_on.py ├── test_orderby.py ├── test_select_from_where.py ├── test_split.py ├── test_struct.py ├── test_windows.py └── test_with.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Use the latest 2.1 version of CircleCI pipeline process engine. 2 | # See: https://circleci.com/docs/2.0/configuration-reference 3 | version: 2.1 4 | 5 | # Define a job to be invoked later in a workflow. 6 | # See: https://circleci.com/docs/2.0/configuration-reference/#jobs 7 | jobs: 8 | build_test: 9 | # Specify the execution environment. You can specify an image from Dockerhub or use one of our Convenience Images from CircleCI's Developer Hub. 10 | # See: https://circleci.com/docs/2.0/configuration-reference/#docker-machine-macos-windows-executor 11 | docker: 12 | - image: circleci/python:3.7.9 13 | # Add steps to the job 14 | # See: https://circleci.com/docs/2.0/configuration-reference/#steps 15 | steps: 16 | - checkout 17 | - run: 18 | command: | 19 | sudo pip install pipenv 20 | pipenv install -r tests/requirements.txt 21 | pipenv install . 22 | pipenv run python -m unittest discover tests 23 | test_pypi_publish: 24 | docker: 25 | - image: circleci/python:3.7.9 26 | steps: 27 | - checkout # checkout source code to working directory 28 | - run: 29 | command: | # create whl, install twine and publish to Test PyPI 30 | python setup.py sdist bdist_wheel 31 | sudo pip install pipenv 32 | pipenv install twine 33 | pipenv run twine upload --repository testpypi dist/* 34 | pypi_publish: 35 | docker: 36 | - image: circleci/python:3.7.9 37 | steps: 38 | - checkout # checkout source code to working directory 39 | - run: 40 | command: | # create whl, install twine and publish to PyPI 41 | python setup.py sdist bdist_wheel 42 | sudo pip install pipenv 43 | pipenv install twine 44 | pipenv run twine upload dist/* 45 | 46 | 47 | # Invoke jobs via workflows 48 | # See: https://circleci.com/docs/2.0/configuration-reference/#workflows 49 | workflows: 50 | build_test_publish: 51 | jobs: 52 | - build_test 53 | - test_pypi_publish: 54 | requires: 55 | - build_test 56 | filters: 57 | branches: 58 | only: 59 | - develop 60 | - pypi_publish: 61 | requires: 62 | - build_test 63 | filters: 64 | branches: 65 | only: 66 | - main -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | databathing.egg-info 3 | dist 4 | *.log 5 | *.pyc 6 | *.tab 7 | out 8 | .idea 9 | .svn 10 | *.iml 11 | /build 12 | vendor/ 13 | ._* 14 | .DS_Store 15 | .eggs 16 | .pytest_cache 17 | Pipfile 18 | Pipfile.lock -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/) 6 | and this project adheres to [Semantic Versioning](http://semver.org/). 
7 | 8 | ## [0.1.2] - 2022-05-18 9 | ## [0.1.3] - 2022-05-19 10 | ## [0.2.0] - 2022-05-19 11 | ## [0.2.1] - 2022-05-24 12 | 13 | ### Added 14 | - databathing basic version 15 | - circleci 16 | - split and struct -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2022] [Jiazhen Zhu] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # More SQL Parsing! 2 | 3 | [![PyPI Latest Release](https://img.shields.io/pypi/v/databathing.svg)](https://pypi.org/project/databathing/) 4 | [![Build Status](https://circleci.com/gh/jason-jz-zhu/databathing/tree/main.svg?style=svg)](https://app.circleci.com/pipelines/github/jason-jz-zhu/databathing) 5 | 6 | 7 | Parse SQL into JSON so we can translate it for other datastores! 8 | 9 | [See changes](https://github.com/jason-jz-zhu/databathing#version-changes) 10 | 11 | 12 | ## Problem Statement 13 | 14 | After moving from SQL to Spark, data engineers have to hand-write Spark code for their ETL pipelines instead of expressing them in YAML (SQL). This improves the runtime performance of the ETL jobs, but it also makes ETL development slower than before. 15 | 16 | So the question is: can we have a solution that offers both good compute performance (Spark) and fast development (YAML - SQL)? 17 | 18 | YES, we can!!! 19 | 20 | ## Objectives 21 | 22 | We plan to combine the benefits of Spark and YAML (SQL) into a platform or library for developing ETL pipelines. 23 | 24 | 25 | ## Project Status 26 | 27 | May 2022 - There are [over 900 tests](https://app.circleci.com/pipelines/github/jason-jz-zhu/databathing). This parser is good enough for basic usage, including: 28 | * `SELECT` feature 29 | * `FROM` feature 30 | * `INNER JOIN` and `LEFT JOIN` feature 31 | * `ON` feature 32 | * `WHERE` feature 33 | * `GROUP BY` feature 34 | * `HAVING` feature 35 | * `ORDER BY` feature 36 | * `AGG` feature 37 | * WINDOW FUNCTION feature (`SUM`, `AVG`, `MAX`, `MIN`, `MEAN`, `COUNT`) 38 | * ALIAS NAME feature 39 | * `WITH` STATEMENT feature 40 | 41 | ## Install 42 | 43 | pip install databathing 44 | 45 | 46 | ## Generating Spark Code 47 | 48 | You may also generate PySpark code from a given SQL query. This is done by the Pipeline, which is in Version 1 state (May 2022). 49 | 50 | >>> from databathing import pipeline 51 | >>> pipeline = pipeline.Pipeline("SELECT * FROM Test WHERE info = 1") 52 | >>> ans = pipeline.parse() 53 | 'final_df = Test\\\n.filter("info = 1")\\\n.selectExpr("a","b","c")\n\n'
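A more involved sketch, lifted from the bundled test suite (`tests/test_groupby_having.py`), shows how grouping and aggregation are translated. The session below is illustrative doctest style; the emitted string is PySpark method chaining, so actually running it assumes a `SparkSession`, a `Test` DataFrame, and `col`/`count` imported from `pyspark.sql.functions`:

    >>> from databathing import pipeline
    >>> sql = """
    ... select
    ...     product_id,
    ...     count(*) cnt
    ... from Test
    ... group by product_id
    ... having cnt > 1
    ... """
    >>> print(pipeline.Pipeline(sql).parse())
    final_df = Test\
    .groupBy("product_id")\
    .agg(count(col("*")).alias("cnt"))\
    .filter("cnt > 1")\
    .selectExpr("product_id","cnt")

The `GROUP BY` clause becomes `.groupBy(...)` plus `.agg(...)`, and the `HAVING` clause becomes a `.filter(...)` applied after the aggregation.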
54 | 55 | ## Contributing 56 | 57 | In the event that databathing is not working for you, you can help make this better by simply pasting your SQL (or JSON) into a new issue. Extra points if you describe the problem. Even more points if you submit a PR with a test. If you also submit a fix, then you also have my gratitude. 58 | 59 | Please follow this blog post to update the version - https://circleci.com/blog/publishing-a-python-package/ 60 | 61 | 62 | ### Run Tests 63 | 64 | See [the tests directory](https://github.com/jason-jz-zhu/databathing/tree/develop/tests) for instructions on running tests, or writing new ones. 65 | 66 | ## Version Changes 67 | 68 | 69 | ### Version 1 70 | 71 | *May 2022* 72 | 73 | Features and Functionalities - PySpark Version 74 | * `SELECT` feature 75 | * `FROM` feature 76 | * `INNER JOIN` and `LEFT JOIN` feature 77 | * `ON` feature 78 | * `WHERE` feature 79 | * `GROUP BY` feature 80 | * `HAVING` feature 81 | * `ORDER BY` feature 82 | * `AGG` feature 83 | * WINDOW FUNCTION feature (`SUM`, `AVG`, `MAX`, `MIN`, `MEAN`, `COUNT`) 84 | * ALIAS NAME feature 85 | * `WITH` STATEMENT feature 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /databathing/__init__.py: -------------------------------------------------------------------------------- 1 | from databathing.pipeline import Pipeline 2 | from databathing.py_bathing import py_bathing 3 | 4 | 5 | 6 | __all__ = ["Pipeline", "py_bathing"] 7 | -------------------------------------------------------------------------------- /databathing/pipeline.py: -------------------------------------------------------------------------------- 1 | from curses import nonl 2 | # from mo_sql_parsing import parse 3 | from mo_sql_parsing import parse_bigquery as parse 4 | from mo_sql_parsing import format 5 | import json 6 | import copy 7 | 8 | from databathing.py_bathing import py_bathing 9 | 10 | 11 | class Pipeline: 12 | def __init__(self, query): 13 | # print(query) 14 | self.parsed_whole_query = parse(query) 15 | self.parsed_json_whole_query = json.loads(json.dumps(self.parsed_whole_query,indent=4)) 16 | self.parsed_json_whole_query = self.parsed_json_whole_query 17 | self.with_ans = "" 18 | self.last_ans = "" 19 | 20 | def gen_with_pipeline(self, query): 21 | if "with" in query: 22 | with_stmts = query["with"] 23 | if type(with_stmts) is dict: 24 | self.gen_with_pipeline(with_stmts) 25 | else: 26 | for with_stmt in with_stmts: 27 | self.gen_with_pipeline(with_stmt) 28 | else: 29 | dbing = py_bathing(query["value"]) 30 | self.with_ans += query["name"] + " = " + dbing.parse() + "\n\n" 31 | 32 | 33 | def gen_last_pipeline(self, query): 34 | tmp_query = copy.deepcopy(query) 35 | 36 | if "with" in query: 37 | del tmp_query["with"] 38 | 39 | dbing = py_bathing(tmp_query) 40 | self.last_ans = "final_df = " + dbing.parse() + "\n\n" 41 | 42 | def parse(self): 43 | final_ans = "" 44 | if "with" in self.parsed_json_whole_query: 45 | self.gen_with_pipeline(self.parsed_json_whole_query) 46 | final_ans += self.with_ans 47 | self.gen_last_pipeline(self.parsed_json_whole_query) 48 | final_ans += self.last_ans 49 | return final_ans 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | # query = """ 60 | 61 
| # select 62 | # df1.firstname, 63 | # count(*) cnt 64 | # from df as df1 65 | # inner join df as df2 66 | # on df1.firstname = df2.firstname 67 | # group by df1.firstname 68 | # having cnt > 0 69 | 70 | # """ 71 | 72 | # query = """ 73 | # with step1 as ( 74 | # select firstname, id from df 75 | # ), step2 as ( 76 | # select gender, salary, id from df 77 | # ), step3 as ( 78 | # select 79 | # s1.id, s1.firstname, s2.gender, s2.salary 80 | # from step1 as s1 81 | # inner join step2 as s2 82 | # on s1.id = s2.id 83 | # ) 84 | # select 85 | # *, 86 | # RANK() OVER (PARTITION BY id ORDER BY salary DESC) AS seq 87 | # from step3 88 | # """ 89 | 90 | # query = """ 91 | # SELECT b.id, b.title, a.last_name AS author, e.last_name AS editor, 92 | # t.last_name AS translator 93 | # FROM book b 94 | # LEFT JOIN author a 95 | # ON b.author_id = a.id 96 | # LEFT JOIN editor e 97 | # ON b.editor_id = e.id 98 | # LEFT JOIN translator t 99 | # ON b.translator_id = t.id 100 | # ORDER BY b.id, a.id desc 101 | # """ 102 | 103 | 104 | 105 | # query = """ 106 | # WITH namePreDF AS ( 107 | # SELECT 108 | # distinct glbl_ptnt_id, 109 | # patient_name, 110 | # struct(split(patient_name, ',')[0] as firstname, split(patient_name, ',')[1] as lastname) as patient_name_info 111 | # FROM overviewDF 112 | # WHERE patient_name != '' 113 | # ORDER BY filled_date desc 114 | # ) 115 | # SELECT 116 | # glbl_ptnt_id, 117 | # collect_set(patient_name_info) as patient_name_info 118 | # FROM namePreDF 119 | # GROUP BY glbl_ptnt_id 120 | # """ 121 | 122 | # query = """ 123 | # SELECT 124 | # struct(firstname as firstname, lastname as lastname) as name 125 | # FROM df 126 | # """ 127 | 128 | # pipeline = Pipeline(query) 129 | 130 | # ans = pipeline.parse() 131 | # print(ans) -------------------------------------------------------------------------------- /databathing/py_bathing.py: -------------------------------------------------------------------------------- 1 | from curses import nonl 2 | # from mo_sql_parsing import parse 3 | from mo_sql_parsing import parse_bigquery as parse 4 | from mo_sql_parsing import format 5 | import json 6 | 7 | 8 | class py_bathing: 9 | def __init__(self, parsed_json_whole_query): 10 | self.parsed_json_whole_query = parsed_json_whole_query 11 | 12 | self.distinct_flag = False 13 | self.from_ans = "" 14 | self.select_ans = "" 15 | self.level_select = 0 16 | self.where_ans = "" 17 | self.groupby_ans = "" 18 | self.limit_ans = "" 19 | self.agg_ans = "" 20 | self.having_ans = "" 21 | self.orderby_ans = "" 22 | self.agg_list = ["sum", "avg", "max", "min", "mean", "count", "collect_list", "collect_set"] 23 | 24 | 25 | def _from_analyze(self, from_stmt): 26 | if not from_stmt: 27 | return 28 | if type(from_stmt) is str: 29 | self.from_ans += format({ "from": from_stmt })[5:] 30 | elif type(from_stmt) is dict: 31 | if "name" in from_stmt.keys(): 32 | self.from_ans += from_stmt['value']+".alias(\""+from_stmt['name']+"\")." 
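            # Note on the join branches below: each one appends a chained PySpark call of the
            # form join(<right table>.alias("<name>"), col("<left key>")==col("<right key>"), "<how>")
            # followed by a trailing "." so that later clauses (filter, groupBy, agg, selectExpr, ...)
            # can keep chaining onto the generated string; the left-hand table itself comes from the
            # plain "name"/"value" entry handled above.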
33 | elif "left join" in from_stmt.keys(): 34 | self.from_ans += "join({}, {}, \"{}\").".format( 35 | from_stmt['left join']['value']+".alias(\""+from_stmt['left join']['name']+"\")", 36 | "col(\""+str(from_stmt['on']['eq'][0])+"\")" + "==" + "col(\""+str(from_stmt['on']['eq'][1])+"\")" , 37 | 'left') 38 | elif "inner join" in from_stmt.keys(): 39 | self.from_ans += "join({}, {}, \"{}\").".format( 40 | from_stmt['inner join']['value']+".alias(\""+from_stmt['inner join']['name']+"\")", 41 | "col(\""+str(from_stmt['on']['eq'][0])+"\")" + "==" + "col(\""+str(from_stmt['on']['eq'][1])+"\")" , 42 | 'inner') 43 | elif "right join" in from_stmt.keys(): 44 | self.from_ans += "join({}, {}, \"{}\").".format( 45 | from_stmt['right join']['value']+".alias(\""+from_stmt['right join']['name']+"\")", 46 | "col(\""+str(from_stmt['on']['eq'][0])+"\")" + "==" + "col(\""+str(from_stmt['on']['eq'][1])+"\")" , 47 | 'right') 48 | 49 | elif type(from_stmt) is list: 50 | for item_from in from_stmt: 51 | self._from_analyze(item_from) 52 | 53 | def _select_analyze(self, select_stmt): 54 | # print(select_stmt) 55 | 56 | if not select_stmt: 57 | return 58 | 59 | if type(select_stmt) is str: 60 | self.select_ans += "\"" + format({ "select": select_stmt })[7:] + "\"," 61 | return 62 | if type(select_stmt) is dict and type(select_stmt['value']) is str: 63 | self.select_ans += "\"" + format({ "select": select_stmt })[7:] + "\"," 64 | return 65 | if type(select_stmt) is dict: 66 | if list(select_stmt["value"].keys())[0].lower() in self.agg_list: 67 | self.select_ans += "\""+ select_stmt['name'] +"\"," 68 | elif list(select_stmt["value"].keys())[0].lower() == "create_struct": 69 | self.select_ans += "\"" + format({ "select": select_stmt })[14:] + "\"," 70 | else: 71 | self.select_ans += "\"" + format({ "select": select_stmt })[7:] + "\"," 72 | elif type(select_stmt) is list and (self.level_select == 0): 73 | self.level_select += 1 74 | for inner_item in select_stmt: 75 | self._select_analyze(inner_item) 76 | 77 | 78 | def _where_analyze(self, where_stmt): 79 | self.where_ans = format({ "where": where_stmt })[6:] 80 | 81 | def _groupby_analyze(self, groupby_stmt): 82 | self.groupby_ans = format({ "groupby": groupby_stmt })[9:] 83 | 84 | def _agg_analyze(self, agg_stmt): 85 | if type(agg_stmt) is dict: 86 | if type(agg_stmt["value"]) is dict and list(agg_stmt["value"].keys())[0].lower() in self.agg_list: 87 | for funct, alias in agg_stmt["value"].items(): 88 | self.agg_ans += "{}(col(\"{}\")).alias(\"{}\"),".format(funct, alias, agg_stmt["name"]) 89 | 90 | elif type(agg_stmt) is list: 91 | for item in agg_stmt: 92 | self._agg_analyze(item) 93 | 94 | # if type(item["value"]) is dict and list(item["value"].keys())[0].lower() in self.agg_list: 95 | # for funct, alias in item["value"].items(): 96 | # self.agg_ans += "{}(col(\"{}\")).alias(\"{}\"),".format(funct, alias, item["name"]) 97 | 98 | self.agg_ans = self.agg_ans.replace("\n", "") 99 | 100 | 101 | def _having_analyze(self, having_stmt): 102 | self.having_ans = format({ "having": having_stmt })[7:] 103 | 104 | def _orderby_analyze(self, order_stmt): 105 | # print(order_stmt) 106 | if type(order_stmt) is dict: 107 | odr = "desc()" if order_stmt.get("sort", "asc") == "desc" else "asc()" 108 | self.orderby_ans += "col(\"{}\").{},".format(str(order_stmt["value"]), odr) 109 | else: 110 | for item in order_stmt: 111 | self._orderby_analyze(item) 112 | # for item in order_stmt: 113 | # odr = "desc()" if item.get("sort", "asc") == "desc" else "asc()" 114 | # self.orderby_ans += 
"col(\"{}\").{},".format(str(item["value"]), odr) 115 | 116 | def _limit_analyze(self, limit_stmt): 117 | self.limit_ans = limit_stmt 118 | 119 | def parse(self): 120 | from_final_ans = where_final_ans = groupby_final_ans = agg_final_ans = select_final_ans = orderby_final_ans = limit_final_ans = having_final_ans = "" 121 | 122 | for method, stmt in self.parsed_json_whole_query.items(): 123 | # handle from 124 | if str(method).lower() == "from": 125 | self._from_analyze(stmt) 126 | from_final_ans = self.from_ans[:-1] if self.from_ans[-1] == '.' else self.from_ans 127 | 128 | #handle where 129 | elif str(method).lower() == "where": 130 | self._where_analyze(stmt) 131 | where_final_ans = self.where_ans 132 | 133 | #handle groupby and agg 134 | elif str(method).lower() == "groupby": 135 | # group by 136 | self._groupby_analyze(stmt) 137 | groupby_final_ans = self.groupby_ans 138 | # agg 139 | agg_stmt = self.parsed_json_whole_query["select"] \ 140 | if "select" in self.parsed_json_whole_query.keys() \ 141 | else self.parsed_json_whole_query["select_distinct"] 142 | self._agg_analyze(agg_stmt) 143 | agg_final_ans = self.agg_ans[:-1] 144 | 145 | #handle select 146 | elif str(method).lower() in ["select", "select_distinct"]: 147 | self._select_analyze(stmt) 148 | select_final_ans = self.select_ans[:-1] 149 | self.distinct_flag = True if str(method) == "select_distinct" else False 150 | 151 | # handle having 152 | elif str(method) =="having": 153 | self._having_analyze(stmt) 154 | having_final_ans = self.having_ans 155 | 156 | #handle sort 157 | elif str(method) =="orderby": 158 | self._orderby_analyze(stmt) 159 | orderby_final_ans = self.orderby_ans[:-1] 160 | 161 | #handle limit 162 | elif str(method).lower() =="limit": 163 | self._limit_analyze(stmt) 164 | limit_final_ans = self.limit_ans 165 | 166 | final_ans = "" 167 | if from_final_ans: 168 | final_ans += from_final_ans + "\\" 169 | if where_final_ans: 170 | final_ans += "\n.filter(\"{}\")\\".format(where_final_ans) 171 | if groupby_final_ans: 172 | final_ans += "\n.groupBy(\"{}\")\\".format(groupby_final_ans) 173 | if agg_final_ans: 174 | final_ans += "\n.agg({})\\".format(agg_final_ans) 175 | if having_final_ans: 176 | final_ans += "\n.filter(\"{}\")\\".format(having_final_ans) 177 | if select_final_ans: 178 | final_ans += "\n.selectExpr({})\\".format(select_final_ans) 179 | if self.distinct_flag: 180 | final_ans += "\n.distinct()\\" 181 | if orderby_final_ans: 182 | final_ans += "\n.orderBy("+orderby_final_ans+")\\" 183 | if limit_final_ans: 184 | final_ans += "\n.limit("+str(limit_final_ans)+")\\" 185 | 186 | return final_ans[:-1] 187 | 188 | 189 | 190 | 191 | # query = """ 192 | # SELECT 193 | # distinct glbl_ptnt_id, 194 | # patient_name, 195 | # split(patient_name, ',')[0] as first_name, 196 | # split(patient_name, ',')[1] as last_name, 197 | # struct(first_name as firstname, last_name as lastname) as patient_name_info 198 | # FROM overviewDF 199 | # WHERE patient_name != '' 200 | # ORDER BY filled_date desc 201 | # """ 202 | 203 | 204 | # parsed_whole_query = parse(query) 205 | # parsed_json_whole_query = json.loads(json.dumps(parsed_whole_query,indent=4)) 206 | 207 | # # print(parsed_json_whole_query) 208 | 209 | # dbing = py_bathing(parsed_json_whole_query) 210 | # ans = dbing.parse() 211 | # print(ans) -------------------------------------------------------------------------------- /databathing/v1.py: -------------------------------------------------------------------------------- 1 | from curses import nonl 2 | # from 
mo_sql_parsing import parse 3 | from mo_sql_parsing import parse_bigquery as parse 4 | 5 | from mo_sql_parsing import format 6 | import json 7 | 8 | # query = """ 9 | # SELECT product_id as new_product_id, 10 | # Count(star_rating) as total_rating, 11 | # Max(star_rating) AS best_rating, 12 | # Min(star_rating) AS worst_rating, 13 | # ROW_NUMBER() OVER (PARTITION BY firstname ORDER BY salary DESC) AS SEQUENCE 14 | # FROM tbl_books 15 | # WHERE verified_purchase = 'Y' 16 | # AND review_date BETWEEN '1995-07-22' AND '2015-08-31' 17 | # AND marketplace IN ( 'DE', 'US', 'UK', 'FR', 'JP' ) 18 | # GROUP BY product_id 19 | # ORDER BY total_rating asc,product_id desc,best_rating 20 | # LIMIT 10; 21 | # """ 22 | 23 | # query = """select distinct 24 | # firstname, 25 | # lastname, 26 | # case when gender == "M" then "m" else "f" end as new, 27 | # ROW_NUMBER() OVER (PARTITION BY firstname ORDER BY salary DESC) AS SEQUENCE 28 | # from test1 t1 29 | # where t1.cc = 'cc'""" 30 | 31 | 32 | query = """ 33 | select distinct t1 as tt2, t2 as tt2 34 | from test 35 | """ 36 | 37 | # query = """select 38 | # t1.a, t2.b 39 | # from test1 t1 40 | # left join test2 t2 41 | # on t1.a = t2.a 42 | # inner join test3 t3 43 | # on t2.b = t3.b 44 | # where t1.cc = 'cc'""" 45 | 46 | # query = """select 47 | # t1.a, t2.b 48 | # from test1 t1 49 | # where t1.cc = 'cc'""" 50 | 51 | # query = """ 52 | # with tmp as ( 53 | # SELECT product_id as new_product_id, 54 | # Count(star_rating) as total_rating, 55 | # Max(star_rating) AS best_rating, 56 | # Min(star_rating) AS worst_rating, 57 | # ROW_NUMBER() OVER (PARTITION BY firstname ORDER BY salary DESC) AS SEQUENCE 58 | # FROM tbl_books 59 | # WHERE verified_purchase = 'Y' 60 | # AND review_date BETWEEN '1995-07-22' AND '2015-08-31' 61 | # AND marketplace IN ( 'DE', 'US', 'UK', 'FR', 'JP' ) 62 | # GROUP BY product_id 63 | # ORDER BY total_rating asc,product_id desc,best_rating 64 | # LIMIT 10 65 | # ), aa as (select * from tmp) 66 | # select * from aa 67 | # """ 68 | 69 | 70 | # query = """ 71 | # with step1 as ( 72 | # select * from t1 73 | # ), step2 as ( 74 | # select * from t2 75 | # ), step3 as ( 76 | # select 77 | # s1.a, s2.b 78 | # from step1 as s1 79 | # inner join step2 as s2 80 | # on s1.a = s2.a 81 | # ) 82 | # select 83 | # a, b, 84 | # ROW_NUMBER() OVER (PARTITION BY a ORDER BY b DESC) AS seq 85 | # from step3 86 | # """ 87 | 88 | 89 | v_parse = parse(query) 90 | v_json = json.loads(json.dumps(v_parse,indent=4)) 91 | print(v_json) 92 | 93 | distinct_flag = False 94 | 95 | result_from="" 96 | def fn_from(value): 97 | global result_from 98 | if not value: 99 | return 100 | if type(value) is str: 101 | result_from += format({ "from": value })[5:] 102 | elif type(value) is dict: 103 | if "name" in value.keys(): 104 | result_from += value['value']+".alias(\""+value['name']+"\")." 
105 | elif "left join" in value.keys(): 106 | result_from += "join({}, {}, \"{}\").".format( 107 | value['left join']['value']+".alias(\""+value['left join']['name']+"\")", 108 | "col(\""+str(value['on']['eq'][0])+"\")" + "===" + "col(\""+str(value['on']['eq'][1])+"\")" , 109 | 'left') 110 | elif "inner join" in value.keys(): 111 | result_from += "join({}, {}, \"{}\").".format( 112 | value['inner join']['value']+".alias(\""+value['inner join']['name']+"\")", 113 | "col(\""+str(value['on']['eq'][0])+"\")" + "===" + "col(\""+str(value['on']['eq'][1])+"\")" , 114 | 'inner') 115 | elif "right join" in value.keys(): 116 | result_from += "join({}, {}, \"{}\").".format( 117 | value['right join']['value']+".alias(\""+value['right join']['name']+"\")", 118 | "col(\""+str(value['on']['eq'][0])+"\")" + "===" + "col(\""+str(value['on']['eq'][1])+"\")" , 119 | 'right') 120 | 121 | elif type(value) is list: 122 | for item_from in value: 123 | fn_from(item_from) 124 | 125 | 126 | 127 | 128 | # def fn_from(value): 129 | # print("------") 130 | # print(value) 131 | # print("------") 132 | # result_from="" 133 | # if type(value) is str: 134 | # result_from = format({ "from": value }) 135 | # result_from = result_from[5:] 136 | # # elif type(value) is dict: 137 | # # if "name" in value.keys(): 138 | # # result_from = result_from + value['value']+".alias(\""+value['name']+"\")" 139 | # # else: 140 | # # result_from = result_from + value['value']+"" 141 | # elif type(value) is list: 142 | # for item_from in value: 143 | # if type(item_from) is dict: 144 | # if "name" in item_from.keys(): 145 | # result_from = result_from + item_from['value']+".alias(\""+item_from['name']+"\")," 146 | # else: 147 | # result_from = result_from + item_from['value']+"," 148 | # elif type(item_from) is str: 149 | # result_from = result_from + item_from+"," 150 | # return result_from 151 | 152 | 153 | agg_list = ["sum", "avg", "max", "min", "mean", "count"] 154 | result_select = "" 155 | level_select =0 156 | def fn_select(value): 157 | global distinct_flag 158 | global result_select 159 | global level_select 160 | if not value: 161 | return 162 | 163 | if type(value) is str: 164 | result_select += "\"" + format({ "select": value })[7:] + "\"," 165 | return 166 | if type(value) is dict and type(value['value']) is str: 167 | result_select += "\"" + format({ "select": value })[7:] + "\"," 168 | return 169 | if type(value) is dict: 170 | if "distinct" in value["value"].keys(): 171 | distinct_flag = True 172 | level_select += 1 173 | fn_select(value["value"]["distinct"]) 174 | elif list(value["value"].keys())[0].lower() in agg_list: 175 | result_select += "\""+ value['name'] +"\"," 176 | else: 177 | result_select += "\"" + format({ "select": value })[7:] + "\"," 178 | elif type(value) is list and (level_select == 0 or (level_select == 1 and distinct_flag)): 179 | for inner_item in value: 180 | fn_select(inner_item) 181 | 182 | def fn_where(value): 183 | result_where="" 184 | result_where = format({ "where": value })[6:] 185 | return result_where 186 | 187 | 188 | def fn_groupby(value): 189 | result_groupby="" 190 | result_groupby = format({ "groupby": value })[9:] 191 | return result_groupby 192 | 193 | def fn_agg(query): 194 | v_parse = parse(query) 195 | v_agg = "" 196 | for i in v_parse["select"]: 197 | if type(i["value"]) is dict: 198 | for key,value in i["value"].items(): 199 | v_agg = v_agg + (key+"("+"col(\""+str(value)+"\")"+").alias('"+i["name"]+"')") +"," 200 | v_agg = v_agg.replace("\n", "") 201 | return v_agg[:-1] 202 | 203 
| 204 | def fn_orderby(query): 205 | v_parse = parse(query) 206 | v_orderby_collist="" 207 | v_orderby = v_parse["orderby"] 208 | for i in v_orderby: 209 | if i.get("sort", "asc") == "desc": 210 | v_sortorder = "desc()" 211 | else: 212 | v_sortorder = "asc()" 213 | v_orderby_collist = v_orderby_collist + "col(\""+str(i.get("value", ""))+"\")." +v_sortorder+"," 214 | return v_orderby_collist[:-1] 215 | 216 | 217 | def fn_limit(query): 218 | v_parse = parse(query) 219 | v_limit = v_parse["limit"] 220 | return v_limit 221 | 222 | 223 | def fn_genSQL(data): 224 | v_fn_from = v_fn_where = v_fn_groupby = v_fn_agg = v_fn_select = v_fn_orderby = v_fn_limit = "" 225 | for key,value in data.items(): 226 | # handle from 227 | if str(key)=="from": 228 | fn_from(value) 229 | v_fn_from = result_from[:-1] 230 | 231 | #handle where 232 | if str(key) =="where": 233 | v_fn_where = fn_where(value) 234 | 235 | #handle groupby 236 | if str(key) =="groupby": 237 | v_fn_groupby = fn_groupby(value) 238 | 239 | #handle agg 240 | if str(key) =="groupby": 241 | v_fn_agg = fn_agg(query) 242 | 243 | #handle select 244 | if str(key) =="select": 245 | fn_select(value) 246 | v_fn_select = result_select[:-1] 247 | 248 | 249 | #handle sort 250 | if str(key) =="orderby": 251 | v_fn_orderby = fn_orderby(query) 252 | 253 | #handle limit 254 | if str(key) =="limit": 255 | v_fn_limit = fn_limit(query) 256 | 257 | v_final_stmt = "" 258 | if v_fn_from: 259 | v_final_stmt = v_final_stmt + v_fn_from 260 | if v_fn_where: 261 | v_final_stmt = v_final_stmt + "\n.filter(\""+v_fn_where+"\")" 262 | if v_fn_groupby: 263 | v_final_stmt = v_final_stmt + "\n.groupBy(\""+v_fn_groupby+"\")" 264 | if v_fn_agg: 265 | v_final_stmt = v_final_stmt + "\n.agg("+v_fn_agg+"\")" 266 | if v_fn_select: 267 | v_final_stmt = v_final_stmt + "\n.selectExpr("+v_fn_select+")" 268 | if distinct_flag: 269 | v_final_stmt = v_final_stmt + "\n.distinct()" 270 | if v_fn_orderby: 271 | v_final_stmt = v_final_stmt + "\n.orderBy("+v_fn_orderby+")" 272 | if v_fn_limit: 273 | v_final_stmt = v_final_stmt + "\n.limit("+str(v_fn_limit)+")" 274 | 275 | return v_final_stmt 276 | 277 | 278 | print (fn_genSQL(v_json)) 279 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mo-sql-parsing -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file=README.md 3 | license_files=LICENSE -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup( 4 | name='databathing', 5 | version='0.2.1', 6 | description="build spark job based on query", 7 | author="Jiazhen Zhu", 8 | author_email="jason.jz.zhu@gmail.com", 9 | classifiers=[ 10 | "Development Status :: 3 - Alpha", 11 | "Topic :: Software Development :: Libraries", 12 | "Topic :: Software Development :: Libraries :: Python Modules", 13 | "Programming Language :: SQL","Programming Language :: Python :: 3.7", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Python :: 3.9", 16 | 'License :: OSI Approved :: MIT License'], 17 | license="MIT", 18 | packages=['databathing'], 19 | install_requires=[ 20 | 'mo-sql-parsing', 21 | ], 22 | long_description='# Convert SQL to Spark 
Code!\n\n[![PyPI Latest Release]', 23 | long_description_content_type='text/markdown' 24 | ) 25 | -------------------------------------------------------------------------------- /setuptools.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jason-jz-zhu/databathing/67674f81912b562b31f1cfa5bbed6602ecc2908f/setuptools.json -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # More Query Parsing Tests 2 | 3 | The test suite has over 400 tests. 4 | 5 | ## Running Tests 6 | 7 | For __Linux__: 8 | 9 | git clone https://github.com/jason-jz-zhu/databathing.git 10 | cd databathing 11 | pip install -r requirements.txt 12 | pip install -r tests/requirements.txt 13 | export PYTHONPATH=. 14 | python -m unittest discover tests -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jason-jz-zhu/databathing/67674f81912b562b31f1cfa5bbed6602ecc2908f/tests/__init__.py -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | mo-sql-parsing -------------------------------------------------------------------------------- /tests/test_distinct.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestDistinct(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT distinct a, b, c 17 | FROM Test 18 | """ 19 | pipeline = Pipeline(sql) 20 | ans = pipeline.parse() 21 | expected = """final_df = Test\\\n.selectExpr("a","b","c")\\\n.distinct()\n\n""" 22 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_groupby_having.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | class TestGroupbyHaving(TestCase): 10 | def test_decisive_equailty(self): 11 | 12 | sql = """ 13 | select 14 | product_id, 15 | count(*) cnt 16 | from Test 17 | group by product_id 18 | having cnt > 1 19 | """ 20 | 21 | pipeline = Pipeline(sql) 22 | ans = pipeline.parse() 23 | expected = """final_df = Test\\\n.groupBy("product_id")\\\n.agg(count(col("*")).alias("cnt"))\\\n.filter("cnt > 1")\\\n.selectExpr("product_id","cnt")\n\n""" 24 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_join_on.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | 
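# A reading of the expected string asserted below: databathing rewrites the SQL LEFT JOIN as a
# PySpark DataFrame chain - each aliased table becomes Test.alias("t1") / Test.alias("t2"), the ON
# clause becomes the join condition col("t1.id")==col("t2.id") with join type "left", and the
# column aliases are projected through .selectExpr(...).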
class TestJoinOn(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT 17 | t1.id as id, 18 | t1.val as t1_val, 19 | t2.val as t1_val 20 | FROM Test t1 21 | LEFT JOIN Test t2 22 | ON t1.id = t2.id 23 | """ 24 | pipeline = Pipeline(sql) 25 | ans = pipeline.parse() 26 | expected = """final_df = Test.alias("t1").join(Test.alias("t2"), col("t1.id")==col("t2.id"), "left")\\\n.selectExpr("t1.id AS id","t1.val AS t1_val","t2.val AS t1_val")\n\n""" 27 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_orderby.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestOrderby(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT id, name 17 | FROM Test 18 | ORDER BY id, name 19 | """ 20 | pipeline = Pipeline(sql) 21 | ans = pipeline.parse() 22 | expected = """final_df = Test\\\n.selectExpr("id","name")\\\n.orderBy(col("id").asc(),col("name").asc())\n\n""" 23 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_select_from_where.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestSelectFromWhere(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT a, b, c 17 | FROM Test 18 | WHERE info = 1 19 | """ 20 | pipeline = Pipeline(sql) 21 | ans = pipeline.parse() 22 | expected = """final_df = Test\\\n.filter("info = 1")\\\n.selectExpr("a","b","c")\n\n""" 23 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_split.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestDistinct(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT 17 | split(name, ",")[0] as first_name 18 | FROM df 19 | """ 20 | pipeline = Pipeline(sql) 21 | ans = pipeline.parse() 22 | expected = """final_df = df\\\n.selectExpr("SPLIT(name, ',')[0] AS first_name")\n\n""" 23 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_struct.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestDistinct(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT 17 | struct(firstname as firstname, 
lastname as lastname) as name 18 | FROM df 19 | """ 20 | pipeline = Pipeline(sql) 21 | ans = pipeline.parse() 22 | expected = """final_df = df\\\n.selectExpr("STRUCT(firstname AS firstname, lastname AS lastname) AS name")\n\n""" 23 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_windows.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestWindows(TestCase): 13 | def test_decisive_equailty(self): 14 | 15 | sql = """ 16 | SELECT 17 | name, 18 | ROW_NUMBER() OVER (PARTITION BY firstname ORDER BY salary DESC) AS SEQUENCE 19 | FROM Test 20 | """ 21 | pipeline = Pipeline(sql) 22 | ans = pipeline.parse() 23 | expected = """final_df = Test\\\n.selectExpr("name","ROW_NUMBER() OVER (PARTITION BY firstname ORDER BY salary DESC) AS SEQUENCE")\n\n""" 24 | self.assertEqual(ans, expected) -------------------------------------------------------------------------------- /tests/test_with.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, unicode_literals 2 | 3 | from unittest import TestCase 4 | from mo_sql_parsing import parse 5 | 6 | import json 7 | from databathing.pipeline import Pipeline 8 | 9 | 10 | # python -m unittest discover tests 11 | 12 | class TestWith(TestCase): 13 | maxDiff = None 14 | def test_decisive_equailty(self): 15 | 16 | sql = """ 17 | with step1 as ( 18 | select firstname, id from df 19 | ), step2 as ( 20 | select gender, salary, id from df 21 | ), step3 as ( 22 | select 23 | s1.id, s1.firstname, s2.gender, s2.salary 24 | from step1 as s1 25 | inner join step2 as s2 26 | on s1.id = s2.id 27 | ) 28 | select 29 | * 30 | from step3 31 | """ 32 | pipeline = Pipeline(sql) 33 | ans = pipeline.parse() 34 | expected = """step1 = df\\\n.selectExpr("firstname","id")\n\nstep2 = df\\\n.selectExpr("gender","salary","id")\n\nstep3 = step1.alias("s1").join(step2.alias("s2"), col("s1.id")==col("s2.id"), "inner")\\\n.selectExpr("s1.id","s1.firstname","s2.gender","s2.salary")\n\nfinal_df = step3\\\n.selectExpr("*")\n\n""" 35 | self.assertEqual(ans, expected) --------------------------------------------------------------------------------