├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── bq_sql_gen.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | *.key 132 | *.json 133 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | 3 | COPY ./requirements.txt /app/ 4 | COPY ./bq_sql_gen.py /app/ 5 | WORKDIR /app 6 | RUN pip install -r requirements.txt 7 | ENTRYPOINT [ "python", "bq_sql_gen.py" ] 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Masahiro Yamauchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bigquery-generator-ai 2 | 3 | A tool that generates BigQuery SQL from natural-language instructions using ChatGPT. 4 | Just write the table name(s) and what you want the query to achieve, and the SQL will be generated. 5 | It uses the table schema, rather than the data in the table, to understand the data structure. 6 | 7 | https://github.com/algas/bigquery-generator-ai 8 | 9 | An OpenAI (ChatGPT) account is required to use it. 10 | You will also need to configure your Google Cloud environment and download service account credentials so the tool can read table schemas from BigQuery. 11 | 12 | ## Setup 13 | 14 | ### ChatGPT API 15 | 16 | 1. Sign up for ChatGPT 17 | https://platform.openai.com/signup 18 | 1. Create an OpenAI API key (and keep it somewhere safe) 19 | https://platform.openai.com/account/api-keys 20 | 1. Set your API key in the environment variable 21 | `export OPENAI_API_KEY=xxxxxx` 22 | 23 | ### Google Cloud 24 | 25 | 1. Set up BigQuery 26 | https://cloud.google.com/bigquery/docs/quickstarts/query-public-dataset-console 27 | 1. 
Create a service account 28 | https://cloud.google.com/iam/docs/service-accounts-create 29 | 1. Grant the "BigQuery Metadata Viewer" role (roles/bigquery.metadataViewer) to the service account 30 | https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer 31 | https://cloud.google.com/iam/docs/manage-access-service-accounts#grant-single-role 32 | 1. Create a service account key (and save it to `./credential.json`) 33 | https://cloud.google.com/iam/docs/keys-create-delete#iam-service-account-keys-create-console 34 | 1. Set the path to the credential file in the environment variable 35 | `export GOOGLE_APPLICATION_CREDENTIALS=$PWD/credential.json` 36 | 37 | ## Usage 38 | 39 | ```sh 40 | docker run --rm -e OPENAI_API_KEY=$OPENAI_API_KEY \ 41 | -e GOOGLE_APPLICATION_CREDENTIALS=/app/credential.json \ 42 | -v $GOOGLE_APPLICATION_CREDENTIALS:/app/credential.json \ 43 | -it algas/bigquery-generator-ai:latest \ 44 | 'Instruction' \ 45 | 'BigQuery Table' \ 46 | ['Optional BigQuery Tables'] 47 | ``` 48 | 49 | ### Example 50 | 51 | ```sh 52 | docker run --rm -e OPENAI_API_KEY=$OPENAI_API_KEY \ 53 | -e GOOGLE_APPLICATION_CREDENTIALS=/app/credential.json \ 54 | -v $GOOGLE_APPLICATION_CREDENTIALS:/app/credential.json \ 55 | -it algas/bigquery-generator-ai:latest \ 56 | 'Retrieve the names of customers who purchased products in March 2018.' \ 57 | 'dbt-tutorial.jaffle_shop.customers' \ 58 | 'dbt-tutorial.jaffle_shop.orders' 59 | ``` 60 | 61 | ### Example Result 62 | 63 | ```sql 64 | SELECT c.FIRST_NAME, c.LAST_NAME 65 | FROM `dbt-tutorial.jaffle_shop.customers` c 66 | INNER JOIN `dbt-tutorial.jaffle_shop.orders` o 67 | ON c.ID = o.USER_ID 68 | WHERE EXTRACT(MONTH FROM o.ORDER_DATE) = 3 69 | AND EXTRACT(YEAR FROM o.ORDER_DATE) = 2018; 70 | ``` 71 | 72 | ## Build 73 | 74 | If you want to run the code in your own Python environment without Docker, follow these steps. 75 | 76 | 1. Clone the Git repository 77 | `git clone https://github.com/algas/bigquery-generator-ai.git` 78 | 1. Install the dependencies 79 | `pip install -r requirements.txt` 80 | 1. Run the script 81 | ```sh 82 | python bq_sql_gen.py \ 83 | 'Retrieve the names of customers who purchased products in March 2018.' \ 84 | 'dbt-tutorial.jaffle_shop.customers' \ 85 | 'dbt-tutorial.jaffle_shop.orders' 86 | ``` 87 | 88 | ## Note 89 | 90 | - Adding `-v` or `--verbose` at the end of the command will also print the contents of the generated prompt. 91 | - It may not output correct SQL if the instructions are complex or include statements unrelated to the query. 92 | -------------------------------------------------------------------------------- /bq_sql_gen.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from langchain import PromptTemplate, OpenAI, LLMChain 4 | from google.cloud import bigquery 5 | import json 6 | import argparse 7 | 8 | TEMPLATE = ''' 9 | Write a BigQuery SQL that achieves the following. 10 | ``` 11 | {{ content }} 12 | ``` 13 | 14 | The format of the target tables is as follows. 15 | ```json 16 | {{ schema }} 17 | ``` 18 | 19 | Output the SQL in raw text. 
20 | ''' 21 | 22 | def get_schema(table_name: str) -> dict:  # fetch one table's metadata and return it as a plain dict 23 | client = bigquery.Client() 24 | table = client.get_table(table_name) 25 | project_id = table.project 26 | dataset_id = table.dataset_id 27 | table_id = table.table_id 28 | schema = list(map(lambda x: x.to_api_repr(), table.schema))  # serialize each SchemaField to its API representation 29 | return {'project': project_id, 'dataset': dataset_id, 'table': table_id, 'schema': schema} 30 | 31 | def get_schemas(table_names: list[str]) -> str:  # JSON-encode the schemas of all requested tables 32 | return json.dumps([get_schema(n) for n in table_names]) 33 | 34 | def predict(content: str, table_names: list[str], verbose: bool = False) -> str: 35 | prompt = PromptTemplate(  # Jinja2 prompt filled with the instruction and the table schemas 36 | input_variables=["content","schema"], 37 | template=TEMPLATE, 38 | template_format='jinja2', 39 | ) 40 | llm_chain = LLMChain( 41 | llm=OpenAI(temperature=0),  # temperature 0 for more deterministic SQL output 42 | prompt=prompt, 43 | verbose=verbose, 44 | ) 45 | return llm_chain.predict(content=content, schema=get_schemas(table_names)) 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser(description='BigQuery SQL generator with ChatGPT.') 49 | parser.add_argument('-v', '--verbose', action='store_true') 50 | parser.add_argument('content') 51 | parser.add_argument('table_name', nargs='+') 52 | args = parser.parse_args() 53 | print(predict(args.content, args.table_name, args.verbose)) 54 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.116 2 | openai==0.27.2 3 | google-cloud-bigquery==3.7.0 4 | jinja2==3.1.2 5 | --------------------------------------------------------------------------------
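Beyond the CLI shown in the README, `predict()` can also be called directly from Python. The following is a minimal, untested sketch: it assumes the `OPENAI_API_KEY` and `GOOGLE_APPLICATION_CREDENTIALS` environment variables are exported as described in the Setup section, and the `dbt-tutorial` table names are only the illustrative examples taken from the README.

```python
# Minimal sketch (assumes the environment variables from the Setup section are set
# and that the example dbt-tutorial tables are readable with your credentials).
from bq_sql_gen import predict

sql = predict(
    content='Retrieve the names of customers who purchased products in March 2018.',
    table_names=[
        'dbt-tutorial.jaffle_shop.customers',
        'dbt-tutorial.jaffle_shop.orders',
    ],
    verbose=True,  # also prints the rendered prompt, like the -v/--verbose CLI flag
)
print(sql)
```

This follows the same code path as the Docker entrypoint: the table schemas are fetched with the BigQuery client, JSON-encoded by `get_schemas()`, and injected into the Jinja2 prompt template before the completion is requested.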