├── .gitignore ├── LICENSE ├── README.md ├── assets ├── bigpato.gif └── bigpato_arch.png ├── bigpato ├── __init__.py ├── bigpato.py └── common │ ├── __init__.py │ └── constants.py ├── launch_build_install_package_source.sh ├── sample └── sample_streamlit_app.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | #Others
163 | 
164 | sample/datasets/
165 | sample/secrets/
166 | sample/local_test_env/
167 | sample/*.duckdb
168 | sample/*.sh
169 | sample/*.txt
170 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Luis Velasco
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BigPato: a smart SQL client for BigQuery and duckDB
2 | 
3 | 
4 | 
5 | BigPato is an experimental Python package that implements a smart SQL router between two SQL backend engines: `BigQuery` and `duckDB`.
6 | 
7 | The overall idea is to maintain a hybrid database where the most frequently used tables live on your local machine (e.g. a laptop) and the rest in a cloud DWH such as `BigQuery`.
8 | 
9 | 
10 | ![BigPato working with streamlit](assets/bigpato_arch.png)
11 | 
12 | `BigPato` implements:
13 | - A simple automatic data tiering mechanism that updates local storage on request.
14 | - SQL transpilation between the `BigQuery` and `duckDB` SQL dialects, so the same query works on both engines.
15 | - A unified interface that redirects queries to the most performant engine and returns a `pandas` dataframe with the query output
16 | 
17 | ## Demo
18 | 
19 | A demo showcasing the integration of BigPato with streamlit is included under the `sample/` folder.
20 | 
21 | ![BigPato working with streamlit](assets/bigpato.gif)
22 | 
23 | ## Example run
24 | 
25 | Import the package and create a new `BigPato` client by calling the `BigPato` constructor:
26 | 
27 | ```python
28 | from bigpato import bigpato
29 | 
30 | bq_project = 'GCP_PROJECT'
31 | bq_database = 'BQ_DATASET_NAME'
32 | bq_key = 'JSON_FILE'
33 | duckdb_db = 'DUCKDB_FILE'
34 | local_duck_folder = 'LOCAL_DATA_FOLDER'
35 | export_bq_bucket = 'GCS_BUCKET'
36 | 
37 | bp_client = bigpato.BigPato(bq_project=bq_project, bq_database=bq_database, bq_key=bq_key,
38 |     duckdb_db=duckdb_db, local_duck_folder=local_duck_folder, export_bq_bucket=export_bq_bucket)
39 | ```
40 | 
41 | The constructor takes the following arguments:
42 | 
43 | * `bq_project` = Google Cloud Platform project
44 | * `bq_database` = BigQuery dataset to use; it must already exist. BigPato only supports querying tables under the same dataset
45 | * `bq_key` = Service Account secrets JSON key with enough permissions to execute SQL queries and export data to GCS
46 | * `duckdb_db` = Local duckDB metadata file; if it is not present it will be created on the first run
47 | * `local_duck_folder` = A local folder where duckDB will cache tables
48 | * `export_bq_bucket` = A GCS bucket that will be used to export BigQuery tables in parquet format.
49 | 
50 | Inspect the table location metadata by calling `get_metadata_dict()`:
51 | 
52 | ```python
53 | print(bigpato_client.get_metadata_dict())
54 | 
55 | ```
56 | `BigPato` maintains an internal catalog with information about the tables deployed in the specified BigQuery dataset and the tables promoted to local storage (also registered in the `duckDB` catalog). The call returns each table's location:
57 | 
58 | ```python
59 | {'call_center': {'location': 'local', 'usage': 0},
60 | ...
61 | 'catalog_page': {'location': 'bigquery', 'usage': 0},
62 | 'web_returns': {'location': 'bigquery', 'usage': 0}
63 | }
64 | ```
65 | 
66 | In the example above we have a BigQuery dataset with the TPC-DS tables, but some tables have already been promoted to local storage in previous executions.
67 | In this case, the table `call_center` is a local table accessible by `duckDB`, and the tables `catalog_page` and `web_returns` are tables in `BigQuery`.
68 | 
69 | Run a query against a table located in `BigQuery`; the `exec_query` method takes a SQL query and returns a `pandas` dataframe:
70 | 
71 | ```python
72 | df = bigpato_client.exec_query("SELECT * FROM catalog_page")
73 | ```
74 | Note that there is no need to qualify the table with the `BigQuery` `dataset.table` syntax.
75 | 
76 | 
77 | Now, run a query against a table located in `duckDB`; again, the `exec_query` method takes a SQL query and returns a `pandas` dataframe:
78 | 
79 | ```python
80 | df = bigpato_client.exec_query("SELECT * FROM call_center")
81 | ```
82 | Note that there is no need to use the `duckDB` SQL dialect; `BigPato` transpiles between the `BigQuery` and `duckDB` SQL dialects.
83 | 
84 | 
85 | Inspect the LRU candidate cache, which keeps track of the most recently used tables:
86 | 
87 | ```python
88 | print(bigpato_client.get_cache())
89 | 
90 | ```
91 | 
92 | This will show the top `LRU_TABLE_CAPACITY` (set to 10 by default) most recently used tables from previous queries; in this case it will show the previously queried tables `catalog_page` and `call_center`.
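Under the hood the candidate cache is a plain `collections.OrderedDict` used as an LRU structure. The following minimal, self-contained sketch mirrors the `__lru_put` logic in `bigpato/bigpato.py` (the standalone names used here are illustrative, not part of the package API):

```python
from collections import OrderedDict

LRU_TABLE_CAPACITY = 10  # default capacity, see bigpato/common/constants.py

lru_cache = OrderedDict()

def lru_put(table_name):
    # Insert the table, or refresh it as the most recently used entry.
    lru_cache[table_name] = table_name
    lru_cache.move_to_end(table_name)
    # Evict the least recently used entry once capacity is exceeded.
    if len(lru_cache) > LRU_TABLE_CAPACITY:
        lru_cache.popitem(last=False)

lru_put("catalog_page")
lru_put("call_center")
print(lru_cache)
```

Every table referenced by an executed query goes through this path, so the cache always holds the most recently touched tables.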
Once the cache limit is reached, the least recently used tables are evicted. After the two queries above, the cache contains:
93 | 
94 | ```python
95 | OrderedDict([('catalog_page', 'catalog_page'), ('call_center', 'call_center')])
96 | ```
97 | Now you can run a rebalance operation to promote candidate tables to local storage; in this case only `catalog_page` will be promoted (`call_center` is already local):
98 | 
99 | ```python
100 | bigpato_client.launch_balance_storage()
101 | ```
102 | 
103 | Logs will show:
104 | 
105 | ```bash
106 | 
107 | 2022-11-25 16:10:58.262 Table catalog_page extracted
108 | Copying gs://bigpato-export-bucket/catalog_page/catalog_page000000000000...
109 | - [1/1 files][ 2.2 MiB/ 2.2 MiB] 100% Done
110 | Operation completed over 1 objects/2.2 MiB.
111 | 2022-11-25 16:11:00.408 Table catalog_page downloaded
112 | 2022-11-25 16:11:00.408 Populating metadata from duckDB ..
113 | 2022-11-25 16:11:01.856 Removing orphaned files...
114 | ```
115 | 
116 | Inspect the table location metadata once again:
117 | 
118 | ```python
119 | print(bigpato_client.get_metadata_dict())
120 | 
121 | ```
122 | That will show the updated location of the `catalog_page` table:
123 | 
124 | ```python
125 | { 'catalog_page': {'location': 'local', 'usage': 0} }
126 | ```
127 | Re-run the initial query; it will now be executed locally:
128 | 
129 | ```python
130 | df = bigpato_client.exec_query("SELECT * FROM catalog_page")
131 | ```
132 | Logs will show:
133 | ```bash
134 | 2022-11-25 16:21:29.122 Executing query SELECT * FROM catalog_page with duckDB ..
135 | ```
136 | 
137 | 
138 | ## Install
139 | From PyPI:
140 | 
141 | ```bash
142 | pip install bigpato
143 | ```
144 | 
145 | From source:
146 | 
147 | ```bash
148 | source launch_build_install_package_source.sh
149 | ```
150 | 
151 | 
152 | 
--------------------------------------------------------------------------------
/assets/bigpato.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/velascoluis/bigpato/30a6959174803d5118f4a269e11b13d7fd125016/assets/bigpato.gif
--------------------------------------------------------------------------------
/assets/bigpato_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/velascoluis/bigpato/30a6959174803d5118f4a269e11b13d7fd125016/assets/bigpato_arch.png
--------------------------------------------------------------------------------
/bigpato/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/velascoluis/bigpato/30a6959174803d5118f4a269e11b13d7fd125016/bigpato/__init__.py
--------------------------------------------------------------------------------
/bigpato/bigpato.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # https://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
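# bigpato.py implements the BigPato client described in the README: it keeps an
# in-memory metadata dict mapping each table to its location ('bigquery' or 'local'),
# tracks recently queried tables in an LRU cache, promotes hot tables to local
# parquet files via a BigQuery export to GCS, and routes each query to duckDB when
# every referenced table is local, falling back to BigQuery otherwise.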
12 | 
13 | import os
14 | import duckdb
15 | from sqlglot import parse_one, exp, transpile
16 | from google.cloud import bigquery
17 | from google.cloud import storage
18 | import logging
19 | import shutil
20 | from collections import OrderedDict
21 | import bigpato.common.constants as constants
22 | 
23 | 
24 | class BigPato:
25 |     def __init__(self,bq_project,bq_database,bq_key,duckdb_db,local_duck_folder,export_bq_bucket):
26 |         self.__metadata_dict = {}
27 |         self.__bq_project=bq_project
28 |         #BQ dataset really - used database for name consistency
29 |         self.__bq_database=bq_database
30 |         self.__bq_key=bq_key
31 |         self.__duckdb_db=duckdb_db
32 |         self.__local_duck_folder=local_duck_folder
33 |         os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=self.__bq_key
34 |         self.__export_bq_bucket=export_bq_bucket
35 |         #LRU Cache
36 |         self.__lru_cache = OrderedDict()
37 |         self.__lru_capacity = constants.LRU_TABLE_CAPACITY
38 |         # Purge local storage
39 |         self.__purge_local_storage()
40 |         #Populate metadata
41 |         self.__populate_metadata_bq()
42 |         self.__populate_metadata_duckdb()
43 | 
44 | 
45 | 
46 | 
47 |     def get_metadata_dict(self):
48 |         return self.__metadata_dict
49 | 
50 |     def get_cache(self):
51 |         return self.__lru_cache
52 | 
53 |     def launch_balance_storage(self):
54 |         logging.info("Start rebalancing storage ..")
55 |         for table in self.__lru_cache.keys():
56 |             if self.__metadata_dict[table]['location'] != constants.LOCAL:
57 |                 self.__promote_table_to_local(table)
58 | 
59 |     def __purge_local_storage(self):
60 |         if not os.path.exists('{}'.format(self.__local_duck_folder)):
61 |             os.makedirs('{}'.format(self.__local_duck_folder))
62 | 
63 | 
64 |     def __lru_get(self, table_name):
65 |         if table_name not in self.__lru_cache:
66 |             return -1
67 |         else:
68 |             self.__lru_cache.move_to_end(table_name)
69 |             return self.__lru_cache[table_name]
70 | 
71 |     def __lru_put(self, table_name):
72 |         self.__lru_cache[table_name] = table_name
73 |         self.__lru_cache.move_to_end(table_name)
74 |         if len(self.__lru_cache) > self.__lru_capacity:
75 |             self.__lru_cache.popitem(last = False)
76 | 
77 | 
78 |     def __check_all_tables_are_local(self,query):
79 |         all_local = True
80 |         query_duckdb = transpile(query, read=constants.BIGQUERY, write=constants.DUCKDB)[0]
81 |         for table in parse_one(query_duckdb).find_all(exp.Table):
82 |             if not table.name in self.__metadata_dict:
83 |                 raise ValueError("There is no table {} in dictionary".format(table))
84 |             else:
85 |                 if self.__metadata_dict[table.name]['location'] != constants.LOCAL:
86 |                     all_local = False
87 |         return all_local
88 | 
89 | 
90 |     def __populate_metadata_bq(self):
91 |         logging.info("Populating metadata from BigQuery ..")
92 |         client = bigquery.Client()
93 |         tables = client.list_tables(self.__bq_database)
94 |         for table in tables:
95 |             self.__metadata_dict[table.table_id] = {'location' : constants.BIGQUERY, 'usage': 0}
96 | 
97 |     def __populate_metadata_duckdb(self):
98 |         logging.info("Populating metadata from duckDB ..")
99 |         con = duckdb.connect(database=self.__duckdb_db, read_only=False)
100 |         tables_registered = con.execute("SHOW TABLES").df()
101 |         for table in [f.name for f in os.scandir(self.__local_duck_folder) if f.is_dir()]:
102 |             self.__metadata_dict[table] = {'location' : constants.LOCAL, 'usage': 0}
103 |             if table not in tables_registered['name'].values:
104 |                 #Add the table to the local duckdb dictionary
105 |                 con.execute("CREATE OR REPLACE TABLE {} AS SELECT * FROM read_parquet('{}/{}/*')".format(table,self.__local_duck_folder,table))
106 | 
107 | 
108 |     def 
__exec_query_duckdb(self,query): 109 | logging.info("Executing query {} with duckDB ..".format(query)) 110 | con = duckdb.connect(database=self.__duckdb_db, read_only=False) 111 | #Transpile to duckDB SQL dialect (from BQ) 112 | query_duckdb = transpile(query, read=constants.BIGQUERY, write=constants.DUCKDB)[0] 113 | output = con.execute(query_duckdb).df() 114 | self.__update_table_usage(query) 115 | return output 116 | 117 | def __exec_query_bq(self, query): 118 | logging.info("Executing query {} with BigQuery ..".format(query)) 119 | client = bigquery.Client(project=self.__bq_project) 120 | # Append dataset to table name 121 | sql_tree = parse_one(query) 122 | for table in sql_tree.find_all(exp.Table): 123 | new_table = table.copy() 124 | new_table.set("this", "{}.{}".format( 125 | self.__bq_database, table.name)) 126 | table.replace(new_table) 127 | query_job = client.query(sql_tree.sql()) 128 | output = query_job.result().to_dataframe() 129 | self.__update_table_usage(query) 130 | return output 131 | 132 | 133 | def __update_table_usage(self,query): 134 | query_duckdb = transpile(query, read=constants.BIGQUERY, write=constants.DUCKDB)[0] 135 | for table in parse_one(query_duckdb).find_all(exp.Table): 136 | logging.info("Updating usage +1 for table {}".format(table.name)) 137 | self.__metadata_dict[table.name]['usage'] = self.__metadata_dict[table.name]['usage'] + 1 138 | self.__lru_put(table.name) 139 | 140 | def __promote_table_to_local(self,table_name): 141 | logging.info("Table {} will be promoted to local storage".format(table_name)) 142 | bq_client = bigquery.Client() 143 | storage_client = storage.Client() 144 | dataset_ref = bigquery.DatasetReference(self.__bq_project, self.__bq_database) 145 | table_ref = dataset_ref.table(table_name) 146 | export_gcs_filename = '{}/{}*'.format(table_name,table_name) 147 | job_config = bigquery.ExtractJobConfig() 148 | job_config.destination_format = bigquery.DestinationFormat.PARQUET 149 | job_config.print_header = False 150 | destination_uri = 'gs://{}/{}'.format(self.__export_bq_bucket, export_gcs_filename) 151 | extract_job = bq_client.extract_table( 152 | table_ref, 153 | destination_uri, 154 | job_config=job_config, 155 | location=constants.MULTI_REGION_LOCATION) 156 | extract_job.result() 157 | logging.info( 158 | "Table {} extracted".format(table_name)) 159 | #Create dir in local 160 | if os.path.exists('{}/{}'.format(self.__local_duck_folder,table_name)): 161 | shutil.rmtree('{}/{}'.format(self.__local_duck_folder,table_name)) 162 | os.makedirs('{}/{}'.format(self.__local_duck_folder,table_name)) 163 | else: 164 | os.makedirs('{}/{}'.format(self.__local_duck_folder,table_name)) 165 | #Download the parquet files 166 | command = "gsutil -m cp -r gs://{}/{} {}/{}/".format(self.__export_bq_bucket, export_gcs_filename,self.__local_duck_folder,table_name) 167 | os.system(command) 168 | logging.info( 169 | "Table {} downloaded".format(table_name)) 170 | #Register the tables 171 | self.__populate_metadata_duckdb() 172 | 173 | def exec_query(self,query): 174 | if self.__check_all_tables_are_local(query): 175 | #All things equal duckDB is prioritized 176 | return self.__exec_query_duckdb(query) 177 | else: 178 | return self.__exec_query_bq(query) -------------------------------------------------------------------------------- /bigpato/common/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/velascoluis/bigpato/30a6959174803d5118f4a269e11b13d7fd125016/bigpato/common/__init__.py -------------------------------------------------------------------------------- /bigpato/common/constants.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # https://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | BIGQUERY = "bigquery" 14 | DUCKDB = "duckdb" 15 | LOCAL = "local" 16 | MULTI_REGION_LOCATION = "US" 17 | LRU_TABLE_CAPACITY = 10 -------------------------------------------------------------------------------- /launch_build_install_package_source.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # https://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
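# Remove any previous build artifacts, then build the sdist and wheel for the package.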
13 | rm -Rf build 14 | rm -Rf dist 15 | rm -Rf bigpato.egg-info 16 | python3 setup.py sdist bdist_wheel 17 | -------------------------------------------------------------------------------- /sample/sample_streamlit_app.py: -------------------------------------------------------------------------------- 1 | from bigpato import bigpato 2 | import pandas as pd 3 | import streamlit as st 4 | 5 | bq_project = 'velascoluis-dev-sandbox' 6 | bq_database = 'tpcds_100G' 7 | #Under this dataset, we have all the TPCDS tables 8 | bq_key = 'secrets/secret-key.json' 9 | duckdb_db = 'my-db-tpcds.duckdb' 10 | local_duck_folder = 'datasets/bigpato' 11 | export_bq_bucket = 'bigpato-export-bucket' 12 | 13 | if 'bigpato_client' not in st.session_state: 14 | bp_client = bigpato.BigPato(bq_project=bq_project, bq_database=bq_database, bq_key=bq_key, 15 | duckdb_db=duckdb_db, local_duck_folder=local_duck_folder, export_bq_bucket=export_bq_bucket) 16 | st.session_state['bigpato_client'] = bp_client 17 | 18 | 19 | bigpato_client = st.session_state['bigpato_client'] 20 | 21 | query_1 = "SELECT * from web_page" 22 | query_2 = "SELECT * from warehouse" 23 | #Query72 TPC-DS 24 | query_3 = "SELECT i_item_desc, w_warehouse_name, d1.d_week_seq, SUM(CASE WHEN p_promo_sk IS NULL THEN 1 ELSE 0 END) no_promo, SUM(CASE WHEN p_promo_sk IS NOT NULL THEN 1 ELSE 0 END) promo, count(*) total_cnt FROM catalog_sales JOIN inventory ON ( cs_item_sk = inv_item_sk ) JOIN warehouse ON ( w_warehouse_sk = inv_warehouse_sk ) JOIN item ON ( i_item_sk = cs_item_sk ) JOIN customer_demographics ON ( cs_bill_cdemo_sk = cd_demo_sk ) JOIN household_demographics ON ( cs_bill_hdemo_sk = hd_demo_sk ) JOIN date_dim d1 ON ( cs_sold_date_sk = d1.d_date_sk ) JOIN date_dim d2 ON ( inv_date_sk = d2.d_date_sk ) JOIN date_dim d3 ON ( cs_ship_date_sk = d3.d_date_sk ) LEFT OUTER JOIN promotion ON ( cs_promo_sk = p_promo_sk ) LEFT OUTER JOIN catalog_returns ON ( cr_item_sk = cs_item_sk AND cr_order_number = cs_order_number ) WHERE d1.d_week_seq = d2.d_week_seq AND inv_quantity_on_hand < cs_quantity AND d3.d_date > d1.d_date + INTERVAL '5' day AND hd_buy_potential = '501-1000' AND d1.d_year = 2002 AND cd_marital_status = 'M' GROUP BY i_item_desc, w_warehouse_name, d1.d_week_seq ORDER BY total_cnt DESC, i_item_desc, w_warehouse_name, d1.d_week_seq LIMIT 100" 25 | 26 | 27 | st.title("Big🦆 demo") 28 | st.text("This demo shows how to integrate BigPato smart query client in a streamlit application") 29 | st.text("Query 1 : {}".format(query_1)) 30 | 31 | if st.button('Exec Q1'): 32 | st.text("Executing query 1 ...") 33 | df = bigpato_client.exec_query(query_1) 34 | st.text("OK") 35 | st.dataframe(df) 36 | st.text("Query 2 : {}".format(query_2)) 37 | if st.button('Exec Q2'): 38 | st.text("Executing query 2 ...") 39 | df = bigpato_client.exec_query(query_2) 40 | st.text("OK") 41 | st.dataframe(df) 42 | st.text("Query 3 : {}".format(query_3)) 43 | if st.button('Exec Q3'): 44 | st.text("Executing query 3 ...") 45 | df = bigpato_client.exec_query(query_3) 46 | st.text("OK") 47 | st.dataframe(df) 48 | 49 | custom_query = st.text_input('Enter custom query', 'select foo from bar') 50 | if st.button('Exec custom query'): 51 | st.text("Executing query 3 ...") 52 | df = bigpato_client.exec_query(custom_query) 53 | st.text("OK") 54 | st.dataframe(df) 55 | 56 | 57 | if st.button('View BigPato metadata dict'): 58 | st.text("Accesing metadata ...") 59 | st.text(bigpato_client.get_metadata_dict()) 60 | st.text("OK") 61 | 62 | 63 | if st.button('View BigPato LRU cache'): 64 | 
st.text("Accessing cache ...")
65 |     st.text(bigpato_client.get_cache())
66 |     st.text("OK")
67 | 
68 | if st.button('Rebalance storage'):
69 |     st.text("Rebalancing storage ...")
70 |     bigpato_client.launch_balance_storage()
71 |     st.text("Rebalancing finished!")
72 | 
73 | 
74 | 
75 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # https://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | from setuptools import setup, find_packages
13 | 
14 | VERSION = "0.0.1"
15 | DESCRIPTION = "bigpato"
16 | LONG_DESCRIPTION = "bigpato package"
17 | 
18 | # Setting up
19 | setup(
20 |     name="bigpato",
21 |     version=VERSION,
22 |     author="Luis Velasco",
23 |     author_email="",
24 |     description=DESCRIPTION,
25 |     long_description=LONG_DESCRIPTION,
26 |     packages=find_packages(),
27 |     package_dir={"bigpatopkg": "bigpato"},
28 |     include_package_data=True,
29 |     install_requires=[
30 |         "duckdb",
31 |         "sqlglot",
32 |         "google-cloud-bigquery",
33 |         "google-cloud-storage",
34 |         "pandas",
35 |         "db-dtypes"
36 |     ],
37 |     keywords=["python", "bigpato"],
38 |     classifiers=[
39 |         "Development Status :: 3 - Alpha",
40 |         "Intended Audience :: Education",
41 |         "Programming Language :: Python :: 3",
42 |         "Operating System :: MacOS :: MacOS X"
43 |     ],
44 | )
--------------------------------------------------------------------------------