├── .gitignore ├── LICENSE ├── README.md ├── assets ├── bigpato.gif └── bigpato_arch.png ├── bigpato ├── __init__.py ├── bigpato.py └── common │ ├── __init__.py │ └── constants.py ├── launch_build_install_package_source.sh ├── sample └── sample_streamlit_app.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | #Others
163 | 
164 | sample/datasets/
165 | sample/secrets/
166 | sample/local_test_env/
167 | sample/*.duckdb
168 | sample/*.sh
169 | sample/*.txt
170 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Luis Velasco
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BigPato: a smart SQL client for BigQuery and duckDB
2 | 
3 | 
4 | 
5 | BigPato is an experimental Python package that implements a smart SQL router between two SQL backend engines: `BigQuery` and `duckDB`.
6 | 
7 | The overall idea is to maintain a hybrid database where the most frequently used tables live on your local machine (e.g. a laptop) and the rest in a cloud DWH such as `BigQuery`.
8 | 
9 | 
10 | ![BigPato working with streamlit](assets/bigpato_arch.png)
11 | 
12 | `BigPato` implements:
13 | - A simple automatic data tiering mechanism that updates local storage on request.
14 | - SQL transpilation between the `BigQuery` and `duckDB` SQL dialects, so the same query works on both engines.
15 | - A unified interface that redirects queries to the most performant engine and returns a `pandas` dataframe with the query output
16 | 
17 | ## Demo
18 | 
19 | A demo showcasing the integration of BigPato with streamlit is included under the `sample/` folder.
20 | 
21 | ![BigPato working with streamlit](assets/bigpato.gif)
22 | 
23 | ## Example run
24 | 
25 | Import the package and create a new `BigPato` client by calling the `BigPato` constructor:
26 | 
27 | ```python
28 | from bigpato import bigpato
29 | 
30 | bq_project = 'GCP_PROJECT'
31 | bq_database = 'BQ_DATASET_NAME'
32 | bq_key = 'JSON_FILE'
33 | duckdb_db = 'DUCKDB_FILE'
34 | local_duck_folder = 'LOCAL_DATA_FOLDER'
35 | export_bq_bucket = 'GCS_BUCKET'
36 | 
37 | bp_client = bigpato.BigPato(bq_project=bq_project, bq_database=bq_database, bq_key=bq_key,
38 |     duckdb_db=duckdb_db, local_duck_folder=local_duck_folder, export_bq_bucket=export_bq_bucket)
39 | ```
40 | 
41 | The constructor takes the following arguments:
42 | 
43 | * `bq_project` = Google Cloud Platform project
44 | * `bq_database` = BigQuery dataset to use; it must already exist. BigPato only supports querying tables under the same dataset
45 | * `bq_key` = Service Account secrets JSON key with enough permissions to execute SQL queries and export data to GCS
46 | * `duckdb_db` = Local duckDB metadata file; if it is not present it will be created on the first run
47 | * `local_duck_folder` = A local folder where duckDB will cache tables
48 | * `export_bq_bucket` = A GCS bucket that will be used to export BigQuery tables in parquet format.
49 | 
50 | Inspect the table location metadata by calling `get_metadata_dict()`:
51 | 
52 | ```python
53 | print(bigpato_client.get_metadata_dict())
54 | 
55 | ```
56 | `BigPato` maintains an internal catalog with information about the tables deployed in the specified BigQuery dataset and the tables promoted to local storage (also registered in the `duckDB` catalog). The call returns each table's location:
57 | 
58 | ```python
59 | {'call_center': {'location': 'local', 'usage': 0},
60 | ...
61 | 'catalog_page': {'location': 'bigquery', 'usage': 0},
62 | 'web_returns': {'location': 'bigquery', 'usage': 0}
63 | }
64 | ```
65 | 
66 | In the example above we have a BigQuery dataset with the TPC-DS tables, but some tables have already been promoted to local storage in previous executions.
67 | In this case, the table `call_center` is a local table accessible by `duckDB`, and the tables `catalog_page` and `web_returns` are tables in `BigQuery`.
68 | 
69 | Run a query against a table located in `BigQuery`; the `exec_query` method takes a SQL query and returns a `pandas` dataframe:
70 | 
71 | ```python
72 | df = bigpato_client.exec_query("SELECT * FROM catalog_page")
73 | ```
74 | Note that there is no need to qualify the table with the `BigQuery` `dataset.table` syntax.
75 | 
76 | 
77 | Now, run a query against a table located in `duckDB`; again, the `exec_query` method takes a SQL query and returns a `pandas` dataframe:
78 | 
79 | ```python
80 | df = bigpato_client.exec_query("SELECT * FROM call_center")
81 | ```
82 | Note that there is no need to use the `duckDB` SQL dialect; `BigPato` transpiles between the `BigQuery` and `duckDB` SQL dialects.
83 | 
84 | 
85 | Inspect the LRU candidate cache, which keeps track of the most recently used tables:
86 | 
87 | ```python
88 | print(bigpato_client.get_cache())
89 | 
90 | ```
91 | 
92 | This will show the top `LRU_TABLE_CAPACITY` (set to 10 by default) most recently used tables from previous queries; in this case it will show the previously queried tables `catalog_page` and `call_center`.
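Under the hood the candidate cache is a plain `collections.OrderedDict` used as an LRU structure. The following minimal, self-contained sketch mirrors the `__lru_put` logic in `bigpato/bigpato.py` (the standalone names used here are illustrative, not part of the package API):

```python
from collections import OrderedDict

LRU_TABLE_CAPACITY = 10  # default capacity, see bigpato/common/constants.py

lru_cache = OrderedDict()

def lru_put(table_name):
    # Insert the table, or refresh it as the most recently used entry.
    lru_cache[table_name] = table_name
    lru_cache.move_to_end(table_name)
    # Evict the least recently used entry once capacity is exceeded.
    if len(lru_cache) > LRU_TABLE_CAPACITY:
        lru_cache.popitem(last=False)

lru_put("catalog_page")
lru_put("call_center")
print(lru_cache)
```

Every table referenced by an executed query goes through this path, so the cache always holds the most recently touched tables.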
Once the cache limit is reached, the least recently used tables are evicted. After the two queries above, the cache contains:
93 | 
94 | ```python
95 | OrderedDict([('catalog_page', 'catalog_page'), ('call_center', 'call_center')])
96 | ```
97 | Now you can run a rebalance operation to promote candidate tables to local storage; in this case only `catalog_page` will be promoted (`call_center` is already local):
98 | 
99 | ```python
100 | bigpato_client.launch_balance_storage()
101 | ```
102 | 
103 | Logs will show:
104 | 
105 | ```bash
106 | 
107 | 2022-11-25 16:10:58.262 Table catalog_page extracted
108 | Copying gs://bigpato-export-bucket/catalog_page/catalog_page000000000000...
109 | - [1/1 files][ 2.2 MiB/ 2.2 MiB] 100% Done
110 | Operation completed over 1 objects/2.2 MiB.
111 | 2022-11-25 16:11:00.408 Table catalog_page downloaded
112 | 2022-11-25 16:11:00.408 Populating metadata from duckDB ..
113 | 2022-11-25 16:11:01.856 Removing orphaned files...
114 | ```
115 | 
116 | Inspect the table location metadata once again:
117 | 
118 | ```python
119 | print(bigpato_client.get_metadata_dict())
120 | 
121 | ```
122 | That will show the updated location of the `catalog_page` table:
123 | 
124 | ```python
125 | { 'catalog_page': {'location': 'local', 'usage': 0} }
126 | ```
127 | Re-run the initial query; it will now be executed locally:
128 | 
129 | ```python
130 | df = bigpato_client.exec_query("SELECT * FROM catalog_page")
131 | ```
132 | Logs will show:
133 | ```bash
134 | 2022-11-25 16:21:29.122 Executing query SELECT * FROM catalog_page with duckDB ..
135 | ```
136 | 
137 | 
138 | ## Install
139 | From PyPI:
140 | 
141 | ```bash
142 | pip install bigpato
143 | ```
144 | 
145 | From source:
146 | 
147 | ```bash
148 | source launch_build_install_package_source.sh
149 | ```
150 | 
151 | 
152 | 
--------------------------------------------------------------------------------
/assets/bigpato.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/velascoluis/bigpato/30a6959174803d5118f4a269e11b13d7fd125016/assets/bigpato.gif
--------------------------------------------------------------------------------
/assets/bigpato_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/velascoluis/bigpato/30a6959174803d5118f4a269e11b13d7fd125016/assets/bigpato_arch.png
--------------------------------------------------------------------------------
/bigpato/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/velascoluis/bigpato/30a6959174803d5118f4a269e11b13d7fd125016/bigpato/__init__.py
--------------------------------------------------------------------------------
/bigpato/bigpato.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # https://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
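# bigpato.py implements the BigPato client described in the README: it keeps an
# in-memory metadata dict mapping each table to its location ('bigquery' or 'local'),
# tracks recently queried tables in an LRU cache, promotes hot tables to local
# parquet files via a BigQuery export to GCS, and routes each query to duckDB when
# every referenced table is local, falling back to BigQuery otherwise.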
12 | 
13 | import os
14 | import duckdb
15 | from sqlglot import parse_one, exp, transpile
16 | from google.cloud import bigquery
17 | from google.cloud import storage
18 | import logging
19 | import shutil
20 | from collections import OrderedDict
21 | import bigpato.common.constants as constants
22 | 
23 | 
24 | class BigPato:
25 |     def __init__(self,bq_project,bq_database,bq_key,duckdb_db,local_duck_folder,export_bq_bucket):
26 |         self.__metadata_dict = {}
27 |         self.__bq_project=bq_project
28 |         #BQ dataset really - used database for name consistency
29 |         self.__bq_database=bq_database
30 |         self.__bq_key=bq_key
31 |         self.__duckdb_db=duckdb_db
32 |         self.__local_duck_folder=local_duck_folder
33 |         os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=self.__bq_key
34 |         self.__export_bq_bucket=export_bq_bucket
35 |         #LRU Cache
36 |         self.__lru_cache = OrderedDict()
37 |         self.__lru_capacity = constants.LRU_TABLE_CAPACITY
38 |         # Purge local storage
39 |         self.__purge_local_storage()
40 |         #Populate metadata
41 |         self.__populate_metadata_bq()
42 |         self.__populate_metadata_duckdb()
43 | 
44 | 
45 | 
46 | 
47 |     def get_metadata_dict(self):
48 |         return self.__metadata_dict
49 | 
50 |     def get_cache(self):
51 |         return self.__lru_cache
52 | 
53 |     def launch_balance_storage(self):
54 |         logging.info("Start rebalancing storage ..")
55 |         for table in self.__lru_cache.keys():
56 |             if self.__metadata_dict[table]['location'] != constants.LOCAL:
57 |                 self.__promote_table_to_local(table)
58 | 
59 |     def __purge_local_storage(self):
60 |         if not os.path.exists('{}'.format(self.__local_duck_folder)):
61 |             os.makedirs('{}'.format(self.__local_duck_folder))
62 | 
63 | 
64 |     def __lru_get(self, table_name):
65 |         if table_name not in self.__lru_cache:
66 |             return -1
67 |         else:
68 |             self.__lru_cache.move_to_end(table_name)
69 |             return self.__lru_cache[table_name]
70 | 
71 |     def __lru_put(self, table_name):
72 |         self.__lru_cache[table_name] = table_name
73 |         self.__lru_cache.move_to_end(table_name)
74 |         if len(self.__lru_cache) > self.__lru_capacity:
75 |             self.__lru_cache.popitem(last = False)
76 | 
77 | 
78 |     def __check_all_tables_are_local(self,query):
79 |         all_local = True
80 |         query_duckdb = transpile(query, read=constants.BIGQUERY, write=constants.DUCKDB)[0]
81 |         for table in parse_one(query_duckdb).find_all(exp.Table):
82 |             if not table.name in self.__metadata_dict:
83 |                 raise ValueError("There is no table {} in dictionary".format(table))
84 |             else:
85 |                 if self.__metadata_dict[table.name]['location'] != constants.LOCAL:
86 |                     all_local = False
87 |         return all_local
88 | 
89 | 
90 |     def __populate_metadata_bq(self):
91 |         logging.info("Populating metadata from BigQuery ..")
92 |         client = bigquery.Client()
93 |         tables = client.list_tables(self.__bq_database)
94 |         for table in tables:
95 |             self.__metadata_dict[table.table_id] = {'location' : constants.BIGQUERY, 'usage': 0}
96 | 
97 |     def __populate_metadata_duckdb(self):
98 |         logging.info("Populating metadata from duckDB ..")
99 |         con = duckdb.connect(database=self.__duckdb_db, read_only=False)
100 |         tables_registered = con.execute("SHOW TABLES").df()
101 |         for table in [f.name for f in os.scandir(self.__local_duck_folder) if f.is_dir()]:
102 |             self.__metadata_dict[table] = {'location' : constants.LOCAL, 'usage': 0}
103 |             if table not in tables_registered['name'].values:
104 |                 #Add the table to the local duckdb dictionary
105 |                 con.execute("CREATE OR REPLACE TABLE {} AS SELECT * FROM read_parquet('{}/{}/*')".format(table,self.__local_duck_folder,table))
106 | 
107 | 
108 |     def 
__exec_query_duckdb(self,query): 109 | logging.info("Executing query {} with duckDB ..".format(query)) 110 | con = duckdb.connect(database=self.__duckdb_db, read_only=False) 111 | #Transpile to duckDB SQL dialect (from BQ) 112 | query_duckdb = transpile(query, read=constants.BIGQUERY, write=constants.DUCKDB)[0] 113 | output = con.execute(query_duckdb).df() 114 | self.__update_table_usage(query) 115 | return output 116 | 117 | def __exec_query_bq(self, query): 118 | logging.info("Executing query {} with BigQuery ..".format(query)) 119 | client = bigquery.Client(project=self.__bq_project) 120 | # Append dataset to table name 121 | sql_tree = parse_one(query) 122 | for table in sql_tree.find_all(exp.Table): 123 | new_table = table.copy() 124 | new_table.set("this", "{}.{}".format( 125 | self.__bq_database, table.name)) 126 | table.replace(new_table) 127 | query_job = client.query(sql_tree.sql()) 128 | output = query_job.result().to_dataframe() 129 | self.__update_table_usage(query) 130 | return output 131 | 132 | 133 | def __update_table_usage(self,query): 134 | query_duckdb = transpile(query, read=constants.BIGQUERY, write=constants.DUCKDB)[0] 135 | for table in parse_one(query_duckdb).find_all(exp.Table): 136 | logging.info("Updating usage +1 for table {}".format(table.name)) 137 | self.__metadata_dict[table.name]['usage'] = self.__metadata_dict[table.name]['usage'] + 1 138 | self.__lru_put(table.name) 139 | 140 | def __promote_table_to_local(self,table_name): 141 | logging.info("Table {} will be promoted to local storage".format(table_name)) 142 | bq_client = bigquery.Client() 143 | storage_client = storage.Client() 144 | dataset_ref = bigquery.DatasetReference(self.__bq_project, self.__bq_database) 145 | table_ref = dataset_ref.table(table_name) 146 | export_gcs_filename = '{}/{}*'.format(table_name,table_name) 147 | job_config = bigquery.ExtractJobConfig() 148 | job_config.destination_format = bigquery.DestinationFormat.PARQUET 149 | job_config.print_header = False 150 | destination_uri = 'gs://{}/{}'.format(self.__export_bq_bucket, export_gcs_filename) 151 | extract_job = bq_client.extract_table( 152 | table_ref, 153 | destination_uri, 154 | job_config=job_config, 155 | location=constants.MULTI_REGION_LOCATION) 156 | extract_job.result() 157 | logging.info( 158 | "Table {} extracted".format(table_name)) 159 | #Create dir in local 160 | if os.path.exists('{}/{}'.format(self.__local_duck_folder,table_name)): 161 | shutil.rmtree('{}/{}'.format(self.__local_duck_folder,table_name)) 162 | os.makedirs('{}/{}'.format(self.__local_duck_folder,table_name)) 163 | else: 164 | os.makedirs('{}/{}'.format(self.__local_duck_folder,table_name)) 165 | #Download the parquet files 166 | command = "gsutil -m cp -r gs://{}/{} {}/{}/".format(self.__export_bq_bucket, export_gcs_filename,self.__local_duck_folder,table_name) 167 | os.system(command) 168 | logging.info( 169 | "Table {} downloaded".format(table_name)) 170 | #Register the tables 171 | self.__populate_metadata_duckdb() 172 | 173 | def exec_query(self,query): 174 | if self.__check_all_tables_are_local(query): 175 | #All things equal duckDB is prioritized 176 | return self.__exec_query_duckdb(query) 177 | else: 178 | return self.__exec_query_bq(query) -------------------------------------------------------------------------------- /bigpato/common/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/velascoluis/bigpato/30a6959174803d5118f4a269e11b13d7fd125016/bigpato/common/__init__.py -------------------------------------------------------------------------------- /bigpato/common/constants.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # https://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | BIGQUERY = "bigquery" 14 | DUCKDB = "duckdb" 15 | LOCAL = "local" 16 | MULTI_REGION_LOCATION = "US" 17 | LRU_TABLE_CAPACITY = 10 -------------------------------------------------------------------------------- /launch_build_install_package_source.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # https://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
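# Remove any previous build artifacts, then build the sdist and wheel for the package.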
13 | rm -Rf build 14 | rm -Rf dist 15 | rm -Rf bigpato.egg-info 16 | python3 setup.py sdist bdist_wheel 17 | -------------------------------------------------------------------------------- /sample/sample_streamlit_app.py: -------------------------------------------------------------------------------- 1 | from bigpato import bigpato 2 | import pandas as pd 3 | import streamlit as st 4 | 5 | bq_project = 'velascoluis-dev-sandbox' 6 | bq_database = 'tpcds_100G' 7 | #Under this dataset, we have all the TPCDS tables 8 | bq_key = 'secrets/secret-key.json' 9 | duckdb_db = 'my-db-tpcds.duckdb' 10 | local_duck_folder = 'datasets/bigpato' 11 | export_bq_bucket = 'bigpato-export-bucket' 12 | 13 | if 'bigpato_client' not in st.session_state: 14 | bp_client = bigpato.BigPato(bq_project=bq_project, bq_database=bq_database, bq_key=bq_key, 15 | duckdb_db=duckdb_db, local_duck_folder=local_duck_folder, export_bq_bucket=export_bq_bucket) 16 | st.session_state['bigpato_client'] = bp_client 17 | 18 | 19 | bigpato_client = st.session_state['bigpato_client'] 20 | 21 | query_1 = "SELECT * from web_page" 22 | query_2 = "SELECT * from warehouse" 23 | #Query72 TPC-DS 24 | query_3 = "SELECT i_item_desc, w_warehouse_name, d1.d_week_seq, SUM(CASE WHEN p_promo_sk IS NULL THEN 1 ELSE 0 END) no_promo, SUM(CASE WHEN p_promo_sk IS NOT NULL THEN 1 ELSE 0 END) promo, count(*) total_cnt FROM catalog_sales JOIN inventory ON ( cs_item_sk = inv_item_sk ) JOIN warehouse ON ( w_warehouse_sk = inv_warehouse_sk ) JOIN item ON ( i_item_sk = cs_item_sk ) JOIN customer_demographics ON ( cs_bill_cdemo_sk = cd_demo_sk ) JOIN household_demographics ON ( cs_bill_hdemo_sk = hd_demo_sk ) JOIN date_dim d1 ON ( cs_sold_date_sk = d1.d_date_sk ) JOIN date_dim d2 ON ( inv_date_sk = d2.d_date_sk ) JOIN date_dim d3 ON ( cs_ship_date_sk = d3.d_date_sk ) LEFT OUTER JOIN promotion ON ( cs_promo_sk = p_promo_sk ) LEFT OUTER JOIN catalog_returns ON ( cr_item_sk = cs_item_sk AND cr_order_number = cs_order_number ) WHERE d1.d_week_seq = d2.d_week_seq AND inv_quantity_on_hand < cs_quantity AND d3.d_date > d1.d_date + INTERVAL '5' day AND hd_buy_potential = '501-1000' AND d1.d_year = 2002 AND cd_marital_status = 'M' GROUP BY i_item_desc, w_warehouse_name, d1.d_week_seq ORDER BY total_cnt DESC, i_item_desc, w_warehouse_name, d1.d_week_seq LIMIT 100" 25 | 26 | 27 | st.title("Big🦆 demo") 28 | st.text("This demo shows how to integrate BigPato smart query client in a streamlit application") 29 | st.text("Query 1 : {}".format(query_1)) 30 | 31 | if st.button('Exec Q1'): 32 | st.text("Executing query 1 ...") 33 | df = bigpato_client.exec_query(query_1) 34 | st.text("OK") 35 | st.dataframe(df) 36 | st.text("Query 2 : {}".format(query_2)) 37 | if st.button('Exec Q2'): 38 | st.text("Executing query 2 ...") 39 | df = bigpato_client.exec_query(query_2) 40 | st.text("OK") 41 | st.dataframe(df) 42 | st.text("Query 3 : {}".format(query_3)) 43 | if st.button('Exec Q3'): 44 | st.text("Executing query 3 ...") 45 | df = bigpato_client.exec_query(query_3) 46 | st.text("OK") 47 | st.dataframe(df) 48 | 49 | custom_query = st.text_input('Enter custom query', 'select foo from bar') 50 | if st.button('Exec custom query'): 51 | st.text("Executing query 3 ...") 52 | df = bigpato_client.exec_query(custom_query) 53 | st.text("OK") 54 | st.dataframe(df) 55 | 56 | 57 | if st.button('View BigPato metadata dict'): 58 | st.text("Accesing metadata ...") 59 | st.text(bigpato_client.get_metadata_dict()) 60 | st.text("OK") 61 | 62 | 63 | if st.button('View BigPato LRU cache'): 64 | 
st.text("Accessing cache ...")
65 |     st.text(bigpato_client.get_cache())
66 |     st.text("OK")
67 | 
68 | if st.button('Rebalance storage'):
69 |     st.text("Rebalancing storage ...")
70 |     bigpato_client.launch_balance_storage()
71 |     st.text("Rebalancing finished!")
72 | 
73 | 
74 | 
75 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # https://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | from setuptools import setup, find_packages
13 | 
14 | VERSION = "0.0.1"
15 | DESCRIPTION = "bigpato"
16 | LONG_DESCRIPTION = "bigpato package"
17 | 
18 | # Setting up
19 | setup(
20 |     name="bigpato",
21 |     version=VERSION,
22 |     author="Luis Velasco",
23 |     author_email="",
24 |     description=DESCRIPTION,
25 |     long_description=LONG_DESCRIPTION,
26 |     packages=find_packages(),
27 |     package_dir={"bigpatopkg": "bigpato"},
28 |     include_package_data=True,
29 |     install_requires=[
30 |         "duckdb",
31 |         "sqlglot",
32 |         "google-cloud-bigquery",
33 |         "google-cloud-storage",
34 |         "pandas",
35 |         "db-dtypes"
36 |     ],
37 |     keywords=["python", "bigpato"],
38 |     classifiers=[
39 |         "Development Status :: 3 - Alpha",
40 |         "Intended Audience :: Education",
41 |         "Programming Language :: Python :: 3",
42 |         "Operating System :: MacOS :: MacOS X"
43 |     ],
44 | )
--------------------------------------------------------------------------------