├── .github └── workflows │ └── deploy_demo_objects.yaml ├── .gitignore ├── 00_start_here.ipynb ├── LICENSE ├── README.md ├── environment.yml ├── images └── quickstart_overview.png ├── notebooks ├── 06_load_excel_files │ ├── 06_load_excel_files.ipynb │ └── environment.yml └── 07_load_daily_city_metrics │ ├── 07_load_daily_city_metrics.ipynb │ └── environment.yml ├── requirements.txt └── scripts ├── deploy_notebooks.sql ├── deploy_task_dag.py ├── setup.sql └── teardown.sql /.github/workflows/deploy_demo_objects.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy Demo Objects 2 | 3 | # Controls when the action will run. 4 | on: 5 | push: 6 | branches: 7 | - main 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | jobs: 13 | deploy: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 18 | - name: Checkout repository 19 | uses: actions/checkout@v4 20 | 21 | - name: Setup Python 3.10 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: '3.10' 25 | 26 | - name: Install Python packages 27 | run: pip install -r requirements.txt 28 | 29 | # Make sure all 7 SNOWFLAKE_ environment variables are set 30 | # SnowCLI accesses the password directly from the SNOWFLAKE_PASSWORD environment variable 31 | - name: Deploy notebooks 32 | env: 33 | SNOWFLAKE_ACCOUNT: ${{ secrets.SNOWFLAKE_ACCOUNT }} 34 | SNOWFLAKE_USER: ${{ secrets.SNOWFLAKE_USER }} 35 | SNOWFLAKE_PASSWORD: ${{ secrets.SNOWFLAKE_PASSWORD }} 36 | SNOWFLAKE_ROLE: ${{ secrets.SNOWFLAKE_ROLE }} 37 | SNOWFLAKE_WAREHOUSE: ${{ secrets.SNOWFLAKE_WAREHOUSE }} 38 | SNOWFLAKE_DATABASE: ${{ secrets.SNOWFLAKE_DATABASE }} 39 | SNOWFLAKE_SCHEMA: ${{ secrets.SNOWFLAKE_SCHEMA }} 40 | run: | 41 | snow sql -q "ALTER GIT REPOSITORY DEMO_GIT_REPO FETCH" --temporary-connection --account $SNOWFLAKE_ACCOUNT --user $SNOWFLAKE_USER --role $SNOWFLAKE_ROLE --warehouse $SNOWFLAKE_WAREHOUSE --database $SNOWFLAKE_DATABASE --schema $SNOWFLAKE_SCHEMA 42 | snow sql -q "EXECUTE IMMEDIATE FROM @DEMO_GIT_REPO/branches/main/scripts/deploy_notebooks.sql USING (env => 'PROD', branch => 'main')" --temporary-connection --account $SNOWFLAKE_ACCOUNT --user $SNOWFLAKE_USER --role $SNOWFLAKE_ROLE --warehouse $SNOWFLAKE_WAREHOUSE --database $SNOWFLAKE_DATABASE --schema $SNOWFLAKE_SCHEMA 43 | 44 | : # Ideally both Snow CLI and the Python Connector could be configured entirely from env variables 45 | : #python $GITHUB_WORKSPACE/scripts/deploy_task_dag.py DEMO_DB PROD_SCHEMA 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac files 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /00_start_here.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "623743e8-2cd7-47c6-99e7-100979384579", 14 | "metadata": { 15 | "name": "md_intro", 16 | "collapsed": false 17 | }, 18 | "source": "# Snowflake Notebook Data Engineering\n\n* Author: Jeremiah Hansen\n* Last Updated: 6/11/2024\n\nWelcome to the beginning of the Quickstart! 
Please refer to [the official Snowflake Notebook Data Engineering Quickstart](https://quickstarts.snowflake.com/guide/data_engineering_with_notebooks/index.html?index=..%2F..index#0) for all the details including set up steps." 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "9e6273e5-bcf7-4492-92f6-cc161da082c6", 23 | "metadata": { 24 | "name": "md_step03", 25 | "collapsed": false 26 | }, 27 | "source": "## Step 03 Setup Snowflake\n\nDuring this step we will create our demo environment. Update the SQL variables below with your GitHub username and Personal Access Token (PAT) as well as with your forked GitHub repository information.\n\n**Important**: Please make sure you have created the `dev` branch in your forked repository before continuing here. For instructions please see [Step 2 in the Quickstart](https://quickstarts.snowflake.com/guide/data_engineering_with_notebooks/index.html?index=..%2F..index#1)." 28 | }, 29 | { 30 | "cell_type": "code", 31 | "id": "e898c514-831d-4aa7-9697-004994953950", 32 | "metadata": { 33 | "language": "sql", 34 | "name": "sql_step03_set_context" 35 | }, 36 | "outputs": [], 37 | "source": "SET MY_USER = CURRENT_USER();\n\nSET GITHUB_SECRET_USERNAME = 'username';\nSET GITHUB_SECRET_PASSWORD = 'personal access token';\nSET GITHUB_URL_PREFIX = 'https://github.com/username';\nSET GITHUB_REPO_ORIGIN = 'https://github.com/username/sfguide-data-engineering-with-notebooks.git';", 38 | "execution_count": null 39 | }, 40 | { 41 | "cell_type": "code", 42 | "id": "dc608c96-0957-47e1-8492-bc8d382925e3", 43 | "metadata": { 44 | "language": "sql", 45 | "name": "sql_step03_account_objects" 46 | }, 47 | "outputs": [], 48 | "source": "-- ----------------------------------------------------------------------------\n-- Create the account level objects (ACCOUNTADMIN part)\n-- ----------------------------------------------------------------------------\n\nUSE ROLE ACCOUNTADMIN;\n\n-- Roles\nCREATE OR REPLACE ROLE DEMO_ROLE;\nGRANT ROLE DEMO_ROLE TO ROLE SYSADMIN;\nGRANT ROLE DEMO_ROLE TO USER IDENTIFIER($MY_USER);\n\nGRANT CREATE INTEGRATION ON ACCOUNT TO ROLE DEMO_ROLE;\nGRANT EXECUTE TASK ON ACCOUNT TO ROLE DEMO_ROLE;\nGRANT EXECUTE MANAGED TASK ON ACCOUNT TO ROLE DEMO_ROLE;\nGRANT MONITOR EXECUTION ON ACCOUNT TO ROLE DEMO_ROLE;\nGRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE DEMO_ROLE;\n\n-- Databases\nCREATE OR REPLACE DATABASE DEMO_DB;\nGRANT OWNERSHIP ON DATABASE DEMO_DB TO ROLE DEMO_ROLE;\n\n-- Warehouses\nCREATE OR REPLACE WAREHOUSE DEMO_WH WAREHOUSE_SIZE = XSMALL, AUTO_SUSPEND = 300, AUTO_RESUME= TRUE;\nGRANT OWNERSHIP ON WAREHOUSE DEMO_WH TO ROLE DEMO_ROLE;", 49 | "execution_count": null 50 | }, 51 | { 52 | "cell_type": "code", 53 | "id": "a1e2ae2c-241b-4d8f-aa99-11a35f9833a4", 54 | "metadata": { 55 | "language": "sql", 56 | "name": "sql_step03_database_objects" 57 | }, 58 | "outputs": [], 59 | "source": "-- ----------------------------------------------------------------------------\n-- Create the database level objects\n-- ----------------------------------------------------------------------------\nUSE ROLE DEMO_ROLE;\nUSE WAREHOUSE DEMO_WH;\nUSE DATABASE DEMO_DB;\n\n-- Schemas\nCREATE OR REPLACE SCHEMA INTEGRATIONS;\nCREATE OR REPLACE SCHEMA DEV_SCHEMA;\nCREATE OR REPLACE SCHEMA PROD_SCHEMA;\n\nUSE SCHEMA INTEGRATIONS;\n\n-- External Frostbyte objects\nCREATE OR REPLACE STAGE FROSTBYTE_RAW_STAGE\n URL = 's3://sfquickstarts/data-engineering-with-snowpark-python/'\n;\n\n-- Secrets (schema level)\nCREATE OR REPLACE SECRET DEMO_GITHUB_SECRET\n TYPE = 
password\n USERNAME = $GITHUB_SECRET_USERNAME\n PASSWORD = $GITHUB_SECRET_PASSWORD;\n\n-- API Integration (account level)\n-- This depends on the schema level secret!\nCREATE OR REPLACE API INTEGRATION DEMO_GITHUB_API_INTEGRATION\n API_PROVIDER = GIT_HTTPS_API\n API_ALLOWED_PREFIXES = ($GITHUB_URL_PREFIX)\n ALLOWED_AUTHENTICATION_SECRETS = (DEMO_GITHUB_SECRET)\n ENABLED = TRUE;\n\n-- Git Repository\nCREATE OR REPLACE GIT REPOSITORY DEMO_GIT_REPO\n API_INTEGRATION = DEMO_GITHUB_API_INTEGRATION\n GIT_CREDENTIALS = DEMO_GITHUB_SECRET\n ORIGIN = $GITHUB_REPO_ORIGIN;", 60 | "execution_count": null 61 | }, 62 | { 63 | "cell_type": "code", 64 | "id": "06f26add-547e-4d60-8897-d5ad79b3311d", 65 | "metadata": { 66 | "language": "sql", 67 | "name": "sql_step03_event_table" 68 | }, 69 | "outputs": [], 70 | "source": "-- ----------------------------------------------------------------------------\n-- Create the event table\n-- ----------------------------------------------------------------------------\nUSE ROLE ACCOUNTADMIN;\n\nCREATE EVENT TABLE DEMO_DB.INTEGRATIONS.DEMO_EVENTS;\nGRANT SELECT ON EVENT TABLE DEMO_DB.INTEGRATIONS.DEMO_EVENTS TO ROLE DEMO_ROLE;\nGRANT INSERT ON EVENT TABLE DEMO_DB.INTEGRATIONS.DEMO_EVENTS TO ROLE DEMO_ROLE;\n\nALTER ACCOUNT SET EVENT_TABLE = DEMO_DB.INTEGRATIONS.DEMO_EVENTS;\nALTER DATABASE DEMO_DB SET LOG_LEVEL = INFO;", 71 | "execution_count": null 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "9531119f-76fc-4a2f-a635-a5a7526ac152", 76 | "metadata": { 77 | "name": "md_step04_deploy_dev_notebooks", 78 | "collapsed": false 79 | }, 80 | "source": "## Step 04 Deploy to Dev\n\nFinally we will use `EXECUTE IMMEDIATE FROM ` along with Jinja templating to deploy the Dev version of our Notebooks. We will directly execute the SQL script `scripts/deploy_notebooks.sql` from our Git repository which has the SQL commands to deploy a Notebook from a Git repo.\n\nSee [EXECUTE IMMEDIATE FROM](https://docs.snowflake.com/en/sql-reference/sql/execute-immediate-from) for more details." 81 | }, 82 | { 83 | "cell_type": "code", 84 | "id": "ad8676d0-7f82-4639-a5e2-29f7f9dca0f5", 85 | "metadata": { 86 | "language": "sql", 87 | "name": "sql_step04_deploy_dev_notebooks", 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": "USE ROLE DEMO_ROLE;\nUSE WAREHOUSE DEMO_WH;\nUSE SCHEMA DEMO_DB.INTEGRATIONS;\n\nEXECUTE IMMEDIATE FROM @DEMO_GIT_REPO/branches/main/scripts/deploy_notebooks.sql\n USING (env => 'DEV', branch => 'dev');", 92 | "execution_count": null 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "753bb327-95e4-4559-b7c7-f034607196c9", 97 | "metadata": { 98 | "name": "md_step05", 99 | "collapsed": false 100 | }, 101 | "source": "## Step 05 Load Weather\n\nBut what about data that needs constant updating - like the WEATHER data? We would need to build a pipeline process to constantly update that data to keep it fresh.\n\nPerhaps a better way to get this external data would be to source it from a trusted data supplier. Let them manage the data, keeping it accurate and up to date.\n\nEnter the Snowflake Data Cloud...\n\nWeather Source is a leading provider of global weather and climate data and their OnPoint Product Suite provides businesses with the necessary weather and climate data to quickly generate meaningful and actionable insights for a wide range of use cases across industries. 
Let's connect to the \"Weather Source LLC: frostbyte\" feed from Weather Source in the Snowflake Data Marketplace by following these steps in Snowsight:\n\n* In the left navigation bar click on \"Data Products\" and then \"Marketplace\"\n* Search: \"Weather Source LLC: frostbyte\" (and click on tile in results)\n* Click the blue \"Get\" button\n* Under \"Options\", adjust the Database name to read \"FROSTBYTE_WEATHERSOURCE\" (all capital letters)\n* Grant to \"DEMO_ROLE\"\n\nThat's it... we don't have to do anything from here to keep this data updated. The provider will do that for us, and data sharing means we are always seeing whatever they have published." 102 | }, 103 | { 104 | "cell_type": "code", 105 | "id": "04a850e3-44a4-4829-882e-84724f7e77d7", 106 | "metadata": { 107 | "language": "sql", 108 | "name": "sql_step05_create_share" 109 | }, 110 | "outputs": [], 111 | "source": "/*---\n-- You can also do it via code if you know the account/share details...\nSET WEATHERSOURCE_ACCT_NAME = '*** PUT ACCOUNT NAME HERE AS PART OF DEMO SETUP ***';\nSET WEATHERSOURCE_SHARE_NAME = '*** PUT ACCOUNT SHARE HERE AS PART OF DEMO SETUP ***';\nSET WEATHERSOURCE_SHARE = $WEATHERSOURCE_ACCT_NAME || '.' || $WEATHERSOURCE_SHARE_NAME;\n\nCREATE OR REPLACE DATABASE FROSTBYTE_WEATHERSOURCE\n FROM SHARE IDENTIFIER($WEATHERSOURCE_SHARE);\n\nGRANT IMPORTED PRIVILEGES ON DATABASE FROSTBYTE_WEATHERSOURCE TO ROLE DEMO_ROLE;\n---*/", 112 | "execution_count": null 113 | }, 114 | { 115 | "cell_type": "code", 116 | "id": "7e2762d1-fe91-4a7c-b89a-56e1baf0001c", 117 | "metadata": { 118 | "language": "sql", 119 | "name": "sql_step05_test_share" 120 | }, 121 | "outputs": [], 122 | "source": "-- Let's look at the data - same 3-part naming convention as any other table\nSELECT * FROM FROSTBYTE_WEATHERSOURCE.ONPOINT_ID.POSTAL_CODES LIMIT 100;", 123 | "execution_count": null 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "d3d93974-9a75-46c2-876f-95b6e1562f75", 128 | "metadata": { 129 | "name": "md_step06", 130 | "collapsed": false 131 | }, 132 | "source": "## Step 06 Load Excel Files\n\nPlease follow the instructions in [Step 6 of the Quickstart](https://quickstarts.snowflake.com/guide/data_engineering_with_notebooks/index.html?index=..%2F..index#5) to open and run the `DEV_06_load_excel_files` Notebook. That Notebook will define the pipeline used to load data into the `LOCATION` and `ORDER_DETAIL` tables from the staged Excel files." 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "b5587283-a264-444d-b9ec-55b4d926a5c7", 137 | "metadata": { 138 | "name": "md_step07", 139 | "collapsed": false 140 | }, 141 | "source": "## Step 07 Load Daily City Metrics\n\nPlease follow the instructions in [Step 7 of the Quickstart](https://quickstarts.snowflake.com/guide/data_engineering_with_notebooks/index.html?index=..%2F..index#6) to open and run the `DEV_07_load_daily_city_metrics` Notebook. That Notebook will define the pipeline used to create the `DAILY_CITY_METRICS` table."
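Note: the `DEV_06_*` and `DEV_07_*` Notebooks referenced above report their progress through Python's standard `logging` module using the logger name `demo_logger`. Because Step 03 pointed the account at the `DEMO_DB.INTEGRATIONS.DEMO_EVENTS` event table and set `LOG_LEVEL = INFO` on `DEMO_DB`, those records land in the event table. A minimal sketch of the pattern as it appears inside those notebooks (shown here for context only, not additional required setup):

```python
# Minimal sketch of the logging pattern used inside the 06/07 notebooks.
# Records written this way are captured in DEMO_DB.INTEGRATIONS.DEMO_EVENTS
# because Step 03 set the account event table and DEMO_DB's LOG_LEVEL to INFO.
import logging

logger = logging.getLogger("demo_logger")
logger.info("07_load_daily_city_metrics start")
```

The cell that follows pulls those records back out of the event table, filtering on `SCOPE['name'] = 'demo_logger'`.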
142 | }, 143 | { 144 | "cell_type": "code", 145 | "id": "63bfff6b-067e-4f24-8424-19d0231c0ee1", 146 | "metadata": { 147 | "language": "sql", 148 | "name": "sql_step07_logs" 149 | }, 150 | "outputs": [], 151 | "source": "USE ROLE DEMO_ROLE;\nUSE WAREHOUSE DEMO_WH;\nUSE SCHEMA DEMO_DB.INTEGRATIONS;\n\nSELECT TOP 100\n RECORD['severity_text'] AS SEVERITY,\n VALUE AS MESSAGE\nFROM\n DEMO_DB.INTEGRATIONS.DEMO_EVENTS\nWHERE 1 = 1\n AND SCOPE['name'] = 'demo_logger'\n AND RECORD_TYPE = 'LOG';", 152 | "execution_count": null 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "c873a1db-1fbe-4a44-b02a-03e1b2084cb2", 157 | "metadata": { 158 | "name": "md_step08", 159 | "collapsed": false 160 | }, 161 | "source": "## Step 08 Orchestrate Pipelines\n\nIn this step we will create a DAG (or Directed Acyclic Graph) of Tasks using the new [Snowflake Python Management API](https://docs.snowflake.com/en/developer-guide/snowflake-python-api/snowflake-python-overview). The Task DAG API builds upon the Python Management API to provide advanced Task management capabilities. For more details see [Managing Snowflake tasks and task graphs with Python](https://docs.snowflake.com/en/developer-guide/snowflake-python-api/snowflake-python-managing-tasks).\n\nThis code is also available in the `scripts/deploy_task_dag.py` script which could be used to automate the Task DAG deployment." 162 | }, 163 | { 164 | "cell_type": "code", 165 | "id": "bdac9ad2-2858-4fb7-b3a2-6394db5b0b4c", 166 | "metadata": { 167 | "language": "python", 168 | "name": "py_step08_imports" 169 | }, 170 | "outputs": [], 171 | "source": "# Import python packages\nfrom snowflake.core import Root\n\n# We can also use Snowpark for our analyses!\nfrom snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n\nsession.use_role(\"DEMO_ROLE\")\nsession.use_warehouse(\"DEMO_WH\")", 172 | "execution_count": null 173 | }, 174 | { 175 | "cell_type": "code", 176 | "id": "cb2bbc8e-c525-4cd0-b147-a832a9060c47", 177 | "metadata": { 178 | "language": "python", 179 | "name": "py_step08_set_env" 180 | }, 181 | "outputs": [], 182 | "source": "database_name = \"DEMO_DB\"\nschema_name = \"DEV_SCHEMA\"\n#schema_name = \"PROD_SCHEMA\"\nenv = 'PROD' if schema_name == 'PROD_SCHEMA' else 'DEV'\n\nsession.use_schema(f\"{database_name}.{schema_name}\")", 183 | "execution_count": null 184 | }, 185 | { 186 | "cell_type": "code", 187 | "id": "0c030976-5888-4d9f-a329-3248b25abd78", 188 | "metadata": { 189 | "language": "python", 190 | "name": "py_step08_create_dag" 191 | }, 192 | "outputs": [], 193 | "source": "from snowflake.core.task.dagv1 import DAGOperation, DAG, DAGTask\nfrom datetime import timedelta\n\n# Create the tasks using the DAG API\nwarehouse_name = \"DEMO_WH\"\ndag_name = \"DEMO_DAG\"\n\napi_root = Root(session)\nschema = api_root.databases[database_name].schemas[schema_name]\ndag_op = DAGOperation(schema)\n\n# Define the DAG\nwith DAG(dag_name, schedule=timedelta(days=1), warehouse=warehouse_name) as dag:\n dag_task1 = DAGTask(\"LOAD_EXCEL_FILES_TASK\", definition=f'''EXECUTE NOTEBOOK \"{database_name}\".\"{schema_name}\".\"{env}_06_load_excel_files\"()''', warehouse=warehouse_name)\n dag_task2 = DAGTask(\"LOAD_DAILY_CITY_METRICS\", definition=f'''EXECUTE NOTEBOOK \"{database_name}\".\"{schema_name}\".\"{env}_07_load_daily_city_metrics\"()''', warehouse=warehouse_name)\n\n # Define the dependencies between the tasks\n dag_task1 >> dag_task2 # dag_task1 is a predecessor of dag_task2\n\n# Create the DAG in 
Snowflake\ndag_op.deploy(dag, mode=\"orreplace\")", 194 | "execution_count": null 195 | }, 196 | { 197 | "cell_type": "code", 198 | "id": "f560c909-5523-4a12-ad21-0b9044bdaff6", 199 | "metadata": { 200 | "language": "python", 201 | "name": "py_step08_run_dag" 202 | }, 203 | "outputs": [], 204 | "source": "dagiter = dag_op.iter_dags(like='demo_dag%')\nfor dag_name in dagiter:\n print(dag_name)\n\n#dag_op.run(dag)", 205 | "execution_count": null 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "id": "8ad46ffc-1137-43dc-add8-7fc02914bbaa", 210 | "metadata": { 211 | "name": "md_step09", 212 | "collapsed": false 213 | }, 214 | "source": "## Step 09 Deploy to Production\n\nSteps\n1. Make a small change to a notebook and commit it to the dev branch\n1. Go into GitHub and create a PR and Merge to main branch\n1. Review GitHub Actions workflow definition and run results\n1. See new \"PROD_\" versions of the Notebooks\n1. Deploy the production version of the task DAG\n1. Run production version of the task DAG and see new tables created!" 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "ba497c01-0988-4c07-af66-79ee2918cffa", 219 | "metadata": { 220 | "name": "md_step10", 221 | "collapsed": false 222 | }, 223 | "source": "## Step 10 Teardown\n\nFinally, we will tear down our demo environment." 224 | }, 225 | { 226 | "cell_type": "code", 227 | "id": "f47ca116-4585-4668-bb72-cf74b0e7b587", 228 | "metadata": { 229 | "language": "sql", 230 | "name": "sql_step10" 231 | }, 232 | "outputs": [], 233 | "source": "USE ROLE ACCOUNTADMIN;\n\nDROP API INTEGRATION DEMO_GITHUB_API_INTEGRATION;\nDROP DATABASE DEMO_DB;\nDROP WAREHOUSE DEMO_WH;\nDROP ROLE DEMO_ROLE;\n\n-- Drop the weather share\nDROP DATABASE FROSTBYTE_WEATHERSOURCE;\n\n-- Remove the \"dev\" branch in your repo", 234 | "execution_count": null 235 | } 236 | ] 237 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering with Notebooks 2 | This repository contains the code for the *Data Engineering with Notebooks* Snowflake Quickstart. 
3 | 4 | ### ➡️ For overview, prerequisites, and to learn more, complete this end-to-end tutorial [Data Engineering with Notebooks](https://quickstarts.snowflake.com/guide/data_engineering_with_notebooks/index.html?index=..%2F..index#0) on quickstarts.snowflake.com. 5 | 6 | ___ 7 | Here is an overview of what we'll build in this tutorial: 8 | 9 | 10 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: snowflake-demo 2 | channels: 3 | - https://repo.anaconda.com/pkgs/snowflake 4 | - nodefaults 5 | dependencies: 6 | - python=3.9 7 | - pip 8 | - openssl=1.1.1 9 | - pip: 10 | # Snowflake 11 | - snowflake-cli-labs 12 | - snowflake-snowpark-python 13 | - snowflake 14 | -------------------------------------------------------------------------------- /images/quickstart_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/sfguide-data-engineering-with-notebooks/39e28f8bcaa90f38fcabe2ecb1fd5a8d58e24dcb/images/quickstart_overview.png -------------------------------------------------------------------------------- /notebooks/06_load_excel_files/06_load_excel_files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "c1970118-7b46-4dcf-acd2-cd8836d14408", 14 | "metadata": { 15 | "name": "md_overview", 16 | "collapsed": false 17 | }, 18 | "source": "# 06 Load Excel Files\n\n* Author: Jeremiah Hansen\n* Last Updated: 10/25/2024\n\nThis notebook will load data into the `LOCATION` and `ORDER_DETAIL` tables from Excel files.\n\nThis currently does not use Snowpark File Access as it doesn't yet work in Notebooks. So for now we copy the file locally first." 
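If Snowpark file access does become available in Notebooks, the local-copy step could be dropped and the staged workbook streamed directly. This is a hypothetical sketch only, assuming `SnowflakeFile.open()` can read the stage path from a Notebook session; the quickstart's actual, working approach follows in the cells below.

```python
# Hypothetical alternative to the local-copy approach, assuming SnowflakeFile
# is usable from a Notebook session; it streams the staged Excel file directly.
import pandas as pd
from openpyxl import load_workbook
from snowflake.snowpark.files import SnowflakeFile

def load_excel_worksheet_to_table_streamed(session, stage_file_path, worksheet_name, target_table):
    # require_scoped_url=False allows a plain @stage path instead of a scoped URL
    with SnowflakeFile.open(stage_file_path, 'rb', require_scoped_url=False) as f:
        workbook = load_workbook(f)
        sheet = workbook[worksheet_name]
        data = sheet.values
        columns = next(data)                      # first row is the header
        df = pd.DataFrame(data, columns=columns)  # remaining rows are data
    session.create_dataframe(df).write.mode("overwrite").save_as_table(target_table)
    return True
```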
19 | }, 20 | { 21 | "cell_type": "code", 22 | "id": "8873bc96-287b-4f47-a929-013c1487a088", 23 | "metadata": { 24 | "language": "sql", 25 | "name": "sql_get_context" 26 | }, 27 | "outputs": [], 28 | "source": "-- This won't be needed when we can pass variables to Notebooks!\nSELECT current_database() AS DATABASE_NAME, current_schema() AS SCHEMA_NAME", 29 | "execution_count": null 30 | }, 31 | { 32 | "cell_type": "code", 33 | "id": "3775908f-ca36-4846-8f38-5adca39217f2", 34 | "metadata": { 35 | "language": "python", 36 | "name": "py_imports", 37 | "collapsed": false 38 | }, 39 | "source": "# Import python packages\nimport logging\nimport pandas as pd\n\nlogger = logging.getLogger(\"demo_logger\")\n\n# Get the target database and schema using the results from the SQL cell above\n# This won't be needed when we can pass variables to Notebooks!\ncurrent_context_df = cells.sql_get_context.to_pandas()\ndatabase_name = current_context_df.iloc[0,0]\nschema_name = current_context_df.iloc[0,1]\n\n# We can also use Snowpark for our analyses!\nfrom snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n#session.use_schema(f\"{database_name}.{schema_name}\")\n\nlogger.info(\"06_load_excel_files start\")", 40 | "execution_count": null, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "id": "413fad43-3fec-4379-b34b-7e1728599a7a", 46 | "metadata": { 47 | "language": "sql", 48 | "name": "sql_get_spreadsheets", 49 | "collapsed": false 50 | }, 51 | "outputs": [], 52 | "source": "-- Temporary solution to load in the metadata; this should be replaced with a direct query to a directory table (or a metadata table)\nSELECT '@INTEGRATIONS.FROSTBYTE_RAW_STAGE/intro/order_detail.xlsx' AS STAGE_FILE_PATH, 'order_detail' AS WORKSHEET_NAME, 'ORDER_DETAIL' AS TARGET_TABLE\nUNION\nSELECT '@INTEGRATIONS.FROSTBYTE_RAW_STAGE/intro/location.xlsx', 'location', 'LOCATION';", 53 | "execution_count": null 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "07fd7441-1c12-4195-a7cd-f04fcc3e4242", 58 | "metadata": { 59 | "name": "md_function", 60 | "collapsed": false 61 | }, 62 | "source": "## Create a function to load Excel worksheet to table\n\nCreate a reusable function to load an Excel worksheet to a table in Snowflake.\n\nNote: Until we can use the `SnowflakeFile` class in Notebooks, we need to temporarily copy the file to a local temp folder and then process from there."
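For a one-off reload of a single worksheet during development, the helper defined in the next cell can also be called directly. This is a hypothetical usage sketch; the stage path, worksheet name, and target table come from the metadata query above.

```python
# Hypothetical one-off call to the helper defined in the next cell;
# stage path, worksheet, and target table come from the sql_get_spreadsheets cell.
load_excel_worksheet_to_table_local(
    session,
    stage_file_path="@INTEGRATIONS.FROSTBYTE_RAW_STAGE/intro/location.xlsx",
    worksheet_name="location",
    target_table="LOCATION",
)
```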
63 | }, 64 | { 65 | "cell_type": "code", 66 | "id": "d92d7957-762a-49ae-95e6-8b407ddba0f6", 67 | "metadata": { 68 | "language": "python", 69 | "name": "py_load_excel_function", 70 | "collapsed": false 71 | }, 72 | "outputs": [], 73 | "source": "import os\nfrom openpyxl import load_workbook\n\ndef load_excel_worksheet_to_table_local(session, stage_file_path, worksheet_name, target_table):\n local_directory = \"./\"\n file_name = os.path.basename(stage_file_path)\n\n # First copy file from stage to local storage\n get_status = session.file.get(stage_file_path, local_directory)\n\n with open(f\"{local_directory}{file_name}\", 'rb') as f:\n workbook = load_workbook(f)\n sheet = workbook[worksheet_name]\n data = sheet.values\n\n # Get the first line in file as a header line\n columns = next(data)[0:]\n # Create a DataFrame based on the second and subsequent lines of data\n df = pd.DataFrame(data, columns=columns)\n \n df2 = session.create_dataframe(df)\n df2.write.mode(\"overwrite\").save_as_table(target_table)\n \n return True", 74 | "execution_count": null 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "97c2fc79-50d4-4a81-af5d-5c80d37070ec", 79 | "metadata": { 80 | "name": "md_process_spreadsheets", 81 | "collapsed": false 82 | }, 83 | "source": "## Process all Excel worksheets\n\nLoop through each Excel worksheet to process and call our `load_excel_worksheet_to_table_local()` function." 84 | }, 85 | { 86 | "cell_type": "code", 87 | "id": "4e73f895-6b24-4ce9-b357-7a9a879be1e4", 88 | "metadata": { 89 | "language": "python", 90 | "name": "py_process_spreadsheets" 91 | }, 92 | "outputs": [], 93 | "source": "# Process each file from the sql_get_spreadsheets cell above\nfiles_to_load = cells.sql_get_spreadsheets.to_pandas()\nfor index, excel_file in files_to_load.iterrows():\n logger.info(f\"Processing Excel file {excel_file['STAGE_FILE_PATH']}\")\n load_excel_worksheet_to_table_local(session, excel_file['STAGE_FILE_PATH'], excel_file['WORKSHEET_NAME'], excel_file['TARGET_TABLE'])\n\nlogger.info(\"06_load_excel_files end\")", 94 | "execution_count": null 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "16d6be04-3690-4c5d-91ee-5d0d425355b8", 99 | "metadata": { 100 | "name": "md_debugging", 101 | "collapsed": false 102 | }, 103 | "source": "### Debugging" 104 | }, 105 | { 106 | "cell_type": "code", 107 | "id": "a878dd75-f426-427f-bbef-e5401097d9d6", 108 | "metadata": { 109 | "language": "sql", 110 | "name": "sql_debugging" 111 | }, 112 | "outputs": [], 113 | "source": "--DESCRIBE TABLE LOCATION;\n--SELECT * FROM LOCATION;\n--SHOW TABLES;", 114 | "execution_count": null 115 | } 116 | ] 117 | } -------------------------------------------------------------------------------- /notebooks/06_load_excel_files/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - openpyxl 6 | -------------------------------------------------------------------------------- /notebooks/07_load_daily_city_metrics/07_load_daily_city_metrics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | }, 7 | "lastEditStatus": { 8 | "notebookId": "kss7vquztn5742emuo3m", 9 | "authorId": "350843732387", 10 | "authorName": "JOHN", 11 | "authorEmail": "jeremiah.hansen@snowflake.com", 12 | "sessionId": "d322f813-fe43-46ce-a12e-8f6586bfa9bf", 13 | 
"lastEditTime": 1744841249680 14 | } 15 | }, 16 | "nbformat_minor": 5, 17 | "nbformat": 4, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "id": "4132a4ef-a90f-4aa4-b334-d7b1aeb2e91e", 22 | "metadata": { 23 | "name": "md_overview", 24 | "collapsed": false 25 | }, 26 | "source": "# 07 Load Daily City Metrics\n\n* Author: Jeremiah Hansen\n* Last Updated: 6/11/2024\n\nThis notebook will load data into the `DAILY_CITY_METRICS` table with support for incremental processing." 27 | }, 28 | { 29 | "cell_type": "code", 30 | "id": "1c47f41d-b110-4662-a907-fb9d0566fe94", 31 | "metadata": { 32 | "language": "sql", 33 | "name": "sql_get_context" 34 | }, 35 | "outputs": [], 36 | "source": "-- This won't be needed when we can pass variables to Notebooks!\nSELECT current_database() AS DATABASE_NAME, current_schema() AS SCHEMA_NAME", 37 | "execution_count": null 38 | }, 39 | { 40 | "cell_type": "code", 41 | "id": "2283d2ff-6b0e-479c-9c1d-3d6066043d04", 42 | "metadata": { 43 | "language": "python", 44 | "name": "py_imports", 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": "# Import python packages\nimport logging\nfrom snowflake.core import Root\n\nlogger = logging.getLogger(\"demo_logger\")\n\n# Get the target database and schema using the results from the SQL cell above\n# This won't be needed when we can pass variables to Notebooks!\ncurrent_context_df = cells.sql_get_context.to_pandas()\ndatabase_name = current_context_df.iloc[0,0]\nschema_name = current_context_df.iloc[0,1]\n\n# We can also use Snowpark for our analyses!\nfrom snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n#session.use_schema(f\"{database_name}.{schema_name}\")\n\nlogger.info(\"07_load_daily_city_metrics start\")", 49 | "execution_count": null 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "2dd608eb-bc1f-45a9-81bb-35da23528eed", 54 | "metadata": { 55 | "name": "md_function", 56 | "collapsed": false 57 | }, 58 | "source": "## Create a function to check if a table exists\n\nThis function uses the [Snowflake Python Management API](https://docs.snowflake.com/en/developer-guide/snowflake-python-api/snowflake-python-overview)." 
59 | }, 60 | { 61 | "cell_type": "code", 62 | "id": "f9b7500f-5c4f-4c87-a14f-542427705e07", 63 | "metadata": { 64 | "language": "python", 65 | "name": "py_table_exists" 66 | }, 67 | "outputs": [], 68 | "source": "def table_exists(session, database_name='', schema_name='', table_name=''):\n root = Root(session)\n tables = root.databases[database_name].schemas[schema_name].tables.iter(like=table_name)\n for table_obj in tables:\n if table_obj.name == table_name:\n return True\n\n return False\n\n# Not used, SQL alternative to Python version above\ndef table_exists2(session, database_name='', schema_name='', table_name=''):\n exists = session.sql(\"SELECT EXISTS (SELECT * FROM {}.INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '{}' AND TABLE_NAME = '{}') AS TABLE_EXISTS\".format(database_name, schema_name, table_name)).collect()[0]['TABLE_EXISTS']\n return exists", 69 | "execution_count": null 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "37822d24-6c8f-4afe-b010-ac1e7f4a9fdf", 74 | "metadata": { 75 | "name": "md_pipeline", 76 | "collapsed": false 77 | }, 78 | "source": "## Pipeline to update daily_city_metrics" 79 | }, 80 | { 81 | "cell_type": "code", 82 | "id": "93b0b39d-d272-46a5-a367-93bccd2f7a80", 83 | "metadata": { 84 | "language": "python", 85 | "name": "py_process_dcm", 86 | "collapsed": false, 87 | "codeCollapsed": false 88 | }, 89 | "outputs": [], 90 | "source": "import snowflake.snowpark.functions as F\n\ntable_name = \"DAILY_CITY_METRICS\"\n\n# Define the tables\norder_detail = session.table(\"ORDER_DETAIL\")\nhistory_day = session.table(\"FROSTBYTE_WEATHERSOURCE.ONPOINT_ID.HISTORY_DAY\")\nlocation = session.table(\"LOCATION\")\n\n# Join the tables\norder_detail = order_detail.join(location, order_detail['LOCATION_ID'] == location['LOCATION_ID'])\norder_detail = order_detail.join(history_day, (F.builtin(\"DATE\")(order_detail['ORDER_TS']) == history_day['DATE_VALID_STD']) & (location['ISO_COUNTRY_CODE'] == history_day['COUNTRY']) & (location['CITY'] == history_day['CITY_NAME']))\n\n# Aggregate the data\nfinal_agg = order_detail.group_by(F.col('DATE_VALID_STD'), F.col('CITY_NAME'), F.col('ISO_COUNTRY_CODE')) \\\n .agg( \\\n F.sum('PRICE').alias('DAILY_SALES_SUM'), \\\n F.avg('AVG_TEMPERATURE_AIR_2M_F').alias(\"AVG_TEMPERATURE_F\"), \\\n F.avg(\"TOT_PRECIPITATION_IN\").alias(\"AVG_PRECIPITATION_IN\"), \\\n ) \\\n .select(F.col(\"DATE_VALID_STD\").alias(\"DATE\"), F.col(\"CITY_NAME\"), F.col(\"ISO_COUNTRY_CODE\").alias(\"COUNTRY_DESC\"), \\\n F.builtin(\"ZEROIFNULL\")(F.col(\"DAILY_SALES_SUM\")).alias(\"DAILY_SALES\"), \\\n F.round(F.col(\"AVG_TEMPERATURE_F\"), 2).alias(\"AVG_TEMPERATURE_FAHRENHEIT\"), \\\n F.round(F.col(\"AVG_PRECIPITATION_IN\"), 2).alias(\"AVG_PRECIPITATION_INCHES\"), \\\n )\n\n# If the table doesn't exist then create it\nif not table_exists(session, database_name=database_name, schema_name=schema_name, table_name=table_name):\n final_agg.write.mode(\"overwrite\").save_as_table(table_name)\n\n logger.info(f\"Successfully created {table_name}\")\n# Otherwise update it\nelse:\n cols_to_update = {c: final_agg[c] for c in final_agg.schema.names}\n\n dcm = session.table(table_name)\n dcm.merge(final_agg, (dcm['DATE'] == final_agg['DATE']) & (dcm['CITY_NAME'] == final_agg['CITY_NAME']) & (dcm['COUNTRY_DESC'] == final_agg['COUNTRY_DESC']), \\\n [F.when_matched().update(cols_to_update), F.when_not_matched().insert(cols_to_update)])\n\n logger.info(f\"Successfully updated {table_name}\")\n", 91 | "execution_count": null 92 | }, 93 | { 94 | "cell_type": "markdown", 
95 | "id": "35b06e41-3330-43db-8026-02dfc8d8ecac", 96 | "metadata": { 97 | "name": "md_debugging", 98 | "collapsed": false 99 | }, 100 | "source": "## Debugging" 101 | }, 102 | { 103 | "cell_type": "code", 104 | "id": "df8e0cb8-3c80-4bd7-87e2-88526e3377ff", 105 | "metadata": { 106 | "language": "sql", 107 | "name": "sql_debugging", 108 | "collapsed": false 109 | }, 110 | "outputs": [], 111 | "source": "SELECT * FROM DAILY_CITY_METRICS LIMIT 10;", 112 | "execution_count": null 113 | } 114 | ] 115 | } -------------------------------------------------------------------------------- /notebooks/07_load_daily_city_metrics/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - snowflake 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | snowflake-cli-labs 2 | snowflake-snowpark-python 3 | snowflake 4 | -------------------------------------------------------------------------------- /scripts/deploy_notebooks.sql: -------------------------------------------------------------------------------- 1 | --!jinja 2 | 3 | /*----------------------------------------------------------------------------- 4 | Hands-On Lab: Intro to Data Engineering with Notebooks 5 | Script: deploy_notebooks.sql 6 | Author: Jeremiah Hansen 7 | Last Updated: 6/11/2024 8 | -----------------------------------------------------------------------------*/ 9 | 10 | -- See https://docs.snowflake.com/en/LIMITEDACCESS/execute-immediate-from-template 11 | 12 | -- Create the Notebooks 13 | --USE SCHEMA {{env}}_SCHEMA; 14 | 15 | CREATE OR REPLACE NOTEBOOK IDENTIFIER('"DEMO_DB"."{{env}}_SCHEMA"."{{env}}_06_load_excel_files"') 16 | FROM '@"DEMO_DB"."INTEGRATIONS"."DEMO_GIT_REPO"/branches/"{{branch}}"/notebooks/06_load_excel_files/' 17 | QUERY_WAREHOUSE = 'DEMO_WH' 18 | MAIN_FILE = '06_load_excel_files.ipynb'; 19 | 20 | ALTER NOTEBOOK "DEMO_DB"."{{env}}_SCHEMA"."{{env}}_06_load_excel_files" ADD LIVE VERSION FROM LAST; 21 | 22 | CREATE OR REPLACE NOTEBOOK IDENTIFIER('"DEMO_DB"."{{env}}_SCHEMA"."{{env}}_07_load_daily_city_metrics"') 23 | FROM '@"DEMO_DB"."INTEGRATIONS"."DEMO_GIT_REPO"/branches/"{{branch}}"/notebooks/07_load_daily_city_metrics/' 24 | QUERY_WAREHOUSE = 'DEMO_WH' 25 | MAIN_FILE = '07_load_daily_city_metrics.ipynb'; 26 | 27 | ALTER NOTEBOOK "DEMO_DB"."{{env}}_SCHEMA"."{{env}}_07_load_daily_city_metrics" ADD LIVE VERSION FROM LAST; 28 | -------------------------------------------------------------------------------- /scripts/deploy_task_dag.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------ 2 | # Hands-On Lab: Intro to Data Engineering with Notebooks 3 | # Script: deploy_task_dag.py 4 | # Author: Jeremiah Hansen 5 | # Last Updated: 6/11/2024 6 | #------------------------------------------------------------------------------ 7 | 8 | 9 | from snowflake.snowpark import Session 10 | from snowflake.core import Root 11 | from snowflake.core.task.dagv1 import DAGOperation, DAG, DAGTask 12 | from datetime import timedelta 13 | 14 | # Create the tasks using the DAG API 15 | def main(session: Session, database_name, schema_name) -> str: 16 | # Set the environment context 17 | env = 'PROD' if schema_name == 'PROD_SCHEMA' else 'DEV' 18 | 
session.use_schema(f"{database_name}.{schema_name}") 19 | 20 | warehouse_name = "DEMO_WH" 21 | dag_name = "DEMO_DAG" 22 | api_root = Root(session) 23 | schema = api_root.databases[database_name].schemas[schema_name] 24 | dag_op = DAGOperation(schema) 25 | 26 | # Define the DAG 27 | with DAG(dag_name, schedule=timedelta(days=1), warehouse=warehouse_name) as dag: 28 | dag_task1 = DAGTask("LOAD_EXCEL_FILES_TASK", definition=f'''EXECUTE NOTEBOOK "{database_name}"."{schema_name}"."{env}_06_load_excel_files"()''', warehouse=warehouse_name) 29 | dag_task2 = DAGTask("LOAD_DAILY_CITY_METRICS", definition=f'''EXECUTE NOTEBOOK "{database_name}"."{schema_name}"."{env}_07_load_daily_city_metrics"()''', warehouse=warehouse_name) 30 | 31 | # Define the dependencies between the tasks 32 | dag_task1 >> dag_task2 # dag_task1 is a predecessor of dag_task2 33 | 34 | # Create the DAG in Snowflake 35 | dag_op.deploy(dag, mode="orreplace") 36 | 37 | #dag_op.run(dag) 38 | 39 | 40 | # For local debugging 41 | if __name__ == "__main__": 42 | import sys 43 | 44 | # Create a local Snowpark session 45 | with Session.builder.getOrCreate() as session: 46 | if len(sys.argv) > 1: 47 | print(main(session, *sys.argv[1:])) # type: ignore 48 | else: 49 | print(main(session)) # type: ignore 50 | -------------------------------------------------------------------------------- /scripts/setup.sql: -------------------------------------------------------------------------------- 1 | /*----------------------------------------------------------------------------- 2 | Hands-On Lab: Intro to Data Engineering with Notebooks 3 | Script: setup.sql 4 | Author: Jeremiah Hansen 5 | Last Updated: 6/11/2024 6 | -----------------------------------------------------------------------------*/ 7 | 8 | SET MY_USER = CURRENT_USER(); 9 | SET GITHUB_SECRET_USERNAME = 'username'; 10 | SET GITHUB_SECRET_PASSWORD = 'personal access token'; 11 | SET GITHUB_URL_PREFIX = 'https://github.com/username'; 12 | SET GITHUB_REPO_ORIGIN = 'https://github.com/username/sfguide-data-engineering-with-notebooks.git'; 13 | 14 | 15 | -- ---------------------------------------------------------------------------- 16 | -- Create the account level objects (ACCOUNTADMIN part) 17 | -- ---------------------------------------------------------------------------- 18 | USE ROLE ACCOUNTADMIN; 19 | 20 | -- Roles 21 | CREATE OR REPLACE ROLE DEMO_ROLE; 22 | GRANT ROLE DEMO_ROLE TO ROLE SYSADMIN; 23 | GRANT ROLE DEMO_ROLE TO USER IDENTIFIER($MY_USER); 24 | 25 | GRANT CREATE INTEGRATION ON ACCOUNT TO ROLE DEMO_ROLE; 26 | GRANT EXECUTE TASK ON ACCOUNT TO ROLE DEMO_ROLE; 27 | GRANT EXECUTE MANAGED TASK ON ACCOUNT TO ROLE DEMO_ROLE; 28 | GRANT MONITOR EXECUTION ON ACCOUNT TO ROLE DEMO_ROLE; 29 | GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE DEMO_ROLE; 30 | 31 | -- Databases 32 | CREATE OR REPLACE DATABASE DEMO_DB; 33 | GRANT OWNERSHIP ON DATABASE DEMO_DB TO ROLE DEMO_ROLE; 34 | 35 | -- Warehouses 36 | CREATE OR REPLACE WAREHOUSE DEMO_WH WAREHOUSE_SIZE = XSMALL, AUTO_SUSPEND = 300, AUTO_RESUME = TRUE; 37 | GRANT OWNERSHIP ON WAREHOUSE DEMO_WH TO ROLE DEMO_ROLE; 38 | 39 | 40 | -- ---------------------------------------------------------------------------- 41 | -- Create the database level objects 42 | -- ---------------------------------------------------------------------------- 43 | USE ROLE DEMO_ROLE; 44 | USE WAREHOUSE DEMO_WH; 45 | USE DATABASE DEMO_DB; 46 | 47 | -- Schemas 48 | CREATE OR REPLACE SCHEMA INTEGRATIONS; 49 | CREATE OR REPLACE SCHEMA DEV_SCHEMA; 50 | CREATE OR 
REPLACE SCHEMA PROD_SCHEMA; 51 | 52 | USE SCHEMA INTEGRATIONS; 53 | 54 | -- External Frostbyte objects 55 | CREATE OR REPLACE STAGE FROSTBYTE_RAW_STAGE 56 | URL = 's3://sfquickstarts/data-engineering-with-snowpark-python/' 57 | ; 58 | 59 | -- Secrets (schema level) 60 | CREATE OR REPLACE SECRET DEMO_GITHUB_SECRET 61 | TYPE = password 62 | USERNAME = $GITHUB_SECRET_USERNAME 63 | PASSWORD = $GITHUB_SECRET_PASSWORD; 64 | 65 | -- API Integration (account level) 66 | -- This depends on the schema level secret! 67 | CREATE OR REPLACE API INTEGRATION DEMO_GITHUB_API_INTEGRATION 68 | API_PROVIDER = GIT_HTTPS_API 69 | API_ALLOWED_PREFIXES = ($GITHUB_URL_PREFIX) 70 | ALLOWED_AUTHENTICATION_SECRETS = (DEMO_GITHUB_SECRET) 71 | ENABLED = TRUE; 72 | 73 | -- Create the "dev" branch in your repo 74 | 75 | -- Git Repository 76 | CREATE OR REPLACE GIT REPOSITORY DEMO_GIT_REPO 77 | API_INTEGRATION = DEMO_GITHUB_API_INTEGRATION 78 | GIT_CREDENTIALS = DEMO_GITHUB_SECRET 79 | ORIGIN = $GITHUB_REPO_ORIGIN; 80 | 81 | 82 | -- ---------------------------------------------------------------------------- 83 | -- Create the event table 84 | -- ---------------------------------------------------------------------------- 85 | USE ROLE ACCOUNTADMIN; 86 | 87 | CREATE EVENT TABLE DEMO_DB.INTEGRATIONS.DEMO_EVENTS; 88 | GRANT SELECT ON EVENT TABLE DEMO_DB.INTEGRATIONS.DEMO_EVENTS TO ROLE DEMO_ROLE; 89 | GRANT INSERT ON EVENT TABLE DEMO_DB.INTEGRATIONS.DEMO_EVENTS TO ROLE DEMO_ROLE; 90 | 91 | ALTER ACCOUNT SET EVENT_TABLE = DEMO_DB.INTEGRATIONS.DEMO_EVENTS; 92 | ALTER DATABASE DEMO_DB SET LOG_LEVEL = INFO; 93 | -------------------------------------------------------------------------------- /scripts/teardown.sql: -------------------------------------------------------------------------------- 1 | /*----------------------------------------------------------------------------- 2 | Hands-On Lab: Intro to Data Engineering with Notebooks 3 | Script: teardown.sql 4 | Author: Jeremiah Hansen 5 | Last Updated: 6/11/2024 6 | -----------------------------------------------------------------------------*/ 7 | 8 | 9 | USE ROLE ACCOUNTADMIN; 10 | 11 | DROP API INTEGRATION DEMO_GITHUB_API_INTEGRATION; 12 | DROP DATABASE DEMO_DB; 13 | DROP WAREHOUSE DEMO_WH; 14 | DROP ROLE DEMO_ROLE; 15 | 16 | -- Drop the weather share 17 | DROP DATABASE FROSTBYTE_WEATHERSOURCE; 18 | 19 | -- Remove the "dev" branch in your repo 20 | --------------------------------------------------------------------------------
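A closing note on automation: the GitHub Actions workflow above remarks that, ideally, both Snow CLI and the Python Connector could be configured entirely from environment variables. Below is a hypothetical sketch of driving the task DAG deployment in `scripts/deploy_task_dag.py` from the same `SNOWFLAKE_*` variables the workflow exports; the variable names are assumed to match the workflow secrets, and the import assumes the command is run from the repository root.

```python
# Hypothetical local/CI runner, assuming the SNOWFLAKE_* environment variables
# used by the GitHub Actions workflow are exported in the current shell.
import os
from snowflake.snowpark import Session

connection_parameters = {
    "account":   os.environ["SNOWFLAKE_ACCOUNT"],
    "user":      os.environ["SNOWFLAKE_USER"],
    "password":  os.environ["SNOWFLAKE_PASSWORD"],
    "role":      os.environ["SNOWFLAKE_ROLE"],
    "warehouse": os.environ["SNOWFLAKE_WAREHOUSE"],
    "database":  os.environ["SNOWFLAKE_DATABASE"],
    "schema":    os.environ["SNOWFLAKE_SCHEMA"],
}

with Session.builder.configs(connection_parameters).create() as session:
    # Reuse the deployment logic from scripts/deploy_task_dag.py
    # (assumes the repository root is the working directory, so scripts/ is importable)
    from scripts.deploy_task_dag import main
    main(session, "DEMO_DB", "PROD_SCHEMA")
```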