├── .github └── workflows │ └── python-app.yml ├── .gitignore ├── Access External Endpoints ├── Access External Endpoints.ipynb ├── environment.yml └── setup.sql ├── Anomaly Detection with Snowflake ML Functions └── Anomaly Detection with Snowflake ML Functions.ipynb ├── ArcGIS_Snowflake ├── ARCGIS_SERVICEAREA.ipynb └── environment.yml ├── Avalanche-Customer-Review-Analytics ├── Avalanche-Customer-Review-Analytics.ipynb ├── customer_reviews.csv ├── customer_reviews_docx.zip ├── environment.yml └── setup.sql ├── Bioinformatics_Solubility_Dashboard ├── Bioinformatics_Solubility_Dashboard.ipynb ├── delaney_solubility_with_descriptors.csv └── environment.yml ├── Build and Optimize Machine Learning Models with Streamlit ├── Build_and_Optimize_Machine_Learning_Models_with_Streamlit.ipynb └── environment.yml ├── Create and Manage Snowflake Objects like a Pro └── Create and Manage Snowflake Objects like a Pro.ipynb ├── Creating Snowflake Object using Python API ├── Creating Snowflake Object using Python API.ipynb └── environment.yml ├── Dashboard_with_Streamlit ├── Build_a_Dashboard_with_Streamlit_in_Snowflake_Notebooks.ipynb └── environment.yml ├── Data Engineering Pipelines with Snowpark Python └── Data Engineering Pipelines with Snowpark Python.ipynb ├── Data Pipeline Observability ├── finalizer_task_summary_to_html_email.ipynb ├── pipeline_alerts_level_1.ipynb ├── task_graph_run_demo.ipynb └── task_graphs_dmf_quality_checks.ipynb ├── Data_Analysis_with_LLM_RAG ├── Data_Analysis_with_LLM_RAG.ipynb └── environment.yml ├── End-to-End Machine Learning with Snowpark ML ├── 1_sf_nb_snowpark_ml_data_ingest.ipynb └── environment.yml ├── End-to-end ML with Feature Store and Model Registry └── End-to-end ML with Feature Store and Model Registry.ipynb ├── Feature Store API Overview └── Feature Store API Overview.ipynb ├── Feature Store Quickstart └── Feature Store Quickstart.ipynb ├── Fine tuning LLM using Snowflake Cortex AI ├── Fine tuning LLM using Snowflake Cortex AI.ipynb └── environment.yml ├── Getting Started With Snowflake Cortex AI in Snowflake Notebooks └── dash_snowflake_cortex_ai_101_notebook_app.ipynb ├── Getting Started with Container Runtimes ├── README.md ├── assets │ ├── diamonds_upload.png │ ├── eai.png │ ├── notebook_setup.png │ └── notebook_upload.png ├── diamonds.csv └── getting_started_with_container_runtimes.ipynb ├── Getting Started with Snowflake Cortex ML-Based Functions └── Getting Started with Snowflake Cortex ML-Based Functions.ipynb ├── Getting started with Snowpark using Snowflake Notebooks ├── Getting Started with Snowpark using Snowflake notebooks.ipynb └── environment.yml ├── Hyperparameter Tuning with sklearn ├── Hyperparameter Tuning with sklearn.ipynb └── environment.yml ├── Image_Classification_PyTorch └── image_classification_pytorch.ipynb ├── Image_Processing_Pipeline_Stream_Task_Cortex_Complete ├── Image_Processing_Pipeline.ipynb └── Image_Processing_Pipeline.pdf ├── Import Package from Stage ├── Import Package from Stage.ipynb ├── package_from_stage.png ├── simple.zip └── simple │ └── __init__.py ├── Ingest Public JSON └── Ingest Public JSON.ipynb ├── Intro to Snowpark pandas ├── Intro to Snowpark pandas.ipynb └── environment.yml ├── Java User-Defined Functions and Stored Procedures └── Java User-Defined Functions and Stored Procedures.ipynb ├── LICENSE ├── Load CSV from S3 └── Load CSV from S3.ipynb ├── MFA_Audit_of_Users ├── MFA_Audit_of_Users_with_Streamlit_in_Snowflake_Notebooks.ipynb ├── demo_data.csv └── environment.yml ├── ML Lineage Workflows └── ML 
Lineage Workflows.ipynb ├── Manage features in DBT with Feature Store └── Manage features in DBT with Feature Store.ipynb ├── Monitoring_Table_Size_with_Streamlit ├── Monitoring_Table_Size_with_Streamlit.ipynb └── environment.yml ├── My First Notebook Project ├── My First Notebook Project.ipynb └── environment.yml ├── Navigating and Browsing Files ├── Navigating and Browsing Files.ipynb ├── data.csv ├── data.json ├── display.py ├── img │ ├── browse_files.png │ ├── git_diff.png │ ├── git_files.png │ └── upload_files.png └── stats.py ├── Query_Caching_Effectiveness ├── Query_Caching_Effectiveness.ipynb └── environment.yml ├── Query_Cost_Monitoring ├── Query_Cost_Monitoring.ipynb └── environment.yml ├── Query_Performance_Insights ├── Automated_Query_Performance_Insights_in_Snowflake_Notebooks.ipynb └── environment.yml ├── Query_Performance_Insights_using_Streamlit ├── Build_an_Interactive_Query_Performance_App_with_Streamlit.ipynb └── environment.yml ├── RAG Chatbot for KubeCon Sessions └── RAG Chatbot for KubeCon Sessions.ipynb ├── README.md ├── Reference cells and variables └── Reference cells and variables.ipynb ├── Role_Based_Access_Auditing_with_Streamlit ├── Role_Based_Access_Auditing_with_Streamlit.ipynb └── environment.yml ├── Scheduled_Query_Execution_Report ├── Scheduled_Query_Execution_Report.ipynb └── environment.yml ├── Schema_Change_Tracker ├── Schema_Change_Tracker.ipynb └── environment.yml ├── Snowflake_Notebooks_Summit_2024_Demo └── aileen_summit_notebook.ipynb ├── Snowflake_Semantic_View ├── environment.yml └── getting-started-with-snowflake-semantic-view.ipynb ├── Snowflake_Trail_Alerts_Notifications ├── environment.yml ├── screenshot.png └── truck_sentiment_analysis_with_trail.ipynb ├── Streamlit_Zero_To_Hero_Machine_Learning_App ├── Streamlit_Machine_Learning_App.ipynb └── environment.yml ├── Telco Churn Data Analysis ├── Telco Churn Data Analysis.ipynb └── environment.yml ├── Visual Data Stories with Snowflake Notebooks ├── Visual Data Stories with Snowflake Notebooks.ipynb ├── environment.yml └── snowflake-logo.png ├── Warehouse_Utilization_with_Streamlit ├── Warehouse_Utilization_with_Streamlit.ipynb └── environment.yml ├── Working with Files └── Working with Files.ipynb ├── Working with Git ├── Working with Git.ipynb ├── environment.yml └── git_setup.sql └── config.toml /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | name: Python application 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Set up Snowflake connection by putting secrets into config file 17 | env: 18 | SNOWCLI_CONFIG: ${{secrets.SNOWCLI_CONFIG}} 19 | shell: bash 20 | run: | 21 | echo -e "$SNOWCLI_CONFIG" > config.toml 22 | # Snowflake CLI requires the config.toml file to limit its file permissions to read and write for the file owner only 23 | chown $USER config.toml 24 | chmod 0600 config.toml 25 | - name: Snowflake CLI installation 26 | uses: Snowflake-Labs/snowflake-cli-action@v1 27 | with: 28 | cli-version: "latest" 29 | default-config-file-path: "config.toml" 30 | - name: Fetch the latest update from Github 31 | run: | 32 | snow sql -q "ALTER GIT REPOSITORY SFLAB_DEMO_NB FETCH;" 33 | - name: Test Notebook - My First Notebook Project 34 | run: | 35 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_FIRST_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/My 
First Notebook Project/' MAIN_FILE = 'My First Notebook Project.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 36 | snow sql -q "ALTER NOTEBOOK GH_ACTION_FIRST_NB ADD LIVE VERSION FROM LAST;" 37 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_FIRST_NB();" 38 | - name: Test Notebook - Visual Data Stories 39 | run: | 40 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_VISUAL_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Visual Data Stories with Snowflake Notebooks/' MAIN_FILE = 'Visual Data Stories with Snowflake Notebooks.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 41 | snow sql -q "ALTER NOTEBOOK GH_ACTION_VISUAL_NB ADD LIVE VERSION FROM LAST;" 42 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_VISUAL_NB();" 43 | - name: Test Notebook - Ingest Public JSON 44 | run: | 45 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_PUBLIC_JSON_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Ingest Public JSON/' MAIN_FILE = 'Ingest Public JSON.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 46 | snow sql -q "ALTER NOTEBOOK GH_ACTION_PUBLIC_JSON_NB ADD LIVE VERSION FROM LAST;" 47 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_PUBLIC_JSON_NB();" 48 | - name: Test Notebook - Load CSV from S3 49 | run: | 50 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_CSV_S3_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Load CSV from S3/' MAIN_FILE = 'Load CSV from S3.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 51 | snow sql -q "ALTER NOTEBOOK GH_ACTION_CSV_S3_NB ADD LIVE VERSION FROM LAST;" 52 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_CSV_S3_NB();" 53 | - name: Test Notebook - Reference cells and variables 54 | run: | 55 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_CELLREF_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Reference cells and variables/' MAIN_FILE = 'Reference cells and variables.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 56 | snow sql -q "ALTER NOTEBOOK GH_ACTION_CELLREF_NB ADD LIVE VERSION FROM LAST;" 57 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_CELLREF_NB();" 58 | - name: Test Notebook - Working with Files 59 | run: | 60 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_FILES_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Working with Files/' MAIN_FILE = 'Working with Files.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 61 | snow sql -q "ALTER NOTEBOOK GH_ACTION_FILES_NB ADD LIVE VERSION FROM LAST;" 62 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_FILES_NB();" 63 | - name: Test Notebook - Navigating and Browsing Files 64 | run: | 65 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_MULTIFILE_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Navigating and Browsing Files/' MAIN_FILE = 'Navigating and Browsing Files.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 66 | snow sql -q "ALTER NOTEBOOK GH_ACTION_MULTIFILE_NB ADD LIVE VERSION FROM LAST;" 67 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_MULTIFILE_NB();" 68 | - name: Test Notebook - Access External Endpoints 69 | run: | 70 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_EAI_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Access External Endpoints/' MAIN_FILE = 'Access External Endpoints.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 71 | snow sql -q "ALTER NOTEBOOK GH_ACTION_EAI_NB ADD LIVE VERSION FROM LAST;" 72 | snow sql -q "EXECUTE IMMEDIATE FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Access External Endpoints/setup.sql';" 73 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_EAI_NB();" 74 | - name: Test Notebook - Hyperparameter Tuning with sklearn 75 | run: | 76 | snow sql -q 
"CREATE OR REPLACE NOTEBOOK GH_ACTION_SKLEARN_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Hyperparameter Tuning with sklearn/' MAIN_FILE = 'Hyperparameter Tuning with sklearn.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 77 | snow sql -q "ALTER NOTEBOOK GH_ACTION_SKLEARN_NB ADD LIVE VERSION FROM LAST;" 78 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_SKLEARN_NB();" 79 | - name: Test Notebook - Import from Stage 80 | run: | 81 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_STAGE_IMPORT_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Import Package from Stage/' MAIN_FILE = 'Import Package from Stage.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 82 | snow sql -q "ALTER NOTEBOOK GH_ACTION_STAGE_IMPORT_NB ADD LIVE VERSION FROM LAST;" 83 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_STAGE_IMPORT_NB();" 84 | # - name: Test Notebook - Working with Git 85 | # run: | 86 | # snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_GIT_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Working with Git/' MAIN_FILE = 'Working with Git.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 87 | # snow sql -q "ALTER NOTEBOOK GH_ACTION_GIT_NB ADD LIVE VERSION FROM LAST;" 88 | # snow sql -q "EXECUTE NOTEBOOK GH_ACTION_GIT_NB();" 89 | - name: Test Notebook - Create Objects with Python API 90 | run: | 91 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_PYTHONAPI_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Creating Snowflake Object using Python API/' MAIN_FILE = 'Creating Snowflake Object using Python API.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 92 | snow sql -q "ALTER NOTEBOOK GH_ACTION_PYTHONAPI_NB ADD LIVE VERSION FROM LAST;" 93 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_PYTHONAPI_NB();" 94 | - name: Test Notebook - Cortex ML Function 95 | run: | 96 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_CORTEX_MLFUNC_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Getting Started with Snowflake Cortex ML-Based Functions/' MAIN_FILE = 'Getting Started with Snowflake Cortex ML-Based Functions.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 97 | snow sql -q "ALTER NOTEBOOK GH_ACTION_CORTEX_MLFUNC_NB ADD LIVE VERSION FROM LAST;" 98 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_CORTEX_MLFUNC_NB();" 99 | - name: Test Notebook - End-to-End Machine Learning with Snowpark ML (1) 100 | run: | 101 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_SPML1_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/End-to-End Machine Learning with Snowpark ML/' MAIN_FILE = '1_sf_nb_snowpark_ml_data_ingest.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 102 | snow sql -q "ALTER NOTEBOOK GH_ACTION_SPML1_NB ADD LIVE VERSION FROM LAST;" 103 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_SPML1_NB();" 104 | - name: Test Notebook - End-to-End Machine Learning with Snowpark ML (2) 105 | run: | 106 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_SPML2_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/End-to-End Machine Learning with Snowpark ML/' MAIN_FILE = '2_sf_nb_snowpark_ml_feature_transformations.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 107 | snow sql -q "ALTER NOTEBOOK GH_ACTION_SPML2_NB ADD LIVE VERSION FROM LAST;" 108 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_SPML2_NB();" 109 | - name: Test Notebook - End-to-End Machine Learning with Snowpark ML (3) 110 | run: | 111 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_SPML3_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/End-to-End Machine Learning with Snowpark ML/' MAIN_FILE = 
'3_sf_nb_snowpark_ml_model_training_inference.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 112 | snow sql -q "ALTER NOTEBOOK GH_ACTION_SPML3_NB ADD LIVE VERSION FROM LAST;" 113 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_SPML3_NB();" 114 | - name: Test Notebook - Intro to Snowpark pandas 115 | run: | 116 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_PANDAS_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Intro to Snowpark pandas/' MAIN_FILE = 'Intro to Snowpark pandas.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 117 | snow sql -q "ALTER NOTEBOOK GH_ACTION_PANDAS_NB ADD LIVE VERSION FROM LAST;" 118 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_PANDAS_NB();" 119 | - name: Test Notebook - Data Engineering Pipelines with Snowpark Python 120 | run: | 121 | snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_DE_SNOWPARK_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Data Engineering Pipelines with Snowpark Python/' MAIN_FILE = 'Data Engineering Pipelines with Snowpark Python.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 122 | snow sql -q "ALTER NOTEBOOK GH_ACTION_DE_SNOWPARK_NB ADD LIVE VERSION FROM LAST;" 123 | snow sql -q "EXECUTE NOTEBOOK GH_ACTION_DE_SNOWPARK_NB();" 124 | # - name: Test Notebook - Create and Manage Snowflake Objects like a Pro 125 | # run: | 126 | # snow sql -q "CREATE OR REPLACE NOTEBOOK GH_ACTION_PRO_NB FROM '@"GH_ACTION"."PUBLIC"."SFLAB_DEMO_NB"/branches/main/Create and Manage Snowflake Objects like a Pro/' MAIN_FILE = 'Create and Manage Snowflake Objects like a Pro.ipynb' QUERY_WAREHOUSE = 'GH_ACTION_WH';" 127 | # snow sql -q "ALTER NOTEBOOK GH_ACTION_PRO_NB ADD LIVE VERSION FROM LAST;" 128 | # snow sql -q "EXECUTE NOTEBOOK GH_ACTION_PRO_NB();" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
161 | #.idea/ 162 | scripts/ 163 | -------------------------------------------------------------------------------- /Access External Endpoints/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - pytorch=2.2.0 6 | - transformers=4.37.2 -------------------------------------------------------------------------------- /Access External Endpoints/setup.sql: -------------------------------------------------------------------------------- 1 | -- Create the HuggingFace external access integration and the network rule it relies on. 2 | CREATE OR REPLACE NETWORK RULE hf_network_rule 3 | MODE = EGRESS 4 | TYPE = HOST_PORT 5 | VALUE_LIST = ('huggingface.co','cdn-lfs-us-1.huggingface.co'); 6 | 7 | CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION hf_access_integration 8 | ALLOWED_NETWORK_RULES = (hf_network_rule) 9 | ENABLED = true; 10 | 11 | -- Create the Github external access integration and the network rule it relies on. 12 | CREATE OR REPLACE NETWORK RULE gh_network_rule 13 | MODE = EGRESS 14 | TYPE = HOST_PORT 15 | VALUE_LIST = ('raw.githubusercontent.com', 'githubusercontent.com','github.com'); 16 | 17 | CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION gh_access_integration 18 | ALLOWED_NETWORK_RULES = (gh_network_rule) 19 | ENABLED = true; 20 | 21 | ALTER NOTEBOOK GH_ACTION_EAI_NB set EXTERNAL_ACCESS_INTEGRATIONS = (hf_access_integration, gh_access_integration); -------------------------------------------------------------------------------- /ArcGIS_Snowflake/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - pydeck=* -------------------------------------------------------------------------------- /Avalanche-Customer-Review-Analytics/Avalanche-Customer-Review-Analytics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | }, 7 | "lastEditStatus": { 8 | "notebookId": "2gfpag77rjklnaepw2qp", 9 | "authorId": "6841714608330", 10 | "authorName": "CHANINN", 11 | "authorEmail": "chanin.nantasenamat@snowflake.com", 12 | "sessionId": "fd937486-2fde-4160-99dc-ddfca8af4103", 13 | "lastEditTime": 1743707076161 14 | } 15 | }, 16 | "nbformat_minor": 5, 17 | "nbformat": 4, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "id": "3e3bdd35-2104-4280-a28f-e02cac177a85", 22 | "metadata": { 23 | "name": "md_title", 24 | "collapsed": false 25 | }, 26 | "source": "# Build a Customer Review Analytics Dashboard with Streamlit on Snowflake\n\nIn this notebook, we're performing data processing of the Avalanche customer review data. By the end of the tutorial, we'll have created a few data visualization to gain insights into the general sentiment of the products." 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "3fc8fa46-8a26-43e3-a2a9-381c89eae2a7", 31 | "metadata": { 32 | "name": "md_about", 33 | "collapsed": false 34 | }, 35 | "source": "## Avalanche data\n\nThe Avalanche data set is based on a hypothetical company that sells winter sports gear. Holistically, this data set is comprised of the product catalog, customer review, shipping logistics and order history.\n\nIn this particular notebook, we'll use only the customer review data. We'll start by uploading customer review data in DOCX format. 
Next, we'll parse and reshape the data into a semi-structured form. Particularly, we'll apply LLMs for language translation and text summarization along with sentiment analysis." 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "03e5be91-6497-450d-97c0-ca70199b8eef", 40 | "metadata": { 41 | "name": "md_data", 42 | "collapsed": false 43 | }, 44 | "source": "## Retrieve customer review data\n\nFirst, we're starting by querying and parsing the content from DOCX files that are stored on the `@avalanche_db.avalanche_schema.customer-reviews` stage." 45 | }, 46 | { 47 | "cell_type": "code", 48 | "id": "b45557a0-01b9-4775-9b97-28da754ec326", 49 | "metadata": { 50 | "language": "sql", 51 | "name": "sql1", 52 | "collapsed": false, 53 | "codeCollapsed": false 54 | }, 55 | "outputs": [], 56 | "source": "-- Parse content from DOCX files\nWITH files AS (\n SELECT \n REPLACE(REGEXP_SUBSTR(file_url, '[^/]+$'), '%2e', '.') as filename\n FROM DIRECTORY('@avalanche_db.avalanche_schema.customer_reviews')\n WHERE filename LIKE '%.docx'\n)\nSELECT \n filename,\n SNOWFLAKE.CORTEX.PARSE_DOCUMENT(\n @avalanche_db.avalanche_schema.customer_reviews,\n filename,\n {'mode': 'layout'}\n ):content AS layout\nFROM files;", 57 | "execution_count": null 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "796ba2b7-2d50-4d22-911d-db20912257f5", 62 | "metadata": { 63 | "name": "md_sql2", 64 | "collapsed": false 65 | }, 66 | "source": "## Data reshaping\n\nWe're reshaping the data to a more structured form by using regular expression to create additional columns from the customer review `LAYOUT` column." 67 | }, 68 | { 69 | "cell_type": "code", 70 | "id": "c6f47ba7-4c5a-46f1-a2eb-3533f4dcda05", 71 | "metadata": { 72 | "language": "sql", 73 | "name": "sql2", 74 | "codeCollapsed": false, 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": "-- Extract PRODUCT name, DATE, and CUSTOMER_REVIEW from the LAYOUT column\nSELECT \n filename,\n REGEXP_SUBSTR(layout, 'Product: (.*?) Date:', 1, 1, 'e') as product,\n REGEXP_SUBSTR(layout, 'Date: (202[0-9]-[0-9]{2}-[0-9]{2})', 1, 1, 'e') as date,\n REGEXP_SUBSTR(layout, '## Customer Review\\n([\\\\s\\\\S]*?)$', 1, 1, 'es') as customer_review\nFROM {{sql1}};", 79 | "execution_count": null 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "99f6b075-3d7c-4615-8414-86568a80ee20", 84 | "metadata": { 85 | "name": "md_sql3", 86 | "collapsed": false 87 | }, 88 | "source": "## Apply Cortex LLM on customer review data\n\nHere, we'll apply the Cortex LLM to perform the following 3 tasks:\n- Text translation is performed on foreign language text where they are translated to English.\n- Text summarization is performed on the translated text to obtain a more concise summary.\n- Sentiment score is calculated to give insights on whether the sentiment was positive or negative." 
89 | }, 90 | { 91 | "cell_type": "code", 92 | "id": "74be7b08-6122-4a98-b113-99ff874375e3", 93 | "metadata": { 94 | "language": "sql", 95 | "name": "sql3", 96 | "collapsed": false, 97 | "codeCollapsed": false 98 | }, 99 | "outputs": [], 100 | "source": "-- Perform translation, summarization and sentiment analysis on customer review\nSELECT \n product,\n date,\n SNOWFLAKE.CORTEX.TRANSLATE(customer_review, '', 'en') as translated_review,\n SNOWFLAKE.CORTEX.SUMMARIZE(translated_review) as summary,\n SNOWFLAKE.CORTEX.SENTIMENT(translated_review) as sentiment_score\nFROM {{sql2}}\nORDER BY date;", 101 | "execution_count": null 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "adaa0f32-5263-41ac-aa30-88cc75303d42", 106 | "metadata": { 107 | "name": "md_df", 108 | "collapsed": false 109 | }, 110 | "source": "## Convert SQL output to Pandas DataFrame\n\nHere, we'll convert the SQL output to a Pandas DataFrame by applying the `to_pandas()` method." 111 | }, 112 | { 113 | "cell_type": "code", 114 | "id": "b88d6ae3-0de9-42c1-b48a-f2ebc4d34255", 115 | "metadata": { 116 | "language": "python", 117 | "name": "df", 118 | "codeCollapsed": false, 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": "sql3.to_pandas()", 123 | "execution_count": null 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "a3a0334d-29df-494f-982f-3e1fcd916066", 128 | "metadata": { 129 | "name": "md_bar", 130 | "collapsed": false 131 | }, 132 | "source": "## Bar charts\n\nHere, we're creating some bar charts for the sentiment scores.\n\n### Daily sentiment scores\n\nNote: Positive values are shown in green while negative values in red." 133 | }, 134 | { 135 | "cell_type": "code", 136 | "id": "4cd85ca2-f005-4285-a633-744b12de2109", 137 | "metadata": { 138 | "language": "python", 139 | "name": "py_bar", 140 | "codeCollapsed": false, 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": "import streamlit as st\nimport altair as alt\nimport pandas as pd\n\n# Ensure SENTIMENT_SCORE is numeric\ndf['SENTIMENT_SCORE'] = pd.to_numeric(df['SENTIMENT_SCORE'])\n\n# Create the base chart with bars\nchart = alt.Chart(df).mark_bar(size=15).encode(\n x=alt.X('DATE:T',\n axis=alt.Axis(\n format='%Y-%m-%d', # YYYY-MM-DD format\n labelAngle=90) # Rotate labels 90 degrees\n ),\n y=alt.Y('SENTIMENT_SCORE:Q'),\n color=alt.condition(\n alt.datum.SENTIMENT_SCORE >= 0,\n alt.value('#2ecc71'), # green for positive\n alt.value('#e74c3c') # red for negative\n ),\n tooltip=['PRODUCT:N', 'DATE:T'] # Add tooltip\n).properties(\n height=500\n)\n\n# Display the chart\nst.altair_chart(chart, use_container_width=True)", 145 | "execution_count": null 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "32bcfa7b-c940-4615-94a2-373c199ede4f", 150 | "metadata": { 151 | "name": "md_bar_2", 152 | "collapsed": false 153 | }, 154 | "source": "### Product sentiment scores" 155 | }, 156 | { 157 | "cell_type": "code", 158 | "id": "74951343-25ef-41c7-825e-4d487dc676eb", 159 | "metadata": { 160 | "language": "python", 161 | "name": "py_product_sentiment", 162 | "codeCollapsed": false 163 | }, 164 | "outputs": [], 165 | "source": "import streamlit as st\nimport altair as alt\nimport pandas as pd\n\n# Create the base chart with aggregation by PRODUCT\nbars = alt.Chart(df).mark_bar(size=15).encode(\n y=alt.Y('PRODUCT:N', \n axis=alt.Axis(\n labelAngle=0, # Horizontal labels\n labelOverlap=False, # Prevent label overlap\n labelPadding=10 # Add some padding\n )\n ),\n x=alt.X('mean(SENTIMENT_SCORE):Q', # Aggregate mean 
sentiment score\n title='MEAN SENTIMENT_SCORE'),\n color=alt.condition(\n alt.datum.mean_SENTIMENT_SCORE >= 0,\n alt.value('#2ecc71'), # green for positive\n alt.value('#e74c3c') # red for negative\n ),\n tooltip=['PRODUCT:N', 'mean(SENTIMENT_SCORE):Q']\n).properties(\n height=400\n)\n\n# Display the chart\nst.altair_chart(bars, use_container_width=True)", 166 | "execution_count": null 167 | }, 168 | { 169 | "cell_type": "code", 170 | "id": "d430287f-867c-484a-8e09-d9d29ca9ef3f", 171 | "metadata": { 172 | "language": "python", 173 | "name": "py_download", 174 | "codeCollapsed": false 175 | }, 176 | "outputs": [], 177 | "source": "# Download button for the CSV file\nst.subheader('Processed Customer Reviews Data')\nst.download_button(\n label=\"Download CSV\",\n data=df[['PRODUCT', 'DATE', 'SUMMARY', 'SENTIMENT_SCORE']].to_csv(index=False).encode('utf-8'),\n mime=\"text/csv\"\n)", 178 | "execution_count": null 179 | }, 180 | { 181 | "cell_type": "code", 182 | "id": "597a05b3-0ead-4fb0-a821-d02ce6802b47", 183 | "metadata": { 184 | "language": "sql", 185 | "name": "cell1" 186 | }, 187 | "outputs": [], 188 | "source": "", 189 | "execution_count": null 190 | } 191 | ] 192 | } 193 | -------------------------------------------------------------------------------- /Avalanche-Customer-Review-Analytics/customer_reviews_docx.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Avalanche-Customer-Review-Analytics/customer_reviews_docx.zip -------------------------------------------------------------------------------- /Avalanche-Customer-Review-Analytics/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - snowflake.core=* 6 | -------------------------------------------------------------------------------- /Avalanche-Customer-Review-Analytics/setup.sql: -------------------------------------------------------------------------------- 1 | -- STEP 1 2 | -- Create the avalanche database and schema 3 | CREATE DATABASE IF NOT EXISTS avalanche_db; 4 | CREATE SCHEMA IF NOT EXISTS avalanche_schema; 5 | 6 | -- STEP 2 7 | -- Option 1: Manual upload to Stage 8 | -- Create the stage for storing our files 9 | -- Uncomment code block below for this option: 10 | -- 11 | CREATE STAGE IF NOT EXISTS avalanche_db.avalanche_schema.customer_reviews 12 | ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE') 13 | DIRECTORY = (ENABLE = true); 14 | -- 15 | -- Now go and upload files to the stage. 
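-- For example (illustrative sketch only; the local path below is a placeholder), the DOCX
-- files can be uploaded with a PUT command from SnowSQL or the Snowflake CLI, since PUT
-- does not run inside a Snowsight worksheet; alternatively, drag and drop the files onto
-- the stage in Snowsight. AUTO_COMPRESS = FALSE keeps the files uncompressed on the stage,
-- which is what the parsing step later in this project expects.
-- PUT file:///path/to/customer_reviews/*.docx @avalanche_db.avalanche_schema.customer_reviews AUTO_COMPRESS = FALSE;
-- If the directory table does not list the newly uploaded files, refresh it manually:
-- ALTER STAGE avalanche_db.avalanche_schema.customer_reviews REFRESH;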
16 | -- Once you've done that proceed to the next step 17 | 18 | -- Option 2: Push files to Stage from S3 19 | -- Uncomment lines below to use: 20 | -- 21 | -- Create the stage for storing our files 22 | -- CREATE OR REPLACE STAGE customer_reviews 23 | -- URL = 's3://sfquickstarts/misc/customer_reviews/' 24 | -- DIRECTORY = (ENABLE = TRUE AUTO_REFRESH = TRUE); 25 | 26 | 27 | -- STEP 3 28 | -- List the contents of the newly created stage 29 | ls @avalanche_db.avalanche_schema.customer_reviews; 30 | 31 | 32 | -- STEP 4 33 | -- USAGE 34 | -- 35 | -- Read single file 36 | -- Uncomment lines below to use: 37 | -- 38 | -- SELECT 39 | -- SNOWFLAKE.CORTEX.PARSE_DOCUMENT( 40 | -- @avalanche_db.avalanche_schema.customer_reviews, 41 | -- 'review-01.docx', 42 | -- {'mode': 'layout'} 43 | -- ) AS layout; 44 | 45 | -- Read multiple files into a table 46 | -- Uncomment lines below to use: 47 | -- 48 | -- WITH files AS ( 49 | -- SELECT 50 | -- REPLACE(REGEXP_SUBSTR(file_url, '[^/]+$'), '%2e', '.') as filename 51 | -- FROM DIRECTORY('@avalanche_db.avalanche_schema.customer_reviews') 52 | -- WHERE filename LIKE '%.docx' 53 | -- ) 54 | -- SELECT 55 | -- filename, 56 | -- SNOWFLAKE.CORTEX.PARSE_DOCUMENT( 57 | -- @avalanche_db.avalanche_schema.customer_reviews, 58 | -- filename, 59 | -- {'mode': 'layout'} 60 | -- ):content AS layout 61 | -- FROM files; 62 | -------------------------------------------------------------------------------- /Bioinformatics_Solubility_Dashboard/Bioinformatics_Solubility_Dashboard.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | }, 7 | "lastEditStatus": { 8 | "notebookId": "7rpm6lxftnqo2r7bqwsp", 9 | "authorId": "6841714608330", 10 | "authorName": "CHANINN", 11 | "authorEmail": "chanin.nantasenamat@snowflake.com", 12 | "sessionId": "6c69bcea-e09a-4f87-a91d-99ff6aecc8bf", 13 | "lastEditTime": 1741649071648 14 | } 15 | }, 16 | "nbformat_minor": 5, 17 | "nbformat": 4, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "id": "407331eb-29af-42a3-976c-43e3652cd685", 22 | "metadata": { 23 | "name": "md_title", 24 | "collapsed": false 25 | }, 26 | "source": "# Build a Bioinformatics Solubility Dashboard in Snowflake\n\nIn this notebook, you'll build a **bioinformatics project** from scratch in Snowflake. \n\nBriefly, we're using the *Delaney* solubility data set. Solubility is an important property for successful drug discovery efforts and is amongst one of the key metrics used in defining drug-like molecules according to the Lipinski Rule of 5.\n\nIn a nutshell, here's what you're building:\n- Load data into Snowflake\n- Perform data preparation using Pandas\n- Build a simple dashboard with Streamlit\n" 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "121d2db7-d366-4363-a464-fadf2ffbb1dc", 31 | "metadata": { 32 | "name": "md_solubility", 33 | "collapsed": false 34 | }, 35 | "source": "## About molecular solubility\n\nMolecular solubility is a crucial property in drug development that affects whether a drug can reach its target in the human body. Let me explain why it matters in simple terms.\n\n### Solubility\nSolubility is a molecule's ability to dissolve in a liquid, which literally means the ability to dissolve in human bloodstream and transport to its desired target in the human body. 
If it can dissolve, it can't work!\n\nPoorly soluble drugs might require higher doses or special formulations, leading to potential side effects or complicated treatment regimens. So we want drugs that are both effective and yet soluble so that fewer of it is required so as to minimize potential side effects.\n\n### Lipinski's Rule of 5\nDrug development often refer to a guidelines known as the Lipinski's Rule of 5 to predict whether a molecule will be soluble enough to make a good oral drug. This includes factors like:\n- Molecule's size\n- How water-loving or water-repelling it is\n- Number of hydrogen bond donors and acceptors\n\nUnderstanding and optimizing solubility helps pharmaceutical companies develop effective medicines that can be easily administered and work efficiently in the body." 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "3a2a4205-5392-4730-8495-93fea5c1602f", 40 | "metadata": { 41 | "name": "md_data", 42 | "collapsed": false 43 | }, 44 | "source": "## Load data\n\nHere, we're loading the Delaney data set ([reference](https://pubs.acs.org/doi/10.1021/ci034243x))." 45 | }, 46 | { 47 | "cell_type": "code", 48 | "id": "92528066-a158-4733-8747-a2915c832c58", 49 | "metadata": { 50 | "language": "sql", 51 | "name": "sql_data" 52 | }, 53 | "outputs": [], 54 | "source": "SELECT * FROM CHANINN_DEMO_DATA.PUBLIC.SOLUBILITY", 55 | "execution_count": null 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "32b8bb10-45e2-4c81-8953-b4af097fe619", 60 | "metadata": { 61 | "name": "md_to_pandas", 62 | "collapsed": false 63 | }, 64 | "source": "## Convert SQL output to Pandas DataFrame\n\nWe're using `to_pandas()` method to convert our SQL output table to a Pandas DataFrame." 65 | }, 66 | { 67 | "cell_type": "code", 68 | "id": "24aef3fd-6815-4874-a712-d7ab940660f7", 69 | "metadata": { 70 | "language": "python", 71 | "name": "df", 72 | "codeCollapsed": false 73 | }, 74 | "outputs": [], 75 | "source": "sql_data.to_pandas()", 76 | "execution_count": null 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "126ab616-c4bc-484a-9d44-833b0bf26143", 81 | "metadata": { 82 | "name": "md_class", 83 | "collapsed": false 84 | }, 85 | "source": "## Data Aggregation\n\nHere, we're aggregating the data (grouping it) by its molecular weight:\n- `small` if <300\n- `large` if >= 300" 86 | }, 87 | { 88 | "cell_type": "code", 89 | "id": "ab0fb5ec-3cf1-45d6-872c-d92691cb9d9d", 90 | "metadata": { 91 | "language": "python", 92 | "name": "py_class", 93 | "codeCollapsed": false 94 | }, 95 | "outputs": [], 96 | "source": "df['MOLWT_CLASS'] = pd.Series(['small' if x < 300 else 'large' for x in df['MOLWT']])\ndf_class = df.groupby('MOLWT_CLASS').mean().reset_index()\ndf_class", 97 | "execution_count": null 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "dd9543d3-31b7-4c54-9bde-530c42e36a90", 102 | "metadata": { 103 | "name": "md_app", 104 | "collapsed": false 105 | }, 106 | "source": "## Building the Solubility Dashboard" 107 | }, 108 | { 109 | "cell_type": "code", 110 | "id": "89a6c1ff-71e9-4c2f-be2b-6d14879ddd00", 111 | "metadata": { 112 | "language": "python", 113 | "name": "py_app", 114 | "codeCollapsed": false 115 | }, 116 | "outputs": [], 117 | "source": "import streamlit as st\n\nst.title('☘️ Solubility Dashboard')\n\n# Data Filtering\nmol_size = st.slider('Select a value', 100, 500, 300)\ndf['MOLWT_CLASS'] = pd.Series(['small' if x < mol_size else 'large' for x in df['MOLWT']])\ndf_class = df.groupby('MOLWT_CLASS').mean().reset_index()\n\nst.divider()\n\n# Calculate 
Metrics\nmolwt_large = round(df_class['MOLWT'][0], 2)\nmolwt_small = round(df_class['MOLWT'][1], 2)\nnumrotatablebonds_large = round(df_class['NUMROTATABLEBONDS'][0], 2)\nnumrotatablebonds_small = round(df_class['NUMROTATABLEBONDS'][1], 2)\nmollogp_large = round(df_class['MOLLOGP'][0], 2)\nmollogp_small = round(df_class['MOLLOGP'][1], 2)\naromaticproportion_large = round(df_class['AROMATICPROPORTION'][0], 2)\naromaticproportion_small = round(df_class['AROMATICPROPORTION'][1], 2)\n\n# Data metrics and visualizations\ncol = st.columns(2)\nwith col[0]:\n st.subheader('Molecular Weight')\n st.metric('Large', molwt_large)\n st.metric('Small', molwt_small)\n st.bar_chart(df_class, x='MOLWT_CLASS', y='MOLWT', color='MOLWT_CLASS')\n\n st.subheader('Number of Rotatable Bonds')\n st.metric('Large', numrotatablebonds_large)\n st.metric('Small', numrotatablebonds_small)\n st.bar_chart(df_class, x='MOLWT_CLASS', y='NUMROTATABLEBONDS', color='MOLWT_CLASS')\nwith col[1]:\n st.subheader('Molecular LogP')\n st.metric('Large', mollogp_large)\n st.metric('Small', mollogp_small)\n st.bar_chart(df_class, x='MOLWT_CLASS', y='MOLLOGP', color='MOLWT_CLASS')\n\n st.subheader('Aromatic Proportion')\n st.metric('Large', mollogp_large)\n st.metric('Small', mollogp_small)\n st.bar_chart(df_class, x='MOLWT_CLASS', y='AROMATICPROPORTION', color='MOLWT_CLASS')\n\nwith st.expander('Show Original DataFrame'):\n st.dataframe(df)\nwith st.expander('Show Aggregated DataFrame'):\n st.dataframe(df_class)", 118 | "execution_count": null 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "id": "81a409e7-7219-4c20-9276-f3b27e0b8ea4", 123 | "metadata": { 124 | "name": "md_reference", 125 | "collapsed": false 126 | }, 127 | "source": "## References\n\n- [ESOL:  Estimating Aqueous Solubility Directly from Molecular Structure](https://pubs.acs.org/doi/10.1021/ci034243x)\n- [st.bar_chart](https://docs.streamlit.io/develop/api-reference/charts/st.bar_chart)\n- [st.expander](https://docs.streamlit.io/develop/api-reference/layout/st.expander)\n- [st.slider](https://docs.streamlit.io/develop/api-reference/widgets/st.slider)" 128 | } 129 | ] 130 | } -------------------------------------------------------------------------------- /Bioinformatics_Solubility_Dashboard/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - pandas=* 6 | -------------------------------------------------------------------------------- /Build and Optimize Machine Learning Models with Streamlit/Build_and_Optimize_Machine_Learning_Models_with_Streamlit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "2ca12abe-9d90-46c7-a40b-3631fe7e7665", 14 | "metadata": { 15 | "name": "md_title", 16 | "collapsed": false 17 | }, 18 | "source": "# Build and Optimize a Machine Learning Models in Snowflake Notebooks with Streamlit\n\nIn this notebook, we'll build and optimize machine learning models. 
We'll also sprinkle in UI interactivity with Streamlit widgets to allow users to experiment and play with the parameters and settings.\n\n## Libraries used\n- `streamlit` - build the frontend UI\n- `pandas` - handle and wrangle data\n- `numpy` - numerical computing\n- `scikit-learn` - build machine learning models\n- `altair` - data visualization\n\n## Protocol\nHere's a breakdown of what we'll be doing:\n1. Load and prepare a dataset for modeling.\n2. Perform grid search hyperparameter optimization using the radial basis function (RBF) kernel with the support vector machine (SVM) algorithm.\n3. Visualize the hyperparameter optimization via a heatmap and line chart.\n" 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "cc43846f-0d71-40d4-9c6c-ebd7e81e4db4", 23 | "metadata": { 24 | "name": "cell1", 25 | "collapsed": false 26 | }, 27 | "source": "## Build the ML Hyperparameter Optimization App using Streamlit" 28 | }, 29 | { 30 | "cell_type": "code", 31 | "id": "59bf3b1e-92f9-4a24-919a-b7ea11f164b6", 32 | "metadata": { 33 | "language": "python", 34 | "name": "py_app", 35 | "codeCollapsed": false, 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": "import streamlit as st\nimport pandas as pd\nimport numpy as np\nimport altair as alt\nfrom sklearn.model_selection import train_test_split, GridSearchCV\nfrom sklearn.svm import SVC\nfrom sklearn.datasets import load_wine\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.preprocessing import StandardScaler\n\nst.title('ML Hyperparameter Optimization')\n\n# Load wine dataset\ndataset = load_wine()\nX = dataset.data\ny = dataset.target\nfeature_names = dataset.feature_names\n\n# Create DataFrame\ndf = pd.DataFrame(X, columns=feature_names)\ndf['target'] = y\n\n# Display dataset info using metrics\nst.header('📖 Dataset Information')\ncol1, col2, col3 = st.columns(3)\nwith col1:\n st.metric(\"Number of features\", len(feature_names))\nwith col2:\n st.metric(\"Number of classes\", len(dataset.target_names))\nwith col3:\n st.metric(\"Number of samples\", len(y))\n\n# Display class names\nformatted_classes = \", \".join([f\"`{i+1}`\" for i in range(len(dataset.target_names))])\nst.write(f\"Classes: {formatted_classes}\")\n\n# Display sample of the data\nwith st.expander(\"👀 See the dataset\"):\n st.write(df.head())\n\n# Model hyperparameters using powers of 2\nst.header('⚙️ Hyperparameters')\n\n# Parameter range selection\nst.subheader(\"Parameter Ranges (in powers of 2)\")\ncol1, col2 = st.columns(2)\n\n# Create list of powers of 2\npowers = list(range(-10, 11, 2))\n\nwith col1:\n C_power_range = st.select_slider(\n 'C (Regularization) range - powers of 2',\n options=powers,\n value=(-4, 4),\n help='C = 2^value'\n )\n st.info(f'''\n C range: $2^{{{C_power_range[0]}}}$ to $2^{{{C_power_range[1]}}}$\n \n {2**C_power_range[0]:.6f} to {2**C_power_range[1]:.6f}\n ''')\n\nwith col2:\n gamma_power_range = st.select_slider(\n 'γ range - powers of 2',\n options=powers,\n value=(-4, 4),\n help='gamma = 2^value'\n )\n st.info(f'''\n γ range: $2^{{{gamma_power_range[0]}}}$ to $2^{{{gamma_power_range[1]}}}$\n \n {2**gamma_power_range[0]:.6f} to {2**gamma_power_range[1]:.6f}\n ''')\n\n# Step size selection\nst.subheader(\"Step Size for Grid Search\")\ncol1, col2, col3 = st.columns(3)\n\nwith col1:\n C_step = st.slider('C step size', 0.1, 2.0, 0.5, 0.1)\nwith col2:\n gamma_step = st.slider('Gamma step size', 0.1, 2.0, 0.5, 0.1)\nwith col3:\n test_size = st.slider('Test size', 0.1, 0.5, 0.2)\n\nst.divider()\n\n# Split and scale 
data\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)\n\n# Scale the features\nscaler = StandardScaler()\nX_train_scaled = scaler.fit_transform(X_train)\nX_test_scaled = scaler.transform(X_test)\n\n# Create parameter grid using powers of 2 with specified step sizes\ndef create_param_range(start_power, end_power, step):\n powers = np.arange(start_power, end_power + step, step)\n return np.power(2, powers)\n\nC_range = create_param_range(C_power_range[0], C_power_range[1], C_step)\ngamma_range = create_param_range(gamma_power_range[0], gamma_power_range[1], gamma_step)\n\n# Train model with GridSearchCV\nparam_grid = {\n 'C': C_range,\n 'gamma': gamma_range\n}\n\nsvm = SVC(kernel='rbf', random_state=42)\ngrid = GridSearchCV(svm, param_grid, cv=5)\ngrid.fit(X_train_scaled, y_train)\n\n# Results\ny_pred = grid.predict(X_test_scaled)\naccuracy = accuracy_score(y_test, y_pred)\n\n# Display metrics in columns\nmetrics1, metrics2, metrics3 = st.columns(3)\nwith metrics1:\n st.header('Model Performance')\n st.metric(\"Accuracy\", f\"{accuracy:.2f}\")\nwith metrics2:\n best_C_power = np.log2(grid.best_params_['C'])\n st.header('Best Parameters')\n st.write(\"C\")\n st.write(f\"$2^{{{best_C_power:.1f}}}$ = {grid.best_params_['C']:.6f}\")\n st.write(f\"\")\nwith metrics3:\n best_gamma_power = np.log2(grid.best_params_['gamma'])\n st.header('󠀠󠀠‎')\n st.write(\"γ\")\n st.write(f\"$2^{{{best_gamma_power:.1f}}}$ = {grid.best_params_['gamma']:.6f}\")\n\n# Create visualization data with means and standard deviations\nresults = pd.DataFrame(grid.cv_results_)\nparam_results = pd.DataFrame({\n 'C': np.log2(results['param_C']),\n 'gamma': np.log2(results['param_gamma']),\n 'score': results['mean_test_score']\n})\n\n# Calculate means and standard errors for C\nC_stats = param_results.groupby('C').agg({\n 'score': ['mean', 'std', 'count']\n}).reset_index()\nC_stats.columns = ['C', 'mean_score', 'std_score', 'count']\nC_stats['stderr'] = C_stats['std_score'] / np.sqrt(C_stats['count'])\nC_stats['ci_upper'] = C_stats['mean_score'] + (2 * C_stats['stderr'])\nC_stats['ci_lower'] = C_stats['mean_score'] - (2 * C_stats['stderr'])\n\n# Calculate means and standard errors for gamma\ngamma_stats = param_results.groupby('gamma').agg({\n 'score': ['mean', 'std', 'count']\n}).reset_index()\ngamma_stats.columns = ['gamma', 'mean_score', 'std_score', 'count']\ngamma_stats['stderr'] = gamma_stats['std_score'] / np.sqrt(gamma_stats['count'])\ngamma_stats['ci_upper'] = gamma_stats['mean_score'] + (2 * gamma_stats['stderr'])\ngamma_stats['ci_lower'] = gamma_stats['mean_score'] - (2 * gamma_stats['stderr'])\n\n# Create heatmap\nst.header(\"Hyperparameter optimization\")\ncolor_schemes = ['yellowgreenblue', 'spectral', 'viridis', 'inferno', 'magma', 'plasma', 'turbo', 'greenblue', 'blues', 'reds', 'greens', 'purples', 'oranges']\nselected_color = st.selectbox('Select heatmap color scheme:', color_schemes)\n\n# Create heatmap with grid lines and selected color scheme\nheatmap = alt.Chart(param_results).mark_rect().encode(\n x=alt.X('C:Q', \n title='C parameter', \n scale=alt.Scale(domain=[C_power_range[0], C_power_range[1]]),\n axis=alt.Axis(grid=True, gridDash=[5,5])),\n y=alt.Y('gamma:Q', \n title='γ parameter', \n scale=alt.Scale(domain=[gamma_power_range[0], gamma_power_range[1]]),\n axis=alt.Axis(grid=True, gridDash=[5,5])),\n color=alt.Color('score:Q', \n title='Cross-validation Score',\n scale=alt.Scale(scheme=selected_color)),\n tooltip=['C', 'gamma', alt.Tooltip('score:Q', 
format='.3f')]\n).transform_window(\n row_number='row_number()'\n).transform_fold(['score']\n).properties(\n width=900,\n height=300,\n)\n\n# Add grid lines as a separate layer\ngrid = alt.Chart(param_results).mark_rule(color='darkgray', strokeOpacity=0.2).encode(\n x='C:Q'\n).properties(\n width=900,\n height=300\n) + alt.Chart(param_results).mark_rule(color='darkgray', strokeOpacity=0.2).encode(\n y='gamma:Q'\n).properties(\n width=900,\n height=300\n)\n\n# Combine heatmap and grid\nfinal_heatmap = (heatmap + grid)\nst.altair_chart(final_heatmap)\n\n# Define common Y axis title\ny_axis_title = 'Cross-validation Score'\n\n# Create C parameter plot with error bands\nc_line_base = alt.Chart(C_stats)\n\nc_line = c_line_base.mark_line().encode(\n x=alt.X('C:Q', title='C parameter', \n scale=alt.Scale(domain=[C_power_range[0], C_power_range[1]])),\n y=alt.Y('mean_score:Q', title=y_axis_title, scale=alt.Scale(zero=False))\n)\n\nc_points = c_line_base.mark_point(size=50).encode(\n x='C:Q',\n y=alt.Y('mean_score:Q', title=y_axis_title),\n tooltip=[\n alt.Tooltip('C:Q', title='C', format='.1f'),\n alt.Tooltip('mean_score:Q', title='Mean Score', format='.3f'),\n alt.Tooltip('std_score:Q', title='Std Dev', format='.3f')\n ]\n)\n\nc_errorbars = c_line_base.mark_errorbar().encode(\n x='C:Q',\n y=alt.Y('ci_lower:Q', title=y_axis_title),\n y2='ci_upper:Q'\n)\n\nc_band = c_line_base.mark_area(opacity=0.3).encode(\n x='C:Q',\n y=alt.Y('ci_lower:Q', title=y_axis_title),\n y2='ci_upper:Q'\n)\n\nc_plot = (c_band + c_line + c_errorbars + c_points).properties(\n width=400,\n height=300,\n)\n\n# Create gamma parameter plot with error bands\ngamma_line_base = alt.Chart(gamma_stats)\n\ngamma_line = gamma_line_base.mark_line().encode(\n x=alt.X('gamma:Q', title='γ parameter', \n scale=alt.Scale(domain=[gamma_power_range[0], gamma_power_range[1]])),\n y=alt.Y('mean_score:Q', title=y_axis_title, scale=alt.Scale(zero=False))\n)\n\ngamma_points = gamma_line_base.mark_point(size=50).encode(\n x='gamma:Q',\n y=alt.Y('mean_score:Q', title=y_axis_title),\n tooltip=[\n alt.Tooltip('gamma:Q', title='Gamma', format='.1f'),\n alt.Tooltip('mean_score:Q', title='Mean Score', format='.3f'),\n alt.Tooltip('std_score:Q', title='Std Dev', format='.3f')\n ]\n)\n\ngamma_errorbars = gamma_line_base.mark_errorbar().encode(\n x='gamma:Q',\n y=alt.Y('ci_lower:Q', title=y_axis_title),\n y2='ci_upper:Q'\n)\n\ngamma_band = gamma_line_base.mark_area(opacity=0.3).encode(\n x='gamma:Q',\n y=alt.Y('ci_lower:Q', title=y_axis_title),\n y2='ci_upper:Q'\n)\n\ngamma_plot = (gamma_band + gamma_line + gamma_errorbars + gamma_points).properties(\n width=400,\n height=300,\n)\n\ncol = st.columns(2)\nwith col[0]:\n st.altair_chart(c_plot)\nwith col[1]:\n st.altair_chart(gamma_plot)", 40 | "execution_count": null 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "6e59b550-b740-4c15-a23e-a510b85762ce", 45 | "metadata": { 46 | "name": "cell2", 47 | "collapsed": false 48 | }, 49 | "source": "## Resources\n\n- An overview of [Snowflake Notebooks](https://www.snowflake.com/en/data-cloud/notebooks/) and its capabilities.\n- About [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks) in the [Snowflake Documentation](https://docs.snowflake.com/).\n- Further information on the use of Streamlit can be found at the [Streamlit Docs](https://docs.streamlit.io/)." 
50 | } 51 | ] 52 | } -------------------------------------------------------------------------------- /Build and Optimize Machine Learning Models with Streamlit/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - altair=* 6 | - numpy=* 7 | - pandas=* 8 | - scikit-learn=* 9 | -------------------------------------------------------------------------------- /Creating Snowflake Object using Python API/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - snowflake=0.8.0 6 | -------------------------------------------------------------------------------- /Dashboard_with_Streamlit/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - numpy=* 6 | - pandas=* 7 | -------------------------------------------------------------------------------- /Data_Analysis_with_LLM_RAG/Data_Analysis_with_LLM_RAG.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | }, 7 | "lastEditStatus": { 8 | "notebookId": "7vfpxlcc5brsm6magpsd", 9 | "authorId": "6841714608330", 10 | "authorName": "CHANINN", 11 | "authorEmail": "chanin.nantasenamat@snowflake.com", 12 | "sessionId": "248cc86f-5bc6-4821-99fc-2eb76b036f89", 13 | "lastEditTime": 1739213397874 14 | } 15 | }, 16 | "nbformat_minor": 5, 17 | "nbformat": 4, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "id": "414e046d-9d1c-4919-9914-a9ca160084b3", 22 | "metadata": { 23 | "name": "md_title", 24 | "collapsed": false 25 | }, 26 | "source": "# Data Analysis with LLM RAG in Snowflake Notebooks\n\nA notebook that answer questions about data via the use of an LLM reasoning model namely the DeepSeek-R1.\n\nHere's what we're implementing to investigate the tables:\n1. Retrieve penguins data\n2. Convert table to a DataFrame\n3. Create a text box for accepting user input\n4. Generate LLM response to answer questions about the data" 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "d069b3b5-7abe-4a46-a359-9b321ee539d8", 31 | "metadata": { 32 | "name": "md_retrieve_data", 33 | "collapsed": false 34 | }, 35 | "source": "## 1. Retrieve penguins data\n\nWe'll start by performing a simple SQL query to retrieve the penguins data." 36 | }, 37 | { 38 | "cell_type": "code", 39 | "id": "8d50cbf4-0c8d-4950-86cb-114990437ac9", 40 | "metadata": { 41 | "language": "sql", 42 | "name": "sql_output", 43 | "codeCollapsed": false, 44 | "collapsed": false 45 | }, 46 | "source": "SELECT * FROM CHANINN_DEMO_DATA.PUBLIC.PENGUINS", 47 | "execution_count": null, 48 | "outputs": [] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "40ea697a-bca6-400b-b1c4-0a1eb90948b6", 53 | "metadata": { 54 | "name": "md_dataframe", 55 | "collapsed": false 56 | }, 57 | "source": "## 2. Convert table to a DataFrame\n\nNext, we'll convert the table to a Pandas DataFrame." 
58 | }, 59 | { 60 | "cell_type": "code", 61 | "id": "115fa0b9-4adb-413f-ad7c-34037e9f341d", 62 | "metadata": { 63 | "language": "python", 64 | "name": "df", 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": "sql_output.to_pandas()", 69 | "execution_count": null 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "1ef20081-c6f2-4e3e-8191-e9477e356a4c", 74 | "metadata": { 75 | "name": "md_helper", 76 | "collapsed": false 77 | }, 78 | "source": "## 3. Create helper functions\n\nHere, we'll create several helper functions that will be used in the forthcoming app that we're developing.\n1. `generate_deepseek_response()` - accepts user-provided `prompt` as input query model. Briefly, the input box allow users to ask questions about data and that will be assigned to the `prompt` variable." 79 | }, 80 | { 81 | "cell_type": "code", 82 | "id": "c695373e-ac74-4b62-a1f1-08206cbd5c81", 83 | "metadata": { 84 | "language": "python", 85 | "name": "py_helper", 86 | "codeCollapsed": false, 87 | "collapsed": false 88 | }, 89 | "source": "# Helper function\ndef generate_deepseek_response(prompt):\n cortex_prompt = f\"'[INST] {prompt} [/INST]'\"\n prompt_data = [{'role': 'user', 'content': cortex_prompt}]\n prompt_json = escape_sql_string(json.dumps(prompt_data))\n response = session.sql(\n \"select snowflake.cortex.complete(?, ?)\", \n params=['deepseek-r1', prompt_json]\n ).collect()[0][0]\n \n return response\n\ndef extract_think_content(response):\n think_pattern = r'(.*?)'\n think_match = re.search(think_pattern, response, re.DOTALL)\n \n if think_match:\n think_content = think_match.group(1).strip()\n main_response = re.sub(think_pattern, '', response, flags=re.DOTALL).strip()\n return think_content, main_response\n return None, response\n\ndef escape_sql_string(s):\n return s.replace(\"'\", \"''\")", 90 | "execution_count": null, 91 | "outputs": [] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "d2e6771a-80c6-474c-ac2d-46ada30dbb5d", 96 | "metadata": { 97 | "name": "md_app", 98 | "collapsed": false 99 | }, 100 | "source": "## Create the Asking about Penguins app\n\nNow that we have the data and helper functions ready, let's wrap up by creating the app.\n\n" 101 | }, 102 | { 103 | "cell_type": "code", 104 | "id": "8b8bcc88-fcb1-4abc-ad40-91a42fca5314", 105 | "metadata": { 106 | "language": "python", 107 | "name": "py_app", 108 | "collapsed": false, 109 | "codeCollapsed": false 110 | }, 111 | "outputs": [], 112 | "source": "import streamlit as st\nfrom snowflake.snowpark.context import get_active_session\nimport json\nimport pandas as pd\nimport re\n\n# Write directly to the app\nst.title(\"🐧 Ask about Penguins\")\n\n# Get the current credentials\nsession = get_active_session()\n\n# df = sql_output.to_pandas()\n\nuser_queries = [\"Which penguins has the longest bill length?\",\n \"Where do the heaviest penguins live?\",\n \"Which penguins has the shortest flippers?\"]\n\nquestion = st.selectbox(\"What would you like to know?\", user_queries)\n# question = st.text_input(\"Ask a question\", user_queries[0])\n\nprompt = [\n {\n 'role': 'system',\n 'content': 'You are a helpful assistant that uses provided data to answer natural language questions.'\n },\n {\n 'role': 'user',\n 'content': (\n f'The user has asked a question: {question}. 
'\n f'Please use this data to answer the question: {df.to_markdown(index=False)}'\n )\n },\n {\n 'temperature': 0.7,\n 'max_tokens': 1000,\n 'guardrails': True\n }\n]\n\ndf\n\nif st.button(\"Submit\"):\n status_container = st.status(\"Thinking ...\", expanded=True)\n with status_container:\n response = generate_deepseek_response(prompt)\n think_content, main_response = extract_think_content(response)\n if think_content:\n st.write(think_content)\n \n status_container.update(label=\"Thoughts\", state=\"complete\", expanded=False)\n st.markdown(main_response)", 113 | "execution_count": null 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "c6e6119e-3a35-4c28-ac37-26f71d24e62b", 118 | "metadata": { 119 | "name": "md_resources", 120 | "collapsed": false 121 | }, 122 | "source": "## Want to learn more?\n\n- More about [palmerpenguins](https://allisonhorst.github.io/palmerpenguins/) data set.\n- More about [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake)\n- For more inspiration on how to use Streamlit widgets in Notebooks, check out [Streamlit Docs](https://docs.streamlit.io/) and this list of what is currently supported inside [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake#label-notebooks-streamlit-support)" 123 | } 124 | ] 125 | } 126 | -------------------------------------------------------------------------------- /Data_Analysis_with_LLM_RAG/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - tabulate=* 6 | -------------------------------------------------------------------------------- /End-to-End Machine Learning with Snowpark ML/1_sf_nb_snowpark_ml_data_ingest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## This repo has been moved\n", 8 | "\n", 9 | "Visit [this Github repo](https://github.com/Snowflake-Labs/sfguide-intro-to-machine-learning-with-snowflake-ml-for-python) to see the full quickstart source code." 
10 | ] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 3 (ipykernel)", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.11.5" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 4 34 | } 35 | -------------------------------------------------------------------------------- /End-to-End Machine Learning with Snowpark ML/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - matplotlib=3.7.2 6 | - seaborn=0.12.2 7 | - snowflake-ml-python=1.3.1 8 | -------------------------------------------------------------------------------- /Feature Store Quickstart/Feature Store Quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8cc93dd0", 6 | "metadata": {}, 7 | "source": [ 8 | "## This repo has been moved\n", 9 | "\n", 10 | "Visit [this Github repo](https://github.com/Snowflake-Labs/sfguide-intro-to-feature-store-using-snowflake-notebooks) to see the full quickstart source code." 11 | ] 12 | } 13 | ], 14 | "metadata": { 15 | "kernelspec": { 16 | "display_name": "Python 3 (ipykernel)", 17 | "language": "python", 18 | "name": "python3" 19 | }, 20 | "language_info": { 21 | "codemirror_mode": { 22 | "name": "ipython", 23 | "version": 3 24 | }, 25 | "file_extension": ".py", 26 | "mimetype": "text/x-python", 27 | "name": "python", 28 | "nbconvert_exporter": "python", 29 | "pygments_lexer": "ipython3", 30 | "version": "3.8.19" 31 | } 32 | }, 33 | "nbformat": 4, 34 | "nbformat_minor": 5 35 | } 36 | -------------------------------------------------------------------------------- /Fine tuning LLM using Snowflake Cortex AI/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - snowflake=0.8.0 6 | - streamlit=1.26.0 7 | -------------------------------------------------------------------------------- /Getting Started with Container Runtimes/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Notebooks on Container Runtimes 2 | 3 | This example notebooks demonstrates how to get started using Snowflake's Container Runtime for Notebooks. It includes the setup, configuration, and execution of a straightforward machine learning training job. 4 | 5 | For more info about the Notebooks Container runtime, check out: 6 | - The [public documentation](https://docs.snowflake.com/LIMITEDACCESS/snowsight-notebooks/ui-snowsight-notebooks-runtime) 7 | - An [overview presentation](https://docs.google.com/presentation/d/1pModfkpZuoAsKiYAYfpcO50PxaFB460VCO6qeM_S66s/edit#slide=id.g293d2d1b46a_1_87) of the notebooks container runtime 8 | 9 | ## Setup 10 | 11 | **Note: as of July 15, 2024, the Notebooks Container Runtime is in Private Preview.** Make sure that your account is enabled for this Private Preview if you intend to run this example. 
12 | 13 | ### Step 1 - SQL Setup Script 14 | 15 | Run the following SQL code in a Snowflake SQL Worksheet to create the database objects, roles, privileges, and compute pools needed to run this example notebook. 16 | 17 | The setup script is to be run by ACCOUNTADMIN. However, a different role needs to be used to create and author notebooks. This role cannot be ACCOUNTADMIN, SECURITYADMIN, or ORGADMIN. In the example, we’re granting privileges to SYSADMIN which will then be used to create notebooks. Please choose a role that has the privilege of creating a table in a schema. 18 | 19 | ```sql 20 | ------------------ 21 | -- DEMO STEP #1 -- 22 | ------------------ 23 | -- General setup 24 | use role accountadmin; 25 | create database public; 26 | create schema notebooks; 27 | 28 | grant usage on database public to role sysadmin; 29 | grant usage on schema public.notebooks to role sysadmin; 30 | grant create stage on schema public.notebooks to role sysadmin; 31 | grant create notebook on schema public.notebooks to role sysadmin; 32 | grant create service on schema public.notebooks to role sysadmin; 33 | grant usage on warehouse compute_wh to role sysadmin; 34 | 35 | -- Create and grant access to compute pools 36 | CREATE COMPUTE POOL CPU_XS_5_NODES 37 | MIN_NODES = 1 38 | MAX_NODES = 5 39 | INSTANCE_FAMILY = CPU_X64_XS; 40 | 41 | CREATE COMPUTE POOL GPU_S_5_NODES 42 | MIN_NODES = 1 43 | MAX_NODES = 5 44 | INSTANCE_FAMILY = GPU_NV_S; 45 | 46 | grant usage on compute pool CPU_XS_5_NODES to role sysadmin; 47 | grant usage on compute pool GPU_S_5_NODES to role sysadmin; 48 | 49 | -- Create and grant access to EAIs 50 | -- Substep #1: create network rules (these are schema-level objects; end users do not need direct access to the network rules) 51 | 52 | create network rule allow_all_rule 53 | TYPE = 'HOST_PORT' 54 | MODE= 'EGRESS' 55 | VALUE_LIST = ('0.0.0.0:443','0.0.0.0:80'); 56 | 57 | -- Substep #2: create external access integration (these are account-level objects; end users need access to this to access the public internet with endpoints defined in network rules) 58 | 59 | CREATE EXTERNAL ACCESS INTEGRATION allow_all_integration 60 | ALLOWED_NETWORK_RULES = (allow_all_rule) 61 | ENABLED = true; 62 | 63 | CREATE OR REPLACE NETWORK RULE pypi_network_rule 64 | MODE = EGRESS 65 | TYPE = HOST_PORT 66 | VALUE_LIST = ('pypi.org', 'pypi.python.org', 'pythonhosted.org', 'files.pythonhosted.org'); 67 | 68 | CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION pypi_access_integration 69 | ALLOWED_NETWORK_RULES = (pypi_network_rule) 70 | ENABLED = true; 71 | 72 | Grant USAGE ON INTEGRATION allow_all_integration to ROLE sysadmin; 73 | Grant USAGE ON INTEGRATION pypi_access_integration to ROLE sysadmin; 74 | ``` 75 | 76 | Additional information about compute pool configurations is available in [the documentation](https://docs.snowflake.com/developer-guide/snowpark-container-services/working-with-compute-pool). 77 | 78 | ### Step 2 - Upload `diamonds.csv` Data 79 | 80 | Next, we will upload the [diamonds.csv](https://github.com/Snowflake-Labs/snowflake-demo-notebooks/tree/main/Getting%20Started%20with%20Container%20Runtimes/diamonds.csv) dataset included in this git repo. 81 | 82 | In Snowsight, navigate to **Data >> Databases** and select the database.schema where the role has privileges to create a table. For example, we'll be using `SYSADMIN` to upload the dataset and create a table out of it in the schema `PUBLIC.NOTEBOOKS`. 
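If you prefer to script this step rather than click through the UI, a rough SQL sketch is shown below. Treat it as a sketch only: the stage name `DIAMONDS_STAGE` is made up for illustration, and the column list assumes the standard diamonds layout (carat, cut, color, clarity, depth, table, price, x, y, z), so adjust both to match your environment and the actual file.

```sql
-- Hypothetical scripted alternative to the Snowsight upload described in this step.
-- Assumes the SYSADMIN role and the PUBLIC.NOTEBOOKS schema created in Step 1.
USE ROLE SYSADMIN;
USE SCHEMA PUBLIC.NOTEBOOKS;

-- Stage name is illustrative only
CREATE STAGE IF NOT EXISTS DIAMONDS_STAGE;

-- Run the PUT from a local SnowSQL / Snowflake CLI session (PUT is client-side):
-- PUT file:///path/to/diamonds.csv @DIAMONDS_STAGE AUTO_COMPRESS=TRUE;

-- Column list assumes the standard diamonds dataset; verify against diamonds.csv
CREATE TABLE IF NOT EXISTS DIAMONDS (
    CARAT FLOAT, CUT STRING, COLOR STRING, CLARITY STRING,
    DEPTH FLOAT, "TABLE" FLOAT,   -- TABLE is a reserved word, hence the quotes
    PRICE NUMBER, X FLOAT, Y FLOAT, Z FLOAT
);

COPY INTO DIAMONDS
  FROM @DIAMONDS_STAGE/diamonds.csv.gz
  FILE_FORMAT = (TYPE = CSV SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '"');
```

Otherwise, continue with the Snowsight flow: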
Select **Create >> Table >> From File >> Standard** in the top right, and upload the `diamonds.csv` dataset. 83 | ![diamonds upload](./assets/diamonds_upload.png) 84 | 85 | ### Step 3 - Import the `getting_started_with_container_runtimes.ipynb` file and create a notebook 86 | 87 | Using the `SYSADMIN` role, navigate to the **Notebooks** page on Snowsight, and select the upload button to `Import .ipynb file`. 88 | ![notebook upload](./assets/notebook_upload.png) 89 | 90 | Fill out the creation dialog using the schema, warehouse, and compute pool set up in Step #1. 91 | ![notebook setup](./assets/notebook_setup.png) 92 | 93 | ### Step 4 - Attach External Access Integrations (EAIs) 94 | 95 | Navigate to the notebook settings via the three dots in the top right-hand corner, and select the External Accesses tab. Toggle on the `allow_all_integration` EAI. 96 | ![configure EAI](./assets/eai.png) 97 | 98 | ### Step 5 - Run the notebook! 99 | 100 | You're now ready to run the notebook! Check out the Notebook Markdown cells for an explanation of what is happening at each step along the way. 101 | 102 | ## Additional Resources 103 | - [Documentation](https://docs.snowflake.com/LIMITEDACCESS/snowsight-notebooks/ui-snowsight-notebooks-runtime) 104 | - [YouTube Tutorials](https://www.youtube.com/playlist?list=PLavJpcg8cl1Efw8x_fBKmfA2AMwjUaeBI) 105 | - [GitHub repo](https://github.com/Snowflake-Labs/snowflake-demo-notebooks) of more example notebooks -------------------------------------------------------------------------------- /Getting Started with Container Runtimes/assets/diamonds_upload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Getting Started with Container Runtimes/assets/diamonds_upload.png -------------------------------------------------------------------------------- /Getting Started with Container Runtimes/assets/eai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Getting Started with Container Runtimes/assets/eai.png -------------------------------------------------------------------------------- /Getting Started with Container Runtimes/assets/notebook_setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Getting Started with Container Runtimes/assets/notebook_setup.png -------------------------------------------------------------------------------- /Getting Started with Container Runtimes/assets/notebook_upload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Getting Started with Container Runtimes/assets/notebook_upload.png -------------------------------------------------------------------------------- /Getting Started with Container Runtimes/getting_started_with_container_runtimes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "07e67d82-cb27-4518-b025-b74c117c5637", 6 | "metadata": { 7 | "collapsed": false, 8 | "name": "cell1" 9 | }, 10 | "source": [ 11 | "# Welcome to the 
Notebooks Container Runtime!\n", 12 | "\n", 13 | "Make sure you've completed all of the setup instructions outlined in the [README]() file prior to running this Notebook.\n", 14 | "\n", 15 | "- Have you uploaded the data?\n", 16 | "- Have you configured the EAI?\n", 17 | "\n", 18 | "If so, proceed!" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "2a609d6f-f3de-4b32-9731-1411db287f9f", 25 | "metadata": { 26 | "collapsed": false, 27 | "language": "python", 28 | "name": "cell2" 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import warnings\n", 33 | "warnings.filterwarnings(\"ignore\")\n", 34 | "\n", 35 | "from snowflake.snowpark.context import get_active_session\n", 36 | "session = get_active_session()\n", 37 | "# Add a query tag to the session. This helps with troubleshooting and performance monitoring.\n", 38 | "session.query_tag = {\"origin\":\"sf_sit-is\", \n", 39 | " \"name\":\"aiml_notebooks_xgboost_on_gpu\", \n", 40 | " \"version\":{\"major\":1, \"minor\":0},\n", 41 | " \"attributes\":{\"is_quickstart\":1, \"source\":\"notebook\"}}" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "e6b51bc3-e121-4b6c-a84f-20f04eb1f28a", 48 | "metadata": { 49 | "collapsed": false, 50 | "language": "python", 51 | "name": "cell3" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "!pip freeze" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "507dda4f-a92a-4144-b715-3c9a5b994eb7", 61 | "metadata": { 62 | "collapsed": false, 63 | "name": "cell4" 64 | }, 65 | "source": [ 66 | "Notebooks Container Runtime, along with External Access Integrations give us the flexibility to `pip install` packages from anywhere, including popular package repositories such as pypi. You can install whatever packages you need by running `!pip install ` directly in the Notebook." 
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "01982269-5dac-46a6-8af6-2b495e65862f", 73 | "metadata": { 74 | "language": "python", 75 | "name": "cell5" 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "!pip install seaborn" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "ce5d7e1e-2323-428b-ad5d-dbab1b0f34a8", 85 | "metadata": { 86 | "name": "cell6" 87 | }, 88 | "source": [ 89 | "Just like Notebooks on the Warehouse Runtime, we can intermingle both SQL and Python cells:" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "78126cdd-9f6e-4524-ac92-b12d915255ae", 96 | "metadata": { 97 | "collapsed": false, 98 | "language": "sql", 99 | "name": "cell7" 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "show tables;" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "id": "b43cb438-746d-476d-8d00-a5fc4cd67648", 109 | "metadata": { 110 | "collapsed": false, 111 | "name": "cell8" 112 | }, 113 | "source": [ 114 | "Let's visualize some of our data using the `seaborn` package that we installed above:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "23f0f888-3d70-42c4-9071-bc366c861a52", 121 | "metadata": { 122 | "collapsed": false, 123 | "language": "python", 124 | "name": "cell9" 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "diamonds_df = session.table(\"DIAMONDS\")\n", 129 | "diamonds_df.show()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "63e2849a-df59-45d2-81e1-14b7880601fc", 136 | "metadata": { 137 | "language": "python", 138 | "name": "cell10" 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "df = diamonds_df.to_pandas()\n", 143 | "\n", 144 | "import seaborn as sns\n", 145 | "\n", 146 | "# Create a visualization\n", 147 | "sns.histplot(\n", 148 | " data=df,\n", 149 | " x=\"PRICE\"\n", 150 | ")" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "3020ac4d-058f-49aa-9686-ca0558d1a97b", 156 | "metadata": { 157 | "collapsed": false, 158 | "name": "cell11" 159 | }, 160 | "source": [ 161 | "Now, let's train a basic `XGBRegressor` machine learning model. The ML Container Runtime for Snowflake Notebooks includes pre-installed common packages for doing machine learning tasks, including SnowparkML and other OSS packages." 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "id": "53aad007-803a-4120-b227-596caa842cba", 168 | "metadata": { 169 | "language": "python", 170 | "name": "cell12" 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "import time\n", 175 | "from snowflake.ml.modeling.xgboost import XGBRegressor\n", 176 | "\n", 177 | "CATEGORICAL_COLUMNS = [\"CUT\", \"COLOR\", \"CLARITY\"]\n", 178 | "NUMERICAL_COLUMNS = [\"CARAT\", \"DEPTH\", \"X\", \"Y\", \"Z\"]\n", 179 | "LABEL_COLUMNS = ['PRICE']\n", 180 | "diamonds_df = session.table(\"diamonds\")\n", 181 | "\n", 182 | "model = XGBRegressor(max_depth=400, input_cols=NUMERICAL_COLUMNS, label_cols=LABEL_COLUMNS)\n", 183 | "\n", 184 | "t0 = time.time()\n", 185 | "model.fit(diamonds_df)\n", 186 | "\n", 187 | "t1 = time.time()\n", 188 | "\n", 189 | "print(f\"Fit in {t1-t0} seconds.\")" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "id": "159e14c5", 195 | "metadata": { 196 | "name": "cell13" 197 | }, 198 | "source": [ 199 | "SnowparkML on the container runtime automatically captures various logs and metrics associated with your training job. 
We can run some quick functions to fetch, print, or even visualize those metrics:" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "63126a6a-3c5c-4877-8b69-3e31e65e6587", 206 | "metadata": { 207 | "language": "python", 208 | "name": "cell14" 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "# utils\n", 213 | "import requests\n", 214 | "\n", 215 | "### Get logs depending on type\n", 216 | "def fetch_log(log_type):\n", 217 | " file_path = f'/var/log/managedservices/{log_type}/mlrs/logs-mlrs.log'\n", 218 | " with open(file_path, 'r') as file:\n", 219 | " # Read the contents of the file\n", 220 | " file_contents = file.read()\n", 221 | " return file_contents\n", 222 | "\n", 223 | "### Get response text\n", 224 | "def fetch_metrics(port):\n", 225 | " metrics_url = f\"http://localhost:{port}/metrics\"\n", 226 | " response = requests.get(metrics_url)\n", 227 | " return response.text\n", 228 | "\n", 229 | "def list_mlrs_metrics():\n", 230 | " txt = fetch_metrics(11501)\n", 231 | " metrics_name_and_value = {}\n", 232 | " for line in txt.split(\"\\n\")[:-1]:\n", 233 | " if not line.startswith(\"#\"):\n", 234 | " tokens = line.split(\" \")\n", 235 | " name, value = tokens[0], tokens[1]\n", 236 | " metrics_name_and_value[name] = value\n", 237 | " elif line.startswith(\"# HELP\"):\n", 238 | " tokens = line.split(\" \")\n", 239 | " return metrics_name_and_value" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "c655b9b5-f07f-4906-9530-761145ded013", 246 | "metadata": { 247 | "language": "python", 248 | "name": "cell15" 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "print(\"train attempt\", list_mlrs_metrics()['train_attempts_total'])" 253 | ] 254 | } 255 | ], 256 | "metadata": { 257 | "kernelspec": { 258 | "display_name": "Streamlit Notebook", 259 | "name": "streamlit" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 5 264 | } 265 | -------------------------------------------------------------------------------- /Getting started with Snowpark using Snowflake Notebooks/Getting Started with Snowpark using Snowflake notebooks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e41f588b", 6 | "metadata": {}, 7 | "source": [ 8 | "## This repo has been moved\n", 9 | "\n", 10 | "Visit [this Github repo](https://github.com/Snowflake-Labs/sfguide-getting-started-with-snowpark-in-worksheets-notebooks) to see the full quickstart source code." 
11 | ] 12 | } 13 | ], 14 | "metadata": { 15 | "kernelspec": { 16 | "display_name": "Streamlit Notebook", 17 | "name": "streamlit" 18 | } 19 | }, 20 | "nbformat": 4, 21 | "nbformat_minor": 5 22 | } 23 | -------------------------------------------------------------------------------- /Getting started with Snowpark using Snowflake Notebooks/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - snowflake=0.8.0 6 | -------------------------------------------------------------------------------- /Hyperparameter Tuning with sklearn/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - scikit-learn=1.3.0 6 | - python=3.8.* 7 | - snowbooks=1.27.0 8 | - streamlit=1.26.0 9 | -------------------------------------------------------------------------------- /Image_Processing_Pipeline_Stream_Task_Cortex_Complete/Image_Processing_Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | }, 7 | "lastEditStatus": { 8 | "notebookId": "n54d2mm74cvdxf25chvs", 9 | "authorId": "94022846931", 10 | "authorName": "DASH", 11 | "authorEmail": "dash.desai@snowflake.com", 12 | "sessionId": "f4f1ed7a-3ad8-43ab-9e3f-102f3f6fd367", 13 | "lastEditTime": 1744728063667 14 | } 15 | }, 16 | "nbformat_minor": 5, 17 | "nbformat": 4, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "id": "28916a15-ea2d-47ca-8d1f-75dc395fdcae", 22 | "metadata": { 23 | "name": "Overview", 24 | "collapsed": false 25 | }, 26 | "source": "# Image Processing Pipeline using Snowflake Cortex\n\nThis notebooks demonstrates the implementation of an image processing pipeline using [Streams](https://docs.snowflake.com/en/user-guide/streams-intro), [Tasks](https://docs.snowflake.com/en/user-guide/tasks-intro) and [SNOWFLAKE.CORTEX.COMPLETE multimodal](https://docs.snowflake.com/en/sql-reference/functions/complete-snowflake-cortex-multimodal) capability. (*Currently in Public Preview.*)" 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "db0e5507-9aa1-4115-a642-65709994bad5", 31 | "metadata": { 32 | "name": "_Step1", 33 | "collapsed": false 34 | }, 35 | "source": "Step 1: Create Snowflake managed stage to store sample images." 36 | }, 37 | { 38 | "cell_type": "code", 39 | "id": "0eb15096-8d11-48b2-abc3-0250ed43c599", 40 | "metadata": { 41 | "language": "sql", 42 | "name": "Create_Stage" 43 | }, 44 | "outputs": [], 45 | "source": "CREATE stage GENAI_IMAGES encryption = (TYPE = 'SNOWFLAKE_SSE') directory = ( ENABLE = true );", 46 | "execution_count": null 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "e5ebef76-111f-4652-b301-586a9fb1ea7b", 51 | "metadata": { 52 | "name": "_Step2", 53 | "collapsed": false 54 | }, 55 | "source": "Step 2: Download two sample images provided below and upload them on stage `GENAI_IMAGES`. 
[Learn how](https://docs.snowflake.com/en/user-guide/data-load-local-file-system-stage-ui?_fsi=oZm563yp&_fsi=oZm563yp#upload-files-onto-a-named-internal-stage)\n\nSample images:\n- https://sfquickstarts.s3.us-west-1.amazonaws.com/misc/images/other/sample-img-1.png\n- https://sfquickstarts.s3.us-west-1.amazonaws.com/misc/images/other/sample-img-2.jpg\n\n\n*Note: Sample images provided courtesy of [Dash](https://natureunraveled.com/).*" 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "21d0374d-5467-4922-8fa5-e118ca0e5310", 60 | "metadata": { 61 | "name": "_Step3", 62 | "collapsed": false 63 | }, 64 | "source": "Step 3: Create Stream `images_stream` on stage `GENAI_IMAGES` to detect changes." 65 | }, 66 | { 67 | "cell_type": "code", 68 | "id": "7b1d037f-d0f4-44e1-8443-afd4da31face", 69 | "metadata": { 70 | "language": "sql", 71 | "name": "Create_Stream" 72 | }, 73 | "outputs": [], 74 | "source": "CREATE OR REPLACE STREAM images_stream ON STAGE GENAI_IMAGES;", 75 | "execution_count": null 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "15a8d1c1-449e-4e26-8435-b2c19affe343", 80 | "metadata": { 81 | "name": "_Step4", 82 | "collapsed": false 83 | }, 84 | "source": "Step 4: Create target table `image_analysis` to store image analysis." 85 | }, 86 | { 87 | "cell_type": "code", 88 | "id": "917a7304-f0d1-4445-a91e-8b355c8b2db1", 89 | "metadata": { 90 | "language": "sql", 91 | "name": "Create_Target_Table" 92 | }, 93 | "outputs": [], 94 | "source": "CREATE OR REPLACE TABLE image_analysis \nas \nSELECT RELATIVE_PATH,SNOWFLAKE.CORTEX.COMPLETE('pixtral-large',\n 'Put image filename in an attribute called \"Image.\"\n Put a short title in title case in an attribute called \"Title\".\n Put a 200-word detailed summary summarizing the image in an attribute called \"Summary\"', \n TO_FILE('@GENAI_IMAGES', RELATIVE_PATH)) as image_classification \nfrom directory(@GENAI_IMAGES);", 95 | "execution_count": null 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "53594c24-762c-48d1-8572-c3f17a98a1e2", 100 | "metadata": { 101 | "name": "_step5", 102 | "collapsed": false 103 | }, 104 | "source": "Step 5: Preview image analysis produced on the sample images" 105 | }, 106 | { 107 | "cell_type": "code", 108 | "id": "d11b5868-3892-447a-bd54-cd58932ead67", 109 | "metadata": { 110 | "language": "sql", 111 | "name": "Preview_Images" 112 | }, 113 | "outputs": [], 114 | "source": "select * from image_analysis;", 115 | "execution_count": null 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "565ef0dd-9ed7-4deb-b2ea-1710a6449ca8", 120 | "metadata": { 121 | "name": "_Step6", 122 | "collapsed": false 123 | }, 124 | "source": "Step 6: Create Task `image_analysis_task` to process new images uploaded on stage `GENAI_IMAGES` using SNOWFLAKE.CORTEX.COMPLETE() multimodal capability." 
125 | }, 126 | { 127 | "cell_type": "code", 128 | "id": "d80b2f3e-c82e-4281-8ef0-4897bcae5d86", 129 | "metadata": { 130 | "language": "sql", 131 | "name": "Create_Task" 132 | }, 133 | "outputs": [], 134 | "source": "CREATE OR REPLACE TASK image_analysis_task\nSCHEDULE = '1 minute'\nWHEN\n SYSTEM$STREAM_HAS_DATA('images_stream')\nAS\n INSERT INTO image_analysis (RELATIVE_PATH, image_classification)\n SELECT RELATIVE_PATH,SNOWFLAKE.CORTEX.COMPLETE('pixtral-large',\n 'Put image filename in an attribute called \"Image.\"\n Put a short title in title case in an attribute called \"Title\".\n Put a 200-word detailed summary summarizing the image in an attribute called \"Summary\"', \n TO_FILE('@GENAI_IMAGES', RELATIVE_PATH)) as image_classification \n from images_stream;\n\n-- NOTE: Tasks are suspended by default so let's resume it.\nALTER TASK image_analysis_task RESUME;", 135 | "execution_count": null 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "5fc732cd-b4d1-4487-a877-b7507519aa8a", 140 | "metadata": { 141 | "name": "_Step7", 142 | "collapsed": false 143 | }, 144 | "source": "Step 7: Confirm Task status " 145 | }, 146 | { 147 | "cell_type": "code", 148 | "id": "1b629f24-ab24-4ce8-bdd4-936d82d83b00", 149 | "metadata": { 150 | "language": "sql", 151 | "name": "Task_Status" 152 | }, 153 | "outputs": [], 154 | "source": "SHOW TASKS like 'image_analysis_task';", 155 | "execution_count": null 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "2fb915bd-c5ed-4be8-8863-5a8d71e3e344", 160 | "metadata": { 161 | "name": "_Step8", 162 | "collapsed": false 163 | }, 164 | "source": "Step 8: Download new sample image provided below and upload it on stage `GENAI_IMAGES`. [Learn how](https://docs.snowflake.com/en/user-guide/data-load-local-file-system-stage-ui?_fsi=oZm563yp&_fsi=oZm563yp#upload-files-onto-a-named-internal-stage)\n\nSample image:\n- https://sfquickstarts.s3.us-west-1.amazonaws.com/misc/images/other/sample-img-3.jpg\n\n*Note: Sample image provided courtesy of [Dash](https://natureunraveled.com/).*" 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "ae0b6047-de5a-43f4-bdb5-7b6dee3345ac", 169 | "metadata": { 170 | "name": "_Step9", 171 | "collapsed": false 172 | }, 173 | "source": "Step 9: Preview image analysis produced on the new sample image" 174 | }, 175 | { 176 | "cell_type": "code", 177 | "id": "e66b4b64-3987-4d54-af94-bbdb9eea3765", 178 | "metadata": { 179 | "language": "sql", 180 | "name": "Preview_New_Image" 181 | }, 182 | "outputs": [], 183 | "source": "select * from image_analysis;", 184 | "execution_count": null 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "11acad0a-209b-4538-b447-ad57dd9c1d2e", 189 | "metadata": { 190 | "name": "_Step10", 191 | "collapsed": false 192 | }, 193 | "source": "Step 10: Suspend task" 194 | }, 195 | { 196 | "cell_type": "code", 197 | "id": "6e8ff070-38b7-4f60-88b6-b21e2113d8d4", 198 | "metadata": { 199 | "language": "sql", 200 | "name": "Suspend_Task" 201 | }, 202 | "outputs": [], 203 | "source": "ALTER TASK image_analysis_task SUSPEND;", 204 | "execution_count": null 205 | } 206 | ] 207 | } -------------------------------------------------------------------------------- /Image_Processing_Pipeline_Stream_Task_Cortex_Complete/Image_Processing_Pipeline.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Image_Processing_Pipeline_Stream_Task_Cortex_Complete/Image_Processing_Pipeline.pdf -------------------------------------------------------------------------------- /Import Package from Stage/Import Package from Stage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a78daa85-b3fa-4dd6-bde7-38371c64c08d", 6 | "metadata": { 7 | "collapsed": false, 8 | "name": "cell1" 9 | }, 10 | "source": [ 11 | "# Import custom package from stage into notebook\n", 12 | "\n", 13 | "If the Python package that you are looking to use is not available in Anaconda, then you can upload the package to a stage and import the package from stage. Here we show a simple example of importing a custom package into a notebook.\n", 14 | "\n", 15 | "| Feature | Availability |\n", 16 | "| -------------- | --------------|\n", 17 | "| Preview Feature — Private | Support for this feature is currently not in production and is available only to selected accounts. |" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "c8b3a287-6cb3-4525-b0ed-a2188d37993c", 23 | "metadata": { 24 | "collapsed": false, 25 | "name": "cell2" 26 | }, 27 | "source": [ 28 | "# Example Package\n", 29 | "\n", 30 | "Here is the Python package used in this example. It is a simple package with a single Python code file. You can download the `simple.zip` package [here](https://github.com/Snowflake-Labs/snowflake-demo-notebooks/tree/main/Import%20Package%20from%20Stage/simple.zip).\n", 31 | "\n", 32 | "## Create a test package\n", 33 | "```bash\n", 34 | "mkdir simple\n", 35 | "touch simple/__init__.py\n", 36 | "cat >> simple/__init__.py # Paste the source below.\n", 37 | "zip -r simple simple\n", 38 | "```\n", 39 | "\n", 40 | "Inside `simple/__init__.py`, we create a simple package that returns Hello world: \n", 41 | "\n", 42 | "```python\n", 43 | "import streamlit as st\n", 44 | "\n", 45 | "def greeting():\n", 46 | " return \"Hello world!\"\n", 47 | "\n", 48 | "def hi():\n", 49 | " st.write(greeting())\n", 50 | "```\n", 51 | "\n", 52 | "\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "f36e4fd2-1c4b-4fec-8419-73036fd40d04", 58 | "metadata": { 59 | "collapsed": false, 60 | "name": "cell3" 61 | }, 62 | "source": [ 63 | "# Upload Package to Stage\n", 64 | "\n", 65 | "Next, we create a stage to upload the `simple.zip` package." 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "ee92159e-eaa4-4eb2-a606-12003ae2ba43", 72 | "metadata": { 73 | "codeCollapsed": false, 74 | "collapsed": false, 75 | "language": "sql", 76 | "name": "cell4" 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "-- create a stage for the package.\n", 81 | "CREATE STAGE IF NOT EXISTS MY_PACKAGES;\n", 82 | "-- assign Query Tag to Session. This helps with performance monitoring and troubleshooting\n", 83 | "ALTER SESSION SET query_tag = '{\"origin\":\"sf_sit-is\",\"name\":\"notebook_demo_pack\",\"version\":{\"major\":1, \"minor\":0},\"attributes\":{\"is_quickstart\":0, \"source\":\"sql\", \"vignette\":\"import_package_stage\"}}';" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "35e0da06-7c20-410a-a66d-960cb0fa09a7", 89 | "metadata": { 90 | "collapsed": false, 91 | "name": "cell5" 92 | }, 93 | "source": [ 94 | "To upload the file to stage, you can run the following command. 
\n", 95 | "\n", 96 | "Using [snowcli](https://github.com/snowflakedb/snowflake-cli):\n", 97 | "\n", 98 | "```bash\n", 99 | "snow snowpark package upload --file simple.zip --stage MY_PACKAGES --overwrite\n", 100 | "```\n", 101 | "Alternatively, using [snowsql](https://docs.snowflake.com/en/user-guide/snowsql):\n", 102 | "\n", 103 | "```bash\n", 104 | "snowsql -q \"PUT file://simple.zip @MY_PACKAGES AUTO_COMPRESS=FALSE OVERWRITE=TRUE\"\n", 105 | "\n", 106 | "```\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "16bb85c1-e3ac-45af-833e-51c84bb031c6", 113 | "metadata": { 114 | "codeCollapsed": false, 115 | "language": "sql", 116 | "name": "cell6" 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "LS @MY_PACKAGES;" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "2ecef987-0162-407e-b739-3c38613253d7", 126 | "metadata": { 127 | "collapsed": false, 128 | "name": "cell7" 129 | }, 130 | "source": [ 131 | "## Upload the package using the Package Picker UI\n", 132 | "\n", 133 | "Now that the `simple.zip` package is on the stage, we can specify the path to this package in the Package Picker. \n", 134 | "\n", 135 | "- Click on the `Packages` dropdown \n", 136 | "- Navigate to `Stage Packages` tab\n", 137 | "- Enter the Stage Package Path as `@<database>.<schema>.my_packages/simple.zip` (all lowercase) where `<database>.<schema>` is the actual namespace of the stage " 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "b38bb25b-5e17-4b70-bcd4-f602fe7554bd", 144 | "metadata": { 145 | "codeCollapsed": false, 146 | "collapsed": false, 147 | "language": "python", 148 | "name": "cell8" 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "import streamlit as st\n", 153 | "st.image(\"https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/main/Import%20Package%20from%20Stage/package_from_stage.png\")" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "c0c1b8dd-b690-42a4-a330-0369e27f5d47", 159 | "metadata": { 160 | "name": "cell9" 161 | }, 162 | "source": [ 163 | "Now that this package is uploaded and you have restarted your notebook session, you can import the `simple` package." 
164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "d576bf8f-92cd-4012-9aa3-af2ef5795c6c", 170 | "metadata": { 171 | "codeCollapsed": false, 172 | "collapsed": false, 173 | "language": "python", 174 | "name": "cell10" 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "import simple" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "f2f327b4-b48a-4936-a671-87f81ac0748a", 185 | "metadata": { 186 | "codeCollapsed": false, 187 | "collapsed": false, 188 | "language": "python", 189 | "name": "cell11" 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "simple.hi()" 194 | ] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Streamlit Notebook", 200 | "name": "streamlit" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 5 205 | } 206 | -------------------------------------------------------------------------------- /Import Package from Stage/package_from_stage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Import Package from Stage/package_from_stage.png -------------------------------------------------------------------------------- /Import Package from Stage/simple.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Import Package from Stage/simple.zip -------------------------------------------------------------------------------- /Import Package from Stage/simple/__init__.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | def greeting(): 4 | return "Hello world!" 5 | 6 | def hi(): 7 | st.write(greeting()) 8 | 9 | -------------------------------------------------------------------------------- /Intro to Snowpark pandas/Intro to Snowpark pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "9997baf8", 6 | "metadata": {}, 7 | "source": [ 8 | "## This repo has been moved\n", 9 | "\n", 10 | "Visit [this Github repo](https://github.com/Snowflake-Labs/sfguide-getting-started-with-pandas-on-snowflake) to see the full quickstart source code." 
11 | ] 12 | } 13 | ], 14 | "metadata": { 15 | "kernelspec": { 16 | "display_name": "Python 3 (ipykernel)", 17 | "language": "python", 18 | "name": "python3" 19 | }, 20 | "language_info": { 21 | "codemirror_mode": { 22 | "name": "ipython", 23 | "version": 3 24 | }, 25 | "file_extension": ".py", 26 | "mimetype": "text/x-python", 27 | "name": "python", 28 | "nbconvert_exporter": "python", 29 | "pygments_lexer": "ipython3", 30 | "version": "3.9.19" 31 | } 32 | }, 33 | "nbformat": 4, 34 | "nbformat_minor": 5 35 | } 36 | -------------------------------------------------------------------------------- /Intro to Snowpark pandas/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - modin=0.28.1 6 | - pandas=2.2.1 7 | -------------------------------------------------------------------------------- /Load CSV from S3/Load CSV from S3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "13f35857-7833-4c7a-820b-421f7156fc94", 6 | "metadata": { 7 | "collapsed": false, 8 | "name": "cell1" 9 | }, 10 | "source": [ 11 | "# How to load CSV files from stage to Snowflake Notebooks 📁\n", 12 | "\n", 13 | "In this example, we will show how you can load a CSV file from stage and create a table with Snowpark. \n", 14 | "\n", 15 | "First, let's use the `get_active_session` command to get the [session](https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/latest/api/snowflake.snowpark.Session#snowflake.snowpark.Session) context variable to work with Snowpark as follows:" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "4babf2c9-2d53-48dc-9b2e-07cda9bcc03c", 22 | "metadata": { 23 | "codeCollapsed": false, 24 | "collapsed": false, 25 | "language": "python", 26 | "name": "cell2" 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "from snowflake.snowpark.context import get_active_session\n", 31 | "session = get_active_session()\n", 32 | "# Add a query tag to the session. This helps with troubleshooting and performance monitoring.\n", 33 | "session.query_tag = {\"origin\":\"sf_sit-is\", \n", 34 | " \"name\":\"notebook_demo_pack\", \n", 35 | " \"version\":{\"major\":1, \"minor\":0},\n", 36 | " \"attributes\":{\"is_quickstart\":1, \"source\":\"notebook\", \"vignette\":\"csv_from_s3\"}}\n", 37 | "print(session)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "b8151396-3ae3-4991-8ef0-be82fc33f363", 43 | "metadata": { 44 | "collapsed": false, 45 | "name": "cell3" 46 | }, 47 | "source": [ 48 | "Next, we will create an [external stage](https://docs.snowflake.com/en/sql-reference/sql/create-stage) that references data files stored in a location outside of Snowflake, in this case, the data lives in a [S3 bucket](https://docs.snowflake.com/en/user-guide/data-load-s3-create-stage)." 
49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "f7d7f866-a698-457f-8bd0-4deff26ba329", 55 | "metadata": { 56 | "codeCollapsed": false, 57 | "collapsed": false, 58 | "language": "sql", 59 | "name": "cell4" 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "CREATE STAGE IF NOT EXISTS TASTYBYTE_STAGE \n", 64 | "\tURL = 's3://sfquickstarts/frostbyte_tastybytes/';" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "614a9f59-b202-4102-81e8-192b66b656fd", 70 | "metadata": { 71 | "collapsed": false, 72 | "name": "cell5" 73 | }, 74 | "source": [ 75 | "Let's take a look at the files in the stage." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "18fdb36a-f3f6-46b0-92db-e06a28b14867", 82 | "metadata": { 83 | "codeCollapsed": false, 84 | "collapsed": false, 85 | "language": "sql", 86 | "name": "cell6" 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "LS @TASTYBYTE_STAGE/app/app_orders;" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "9feb2dbb-8752-41c1-bd88-f2075e89f4ea", 96 | "metadata": { 97 | "collapsed": false, 98 | "name": "cell7" 99 | }, 100 | "source": [ 101 | "We can use [Snowpark DataFrameReader](https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/1.14.0/api/snowflake.snowpark.DataFrameReader) to read in the CSV file.\n", 102 | "\n", 103 | "By using the `infer_schema = True` option, Snowflake will automatically infer the schema based on data types present in CSV file, so that you don't need to specify the schema beforehand. " 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "2bf5c75a-b4e8-4212-a645-b8d63102757d", 110 | "metadata": { 111 | "codeCollapsed": false, 112 | "language": "python", 113 | "name": "cell8" 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "# Create a DataFrame that is configured to load data from the CSV file.\n", 118 | "df = session.read.options({\"infer_schema\":True}).csv('@TASTYBYTE_STAGE/app/app_orders/app_order_detail.csv.gz')" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "81196d0e-3979-46f1-b11d-871082171f61", 125 | "metadata": { 126 | "codeCollapsed": false, 127 | "language": "python", 128 | "name": "cell9" 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "df" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "94b0bc16-c31c-4cf0-8bf0-f2fdcdbfac0f", 138 | "metadata": { 139 | "collapsed": false, 140 | "name": "cell10" 141 | }, 142 | "source": [ 143 | "Now that the data is loaded into a Snowpark DataFrame, we can work with the data using [Snowpark DataFrame API](https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/latest/api/snowflake.snowpark.DataFrame). \n", 144 | "\n", 145 | "For example, I can compute descriptive statistics on the columns." 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "bac152b7-8c98-4e0a-9ecc-42f2c104f49d", 152 | "metadata": { 153 | "codeCollapsed": false, 154 | "language": "python", 155 | "name": "cell11" 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "df.describe()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "id": "b5ff2c51-66d9-4ca4-a060-0b40286ae37c", 165 | "metadata": { 166 | "collapsed": false, 167 | "name": "cell12" 168 | }, 169 | "source": [ 170 | "We can write the dataframe into a table called `APP_ORDER` and query it with SQL. 
" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "1f7b5940-47cb-438c-a666-817267b4bf39", 177 | "metadata": { 178 | "codeCollapsed": false, 179 | "collapsed": false, 180 | "language": "python", 181 | "name": "cell13" 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "df.write.mode(\"overwrite\").save_as_table(\"APP_ORDER\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "90e335b9-f60a-4971-aec8-288f0470340b", 192 | "metadata": { 193 | "codeCollapsed": false, 194 | "collapsed": false, 195 | "language": "sql", 196 | "name": "cell14" 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "-- Preview the newly created APP_ORDER table\n", 201 | "SELECT * from APP_ORDER;" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "id": "966f07d5-d246-49da-b133-6ab39fb0578d", 207 | "metadata": { 208 | "collapsed": false, 209 | "name": "cell15" 210 | }, 211 | "source": [ 212 | "Finally, we show how you can read the table back to Snowpark via the `session.table` syntax." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "id": "76dd9c74-019d-47ff-a462-10499503bace", 219 | "metadata": { 220 | "codeCollapsed": false, 221 | "collapsed": false, 222 | "language": "python", 223 | "name": "cell16" 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "df = session.table(\"APP_ORDER\")\n", 228 | "df" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "id": "ca22f85f-9073-44e6-a255-e34155b19bbb", 234 | "metadata": { 235 | "collapsed": false, 236 | "name": "cell17" 237 | }, 238 | "source": [ 239 | "From here, you can continue to query and process the data. " 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "2ff779a9-c9ba-434d-b098-2564b9b6e337", 246 | "metadata": { 247 | "codeCollapsed": false, 248 | "language": "python", 249 | "name": "cell18" 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "df.groupBy('\"c4\"').count()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "id": "792359f0-42fa-4639-b286-f8a8afeb1188", 260 | "metadata": { 261 | "codeCollapsed": false, 262 | "language": "sql", 263 | "name": "cell19" 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "-- Teardown table and stage created as part of this example\n", 268 | "DROP TABLE APP_ORDER;\n", 269 | "DROP STAGE TASTYBYTE_STAGE;" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "id": "d149c3c7-4a48-446e-a75f-beefc949790b", 275 | "metadata": { 276 | "collapsed": false, 277 | "name": "cell20" 278 | }, 279 | "source": [ 280 | "### Conclusion\n", 281 | "In this example, we took a look at how you can load a CSV file from an external stage to process and query the data in your notebook using Snowpark. You can learn more about how to work with your data using Snowpark Python [here](https://docs.snowflake.com/en/developer-guide/snowpark/python/index)." 
282 | ] 283 | } 284 | ], 285 | "metadata": { 286 | "kernelspec": { 287 | "display_name": "Streamlit Notebook", 288 | "name": "streamlit" 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 5 293 | } 294 | -------------------------------------------------------------------------------- /MFA_Audit_of_Users/demo_data.csv: -------------------------------------------------------------------------------- 1 | USER_ID,NAME,CREATED_ON,DELETED_ON,LOGIN_NAME,DISPLAY_NAME,FIRST_NAME,LAST_NAME,EMAIL,MUST_CHANGE_PASSWORD,HAS_PASSWORD,COMMENT,DISABLED,SNOWFLAKE_LOCK,DEFAULT_WAREHOUSE,DEFAULT_NAMESPACE,DEFAULT_ROLE,EXT_AUTHN_DUO,EXT_AUTHN_UID,HAS_MFA,BYPASS_MFA_UNTIL,LAST_SUCCESS_LOGIN,EXPIRES_AT,LOCKED_UNTIL_TIME,HAS_RSA_PUBLIC_KEY,PASSWORD_LAST_SET_TIME,OWNER,DEFAULT_SECONDARY_ROLE,TYPE 2 | 42,John Doe,2023-01-15 09:00:00,,john_doe,John D.,John,Doe,john.doe@example.com,FALSE,TRUE,"Senior Developer",FALSE,FALSE,COMPUTE_WH,ANALYTICS,SYSADMIN,FALSE,,TRUE,,2024-09-27 08:30:00,,,TRUE,2024-03-15 10:00:00,ACCOUNTADMIN,DEVELOPER,INTERNAL 3 | 255,Jane Smith,2023-02-20 10:30:00,,jane_smith,Jane S.,Jane,Smith,jane.smith@example.com,FALSE,TRUE,"Database Administrator",FALSE,FALSE,DBA_WH,PUBLIC,SECURITYADMIN,TRUE,jsmith123,TRUE,,2024-09-26 17:45:00,,,FALSE,2024-02-01 14:30:00,ACCOUNTADMIN,SYSADMIN,INTERNAL 4 | 578,Robert Johnson,2023-03-10 11:45:00,,robert_johnson,Rob J.,Robert,Johnson,robert.johnson@example.com,TRUE,TRUE,"Sales",FALSE,FALSE,SALES_WH,SALES,SALES_ROLE,FALSE,,FALSE,,2024-09-25 09:15:00,,,FALSE,2024-09-25 09:00:00,USERADMIN,,INTERNAL 5 | 890,Emily Brown,2023-04-05 13:15:00,2024-08-01 16:00:00,emily_brown,Emily B.,Emily,Brown,emily.brown@example.com,FALSE,TRUE,"HR Manager",TRUE,FALSE,HR_WH,HR,HR_ADMIN,FALSE,,TRUE,,2024-07-31 11:30:00,,,FALSE,2024-01-10 08:45:00,ACCOUNTADMIN,,INTERNAL 6 | 952,Michael Lee,2023-05-12 14:30:00,,michael_lee,Mike L.,Michael,Lee,michael.lee@example.com,FALSE,TRUE,"CFO",FALSE,FALSE,FINANCE_WH,FINANCE,FINANCE_ADMIN,TRUE,mlee456,TRUE,,2024-09-27 10:00:00,,,TRUE,2024-06-20 16:15:00,ACCOUNTADMIN,AUDITOR,INTERNAL 7 | 1205,Sarah Wilson,2023-06-18 09:45:00,,sarah_wilson,Sarah W.,Sarah,Wilson,sarah.wilson@example.com,FALSE,TRUE,"Data Analyst",FALSE,FALSE,ANALYST_WH,MARKETING,ANALYST,FALSE,,FALSE,,2024-09-26 14:20:00,,,FALSE,2024-04-05 11:00:00,USERADMIN,,INTERNAL 8 | 2506,David Taylor,2023-07-22 11:00:00,,david_taylor,Dave T.,David,Taylor,david.taylor@example.com,FALSE,TRUE,"Software Engineer",FALSE,FALSE,DEV_WH,DEVELOPMENT,DEVELOPER,FALSE,,TRUE,,2024-09-25 16:40:00,,,FALSE,2024-05-12 09:30:00,SYSADMIN,,INTERNAL 9 | 3789,Lisa Anderson,2023-08-30 10:15:00,,lisa_anderson,Lisa A.,Lisa,Anderson,lisa.anderson@example.com,FALSE,TRUE,"BI Specialist",FALSE,FALSE,BI_WH,BUSINESS_INTEL,BI_ROLE,TRUE,landerson789,TRUE,,2024-09-27 11:10:00,,,FALSE,2024-07-01 13:45:00,ACCOUNTADMIN,,INTERNAL 10 | 5050,James Martinez,2023-09-14 15:30:00,,james_martinez,James M.,James,Martinez,james.martinez@example.com,FALSE,TRUE,"QA Engineer",FALSE,FALSE,QA_WH,TESTING,QA_ROLE,FALSE,,FALSE,,2024-09-26 09:50:00,,,TRUE,2024-08-05 10:20:00,SYSADMIN,DEVELOPER,INTERNAL 11 | 5555,Olivia Garcia,2023-10-05 12:45:00,,olivia_garcia,Olivia G.,Olivia,Garcia,olivia.garcia@example.com,FALSE,TRUE,"HR Specialist",FALSE,FALSE,HR_WH,HR,HR_ROLE,FALSE,,TRUE,,2024-09-25 13:30:00,2025-10-05 12:45:00,,FALSE,2024-09-01 15:00:00,USERADMIN,,INTERNAL 12 | -------------------------------------------------------------------------------- /MFA_Audit_of_Users/environment.yml: 
-------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - modin=* 6 | - pandas=* 7 | -------------------------------------------------------------------------------- /Monitoring_Table_Size_with_Streamlit/Monitoring_Table_Size_with_Streamlit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "cc4fb15e-f9db-44eb-9f60-1b9589b755cb", 14 | "metadata": { 15 | "name": "md_title", 16 | "collapsed": false 17 | }, 18 | "source": "# Monitoring the Table Size in Snowflake Notebooks with Streamlit\n\nA notebook that tracks the size of specific tables over time to help developers monitor storage growth trends. \n\nHere's what we're implementing to investigate the tables:\n1. Retrieve the Top 100 largest tables\n2. Analyze query patterns on the largest tables\n3. Identify which tables are users interacting with" 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "42a7b143-0779-4706-affc-c214213f55c5", 23 | "metadata": { 24 | "name": "md_section1", 25 | "collapsed": false 26 | }, 27 | "source": "## 1. Retrieve the Top 100 largest tables\n\nThis query shows the top 100 largest tables, sorted by row count, including their size in GB, owners and last modification details." 28 | }, 29 | { 30 | "cell_type": "code", 31 | "id": "e17f14a5-ea50-4a1d-bc15-c64a6447d0a8", 32 | "metadata": { 33 | "language": "sql", 34 | "name": "sql_top_tables", 35 | "codeCollapsed": false, 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": "-- Top 100 largest tables with metrics\nSELECT \n CONCAT(TABLE_CATALOG, '.', TABLE_SCHEMA, '.', TABLE_NAME) AS FULLY_RESOLVED_TABLE_NAME,\n TABLE_OWNER,\n LAST_DDL,\n LAST_DDL_BY,\n ROW_COUNT,\n ROUND(BYTES / 1024 / 1024 / 1024, 2) AS SIZE_GB,\n LAST_ALTERED,\n CASE \n WHEN LAST_DDL <= DATEADD(DAY, -90, CURRENT_DATE) THEN 'YES' \n ELSE 'NO' \n END AS LAST_ACCESSED_90DAYS\nFROM SNOWFLAKE.ACCOUNT_USAGE.TABLES\nWHERE DELETED IS NULL\n AND ROW_COUNT > 0\n AND LAST_ACCESSED_90DAYS = 'NO'\nORDER BY ROW_COUNT DESC\nLIMIT 100;\n", 40 | "execution_count": null 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "26cf2c60-f4a0-493d-bb62-fbde9e4226b9", 45 | "metadata": { 46 | "name": "md_variable_info", 47 | "collapsed": false 48 | }, 49 | "source": "You can now run this query in Python without any additional code -- simply use your cell name as a variable! We're going to convert our cell to a pandas DataFrame below to make it easier to work with " 50 | }, 51 | { 52 | "cell_type": "code", 53 | "id": "ac2608a7-5cd1-45fb-bb89-17f1bf010b5f", 54 | "metadata": { 55 | "language": "python", 56 | "name": "sql_top_tables_pd", 57 | "codeCollapsed": false, 58 | "collapsed": false 59 | }, 60 | "outputs": [], 61 | "source": "sql_top_tables.to_pandas()", 62 | "execution_count": null 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "40d926ac-d441-4799-b56a-c200a13cbc09", 67 | "metadata": { 68 | "name": "md_section2", 69 | "collapsed": false 70 | }, 71 | "source": "## 2. Explore a specific table \n\nLet's explore one of these tables in greater detail to figure out the most common queries and who is using it most often. 
\n\n💡 **Pro tip:** You can interact with the below cell and select the fully resolved table name you want to explore more in your account!" 72 | }, 73 | { 74 | "cell_type": "code", 75 | "id": "50216adb-e5e2-4dd0-8b82-0e7dae07d27f", 76 | "metadata": { 77 | "language": "python", 78 | "name": "py_input", 79 | "collapsed": false, 80 | "codeCollapsed": false 81 | }, 82 | "outputs": [], 83 | "source": "import streamlit as st\n\nselection = st.text_input(label=\"Enter a fully resolved table path to explore\")", 84 | "execution_count": null 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "089287ef-efe4-423d-96ce-2ff4d53df21c", 89 | "metadata": { 90 | "name": "md_pass_variable", 91 | "collapsed": false 92 | }, 93 | "source": "Let's now pass that variable into a SQL query so we can grab query analytics on this table" 94 | }, 95 | { 96 | "cell_type": "code", 97 | "id": "7ad267bb-645d-4fa6-8e16-3666b2372fd8", 98 | "metadata": { 99 | "language": "sql", 100 | "name": "sql_most_expensive_queries_on_table", 101 | "collapsed": false, 102 | "codeCollapsed": false 103 | }, 104 | "outputs": [], 105 | "source": "-- Grab most expensive queries on this table \nSELECT \n '{{selection}}' as FULLY_RESOLVED_TABLE_NAME,\n q.QUERY_TEXT,\n q.QUERY_TYPE,\n SUM(CREDITS_USED_CLOUD_SERVICES) as CREDITS_USED,\n MAX(TOTAL_ELAPSED_TIME) as MAX_elapsed_time,\n AVG(TOTAL_ELAPSED_TIME)/1000 as AVG_EXECUTION_TIME_SEC\nFROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY q\nWHERE START_TIME >= CURRENT_DATE - interval '90 days'\n AND query_text LIKE '%{{selection}}%'\nGROUP BY ALL\nORDER BY AVG_EXECUTION_TIME_SEC DESC\nLIMIT 10", 106 | "execution_count": null 107 | }, 108 | { 109 | "cell_type": "code", 110 | "id": "14945658-f869-4047-b486-0a5456287948", 111 | "metadata": { 112 | "language": "python", 113 | "name": "py_visualization", 114 | "codeCollapsed": false, 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": "df = sql_most_expensive_queries_on_table.to_pandas()\nst.dataframe(df,\n column_config={\n \"CREDITS_USED\": st.column_config.ProgressColumn(\n \"CREDITS_USED\",\n format=\"%.4f\",\n min_value=df.CREDITS_USED.min(),\n max_value=df.CREDITS_USED.max(),\n ),\n },)", 119 | "execution_count": null 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "d80fe813-7fe3-48a7-a30b-eb0b3495d0f3", 124 | "metadata": { 125 | "name": "md_section3", 126 | "collapsed": false 127 | }, 128 | "source": "## 3. Find out which users most commonly query this table\n\nLet's say we want to take our top most expensive query and turn it into a materialization. Who will be the users who are most likely to be impacted by our activities? \n\nTo find out, we're going to grab the list of users who queried our table of interest in the last 90 days as well as the users who have executed the expensive query. We can then contact them when we make an update and tell them about improvements we made! 🎉 \n\n-----\n\nFirst, let's find out who has used our table in the last 90 days. 
We already have a variable `selection` we can use, so we're plugging it into the below query: " 129 | }, 130 | { 131 | "cell_type": "code", 132 | "id": "23866f56-0731-492e-8306-4f6fc28ddb6e", 133 | "metadata": { 134 | "language": "sql", 135 | "name": "py_user_queries", 136 | "codeCollapsed": false, 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": "-- Identify users who have queried selected table in last 90 days \nSELECT \n USER_NAME, \n COUNT(*) number_of_queries\nFROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY q\nWHERE START_TIME >= CURRENT_DATE - interval '90 days'\n AND query_text LIKE '%{{selection}}%'\nGROUP BY ALL\nORDER BY number_of_queries DESC\n", 141 | "execution_count": null 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "0aa5ad71-a360-4fbf-a9d3-868d1d7a329f", 146 | "metadata": { 147 | "name": "md_query_selection", 148 | "collapsed": false 149 | }, 150 | "source": "Now, let's say we want to materialize a specific long running query. Grab a query from the `py_visualization` cell from Section 2. \n\nWe can now plug it into the `QUERY_TEXT` value below to find out who else would benefit from materializing this pattern. \n\n💡 **Pro tip:** If the query is too long, try a unique subset of the query in the box below" 151 | }, 152 | { 153 | "cell_type": "code", 154 | "id": "a041825e-a1fa-4d80-9e2b-9426ee818023", 155 | "metadata": { 156 | "language": "python", 157 | "name": "py_query_selection", 158 | "collapsed": true, 159 | "codeCollapsed": false 160 | }, 161 | "outputs": [], 162 | "source": "query_selection = st.text_input(label=\"Enter the query text you want to look up\")\nst.write(\"**You Entered:** `\" + query_selection + \"`\")", 163 | "execution_count": null 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "b2368c7e-7325-4752-a2fb-ff4d6601123b", 168 | "metadata": { 169 | "name": "md_user_list", 170 | "collapsed": false 171 | }, 172 | "source": "Sweet! Now we get a list of all the users who might have run this query, along with their total credit\nconsumption and query execution time over the last 90 days." 
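The Jinja-templated SQL cell that follows interpolates `query_selection` directly into the query text. An alternative is to run the same lookup from a Python cell with a bound parameter — a sketch only, assuming a `snowflake-snowpark-python` version that supports the `params` argument to `session.sql`:

```python
# Same per-user credit/timing lookup, with the search text bound as a parameter.
from snowflake.snowpark.context import get_active_session

session = get_active_session()
users_df = session.sql(
    """
    SELECT user_name,
           SUM(credits_used_cloud_services) AS total_credits,
           AVG(total_elapsed_time) / 1000   AS avg_execution_time_sec
    FROM snowflake.account_usage.query_history
    WHERE start_time >= DATEADD(day, -90, CURRENT_TIMESTAMP())
      AND query_text LIKE '%' || ? || '%'
    GROUP BY user_name
    ORDER BY total_credits DESC
    """,
    params=[query_selection],  # from the st.text_input cell above
).to_pandas()
users_df.head()
```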
173 | }, 174 | { 175 | "cell_type": "code", 176 | "id": "506d54d9-1a00-46df-9307-dcce94ce8fb9", 177 | "metadata": { 178 | "language": "sql", 179 | "name": "py_user_list", 180 | "collapsed": true, 181 | "codeCollapsed": false 182 | }, 183 | "outputs": [], 184 | "source": "SELECT \n USER_NAME, \n SUM(CREDITS_USED_CLOUD_SERVICES) as total_credits, \n MAX(TOTAL_ELAPSED_TIME) as MAX_elapsed_time,\n AVG(TOTAL_ELAPSED_TIME)/1000 as AVG_EXECUTION_TIME_SEC\nFROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY q\nWHERE START_TIME >= CURRENT_DATE - interval '90 days'\n AND query_text LIKE '%{{query_selection}}%'\nGROUP BY ALL\nORDER BY total_credits DESC", 185 | "execution_count": null 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "f6e54924-57e2-4dfb-8bf1-bad9b7fb635d", 190 | "metadata": { 191 | "name": "md_resources", 192 | "collapsed": false 193 | }, 194 | "source": "## Want to learn more?\n\n- Snowflake Docs on [Account Usage](https://docs.snowflake.com/en/sql-reference/account-usage) and [QUERY_HISTORY view](https://docs.snowflake.com/en/sql-reference/account-usage/query_history)\n\n- More about [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake)\n\n- For more inspiration on how to use Streamlit widgets in Notebooks, check out [Streamlit Docs](https://docs.streamlit.io/) and this list of what is currently supported inside [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake#label-notebooks-streamlit-support)" 195 | } 196 | ] 197 | } 198 | -------------------------------------------------------------------------------- /Monitoring_Table_Size_with_Streamlit/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - pandas=* 6 | -------------------------------------------------------------------------------- /My First Notebook Project/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - matplotlib=3.7.2 6 | - scipy=1.10.1 7 | -------------------------------------------------------------------------------- /Navigating and Browsing Files/Navigating and Browsing Files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "9e16b61a-d9a5-475e-9c72-1c5fb53540df", 14 | "metadata": { 15 | "name": "cell1", 16 | "collapsed": false 17 | }, 18 | "source": "# Navigating and Browsing Files in Snowflake Notebooks\n\nIn addition to files created within your notebook, you may need to interact with files from your local machine or a linked Git repository. These files can include code, data, media files, and more.\n\nYou'll find a **Files** tab located on the left-hand panel. This tab provides a list of files available for reference within the notebook. 
By default, every notebook is associated with two files: \n\n- Main Notebook File: Named \"notebook_app.ipynb\" by default, unless sourced from Git or uploaded from another .ipynb file with a different name.\n- `environment.yml`: An autogenerated file used for configuring the notebook environment, including required packages.\n\nTo inspect a file's contents, simply click on the file. A pop-up window will display a preview. Note that files previews are read-only. \n\n## Adding Files from Your Local Computer\n\nYou can upload files directly from your local machine to use within your Snowflake notebook. Simply click the `+` button in the Files pane and upload the selected files. Uploaded files will be stored in the Notebook's internal stage and persisted across sessions.\n\nYou can find the list of files that we are working with in this tutorial [here](https://github.com/Snowflake-Labs/snowflake-demo-notebooks/tree/main/Navigating%20and%20Browsing%20Files). Download all of the following files to your local machine.\n- `data.csv`\n- `data.json`\n- `display.py`\n- `stats.py`\n\nClick on `+` to upload files: \n\n![](https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/main/Navigating%20and%20Browsing%20Files/img/upload_files.png)\n\nOnce they are uploaded, you should see them on the File pane.\n\n![](https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/main/Navigating%20and%20Browsing%20Files/img/browse_files.png)\n\n\nNote: If your notebook session is active when you upload a file, you'll need to restart the session for the file to become accessible due to a known bug." 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "36eca65b-fa05-4ede-bc15-a3d9a93ed530", 23 | "metadata": { 24 | "name": "cell2", 25 | "collapsed": false 26 | }, 27 | "source": "## Example 1: Working with Data Files\n\nOnce uploaded, you can reference the file in your notebook as if it were colocated with your notebook. For instance, to load a CSV file named `data.csv` into a Pandas DataFrame:" 28 | }, 29 | { 30 | "cell_type": "code", 31 | "id": "2537412c-5ec5-4b1d-898f-29fd4ce7a14a", 32 | "metadata": { 33 | "language": "python", 34 | "name": "cell3", 35 | "codeCollapsed": false 36 | }, 37 | "outputs": [], 38 | "source": "import pandas as pd\ndf = pd.read_csv(\"data.csv\")\ndf", 39 | "execution_count": null 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "b59c60b8-3ebb-4fe6-9663-52c410db4802", 44 | "metadata": { 45 | "name": "cell4", 46 | "collapsed": false 47 | }, 48 | "source": "You can do the same with loading `data.json`: " 49 | }, 50 | { 51 | "cell_type": "code", 52 | "id": "15974c58-8347-41cf-af87-689bd511e759", 53 | "metadata": { 54 | "language": "python", 55 | "name": "cell5", 56 | "codeCollapsed": false 57 | }, 58 | "outputs": [], 59 | "source": "df = pd.read_json(\"data.json\",lines=True)\ndf", 60 | "execution_count": null 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "ce12b87d-1c18-4240-b744-bb4cbb4e3866", 65 | "metadata": { 66 | "name": "cell6", 67 | "collapsed": false 68 | }, 69 | "source": "## Example 2: Working with Code Files\n\nTo improve the readability of your notebook, you can organize your code into modules that you can import and use in your notebooks. This is often useful if you have helper functions that span many lines of code or files. 
Here are the two helper files that have been loaded: \n- `stats.py` : Helper functions for generating random numbers and computing statistics\n- `display.py`: Helper function for generating data report display using Streamlit\n\nYou can click on the two files from the **File** pane to browse the code.\n\nNow let's take a look at how we can import and use the functions in each module." 70 | }, 71 | { 72 | "cell_type": "code", 73 | "id": "f3d0b6e4-9b74-45a0-9138-50433d7c9b05", 74 | "metadata": { 75 | "language": "python", 76 | "name": "cell7", 77 | "codeCollapsed": false 78 | }, 79 | "outputs": [], 80 | "source": "# Import from stats.py\nfrom stats import generate_random_list, median_absolute_deviation", 81 | "execution_count": null 82 | }, 83 | { 84 | "cell_type": "code", 85 | "id": "46f0e49d-a8ed-4065-95c2-81a6284bb40e", 86 | "metadata": { 87 | "language": "python", 88 | "name": "cell8", 89 | "codeCollapsed": false 90 | }, 91 | "outputs": [], 92 | "source": "# Generate a list of 5 random numbers between 0 and 100\nrandom_numbers = generate_random_list(5, 0, 100)\nprint(random_numbers)", 93 | "execution_count": null 94 | }, 95 | { 96 | "cell_type": "code", 97 | "id": "d9ebfd72-17cc-401f-ab98-68be4fb6493c", 98 | "metadata": { 99 | "language": "python", 100 | "name": "cell9", 101 | "codeCollapsed": false 102 | }, 103 | "outputs": [], 104 | "source": "# Compute the median absolute deviation of the list\nmad_val = median_absolute_deviation(random_numbers)\nprint(mad_val)", 105 | "execution_count": null 106 | }, 107 | { 108 | "cell_type": "code", 109 | "id": "dcae54cf-78b7-4415-a934-c55ccc1570ac", 110 | "metadata": { 111 | "language": "python", 112 | "name": "cell10", 113 | "codeCollapsed": false 114 | }, 115 | "outputs": [], 116 | "source": "# Import from display.py\nfrom display import print_report", 117 | "execution_count": null 118 | }, 119 | { 120 | "cell_type": "code", 121 | "id": "59635d1d-0540-43e6-94ee-588fe88b5305", 122 | "metadata": { 123 | "language": "python", 124 | "name": "cell11", 125 | "codeCollapsed": false 126 | }, 127 | "outputs": [], 128 | "source": "# Generate data report with visualizations using Streamlit\nprint_report(\"My Data Report\", random_numbers, mad_val)", 129 | "execution_count": null 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "0e4c8b48-9933-4193-b97c-4acb107780a6", 134 | "metadata": { 135 | "name": "cell12", 136 | "collapsed": false 137 | }, 138 | "source": "## Referencing Files with Stage Path\nEach uploaded file has a full stage path associated with it, which you can find by clicking on the `...` button on each file and selecting `Copy path`.\n\nThe stage path represents the file's location within your notebook environment. For example:\n\n```\nsnow://notebook/../versions/live/data.csv\n```\n\nYou can run SQL using the full stage path." 
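The same listing can also be issued from a Python cell — a sketch equivalent to the SQL `LIST` cell that follows; the notebook object name (`GH_ACTION_MULTIFILE_NB`) is taken from that cell and must match your own notebook:

```python
# Python equivalent of the SQL LIST cell below.
from snowflake.snowpark.context import get_active_session

session = get_active_session()
database = session.get_current_database()
schema = session.get_current_schema()

stage_path = f'snow://notebook/{database}.{schema}."GH_ACTION_MULTIFILE_NB"/versions/live/data.csv'
for row in session.sql(f"LIST '{stage_path}'").collect():
    print(row)
```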
139 | }, 140 | { 141 | "cell_type": "code", 142 | "id": "14320dad-f9a7-4ff8-8248-98ed1083127a", 143 | "metadata": { 144 | "language": "python", 145 | "name": "cell13", 146 | "codeCollapsed": false 147 | }, 148 | "outputs": [], 149 | "source": "from snowflake.snowpark.context import get_active_session\nsession = get_active_session()\ndatabase = session.get_current_database()\nschema = session.get_current_schema()", 150 | "execution_count": null 151 | }, 152 | { 153 | "cell_type": "code", 154 | "id": "8e98d14b-2804-42e1-aab9-f54d9c0bb086", 155 | "metadata": { 156 | "language": "sql", 157 | "name": "cell14", 158 | "codeCollapsed": false 159 | }, 160 | "outputs": [], 161 | "source": "LIST 'snow://notebook/{{database}}.{{schema}}.\"GH_ACTION_MULTIFILE_NB\"/versions/live/data.csv'", 162 | "execution_count": null 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "id": "47581354-621c-4724-bb1c-797c22affac5", 167 | "metadata": { 168 | "name": "cell15", 169 | "collapsed": false 170 | }, 171 | "source": "## Working with Files from Git\n\nIf your Notebook is connected to Git, then all the files in the same folder as your notebook will be displayed on the Files Tab. \n\n\n![](https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/main/Navigating%20and%20Browsing%20Files/img/git_files.png)\n\n\nIn addition, any addition or removal of files associated with the notebook will be version controlled through Git. For example, if you removed `data.json` by clicking on `...`>`Remove`. You will see the files that are modified in the `Commit` dialog which shows the Git diff. \n\n![](https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/main/Navigating%20and%20Browsing%20Files/img/git_diff.png)\n\nYou can learn more about how you can set up Git integration with Notebooks [here](https://docs.snowflake.com/en/developer-guide/git/git-overview)." 
172 | } 173 | ] 174 | } -------------------------------------------------------------------------------- /Navigating and Browsing Files/data.csv: -------------------------------------------------------------------------------- 1 | fruit,size,weight 2 | apple,3.4,1.4 3 | orange,5.4,3.2 -------------------------------------------------------------------------------- /Navigating and Browsing Files/data.json: -------------------------------------------------------------------------------- 1 | {"fruit":"apple", "size":3.4, "weight":1.4},{"fruit":"orange", "size":5.4, "weight":3.2} -------------------------------------------------------------------------------- /Navigating and Browsing Files/display.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import altair as alt 3 | import streamlit as st 4 | 5 | def print_report(title, data, value): 6 | st.title(title) 7 | df = pd.DataFrame({'x': range(len(data)), 'y': data}) 8 | bars = alt.Chart(df).mark_bar().encode( 9 | x='x:O', 10 | y='y:Q' 11 | ) 12 | line = pd.DataFrame({'y': [value]}) 13 | median_line = alt.Chart(line).mark_rule(color='red', strokeDash=[3, 3], strokeWidth=3).encode( 14 | y='y:Q' 15 | ) 16 | chart = (bars + median_line).properties( 17 | width=400, 18 | height=400 19 | ) 20 | st.altair_chart(chart) -------------------------------------------------------------------------------- /Navigating and Browsing Files/img/browse_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Navigating and Browsing Files/img/browse_files.png -------------------------------------------------------------------------------- /Navigating and Browsing Files/img/git_diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Navigating and Browsing Files/img/git_diff.png -------------------------------------------------------------------------------- /Navigating and Browsing Files/img/git_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Navigating and Browsing Files/img/git_files.png -------------------------------------------------------------------------------- /Navigating and Browsing Files/img/upload_files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Navigating and Browsing Files/img/upload_files.png -------------------------------------------------------------------------------- /Navigating and Browsing Files/stats.py: -------------------------------------------------------------------------------- 1 | def generate_random_list(length, min_val=0, max_val=100): 2 | """ 3 | Generates a random list of integers. 4 | 5 | Args: 6 | length (int): The length of the list. 7 | min_val (int): The minimum value of the integers (default is 0). 8 | max_val (int): The maximum value of the integers (default is 100). 9 | 10 | Returns: 11 | list: A list of random integers. 
12 | """ 13 | import random 14 | random_list = [random.randint(min_val, max_val) for _ in range(length)] 15 | return random_list 16 | 17 | def median_absolute_deviation(numbers): 18 | """ 19 | Calculates the median absolute deviation (MAD) of a list of numbers. 20 | 21 | Args: 22 | numbers (list): A list of numerical values. 23 | 24 | Returns: 25 | float: The median absolute deviation (MAD). 26 | """ 27 | if not numbers: 28 | return None 29 | 30 | # Calculate median 31 | sorted_numbers = sorted(numbers) 32 | n = len(sorted_numbers) 33 | if n % 2 == 0: 34 | median = (sorted_numbers[n//2 - 1] + sorted_numbers[n//2]) / 2 35 | else: 36 | median = sorted_numbers[n//2] 37 | 38 | # Calculate absolute deviations from median 39 | absolute_deviations = [abs(x - median) for x in numbers] 40 | 41 | # Calculate MAD 42 | mad = sorted(absolute_deviations)[len(absolute_deviations) // 2] 43 | 44 | return mad -------------------------------------------------------------------------------- /Query_Caching_Effectiveness/Query_Caching_Effectiveness.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "cc4fb15e-f9db-44eb-9f60-1b9589b755cb", 14 | "metadata": { 15 | "name": "md_title", 16 | "collapsed": false, 17 | "resultHeight": 311 18 | }, 19 | "source": "# Query Caching Effectiveness Report\n\nThis utility notebook analyzes the query cache hit rates. This is to ensure that caching is being used effectively and to reduce unnecessary compute costs.\n\nHere's our 4 step process:\n1. SQL query to retrieve data\n2. Convert SQL table to a Pandas DataFrame\n3. Data preparation and filtering (using user input from Streamlit widgets)\n4. Data visualization and exploration" 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "42a7b143-0779-4706-affc-c214213f55c5", 24 | "metadata": { 25 | "name": "md_retrieve_data", 26 | "collapsed": false, 27 | "resultHeight": 220 28 | }, 29 | "source": "## 1. Retrieve Data\n\nThe following query filters for queries that actually scanned data, groups results by `WAREHOUSE_NAME`, and orders them by *percentage of data scanned from cache* (`percent_scanned_from_cache`). 
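Note that the `daily_cache_hit_ratio` it computes is a bytes-weighted average rather than a simple mean of per-query percentages, so one large cold scan can outweigh many fully cached small ones. A quick numeric sketch (illustrative byte counts, treating the percentage column as a 0–1 fraction, which the `.2%` tooltip formatting later in this notebook also assumes):

```python
# Mirrors SUM(bytes_scanned * percentage_scanned_from_cache) / SUM(bytes_scanned)
scans = [
    (100 * 1024**3, 0.10),  # 100 GB scanned, 10% from cache (illustrative)
    (1 * 1024**3, 1.00),    #   1 GB scanned, fully from cache (illustrative)
]
ratio = sum(b * p for b, p in scans) / sum(b for b, _ in scans)
print(round(ratio, 3))  # ~0.109 -- far from the 0.55 a naive average would suggest
```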
\n\nThis helps to identify which warehouses are making the most effective use of caching.\n" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "id": "d549f7ac-bbbd-41f4-9ee3-98284e587de1", 34 | "metadata": { 35 | "language": "sql", 36 | "name": "sql_query_caching", 37 | "resultHeight": 439, 38 | "codeCollapsed": false, 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": "SELECT \n warehouse_name,\n DATE_TRUNC('day', start_time) AS query_date,\n COUNT(DISTINCT query_parameterized_hash) AS query_parameterized_hash_count,\n COUNT(*) AS daily_executions,\n AVG(total_elapsed_time)/1000 AS avg_execution_time,\n SUM(total_elapsed_time)/1000 AS total_execution_time,\n SUM(CASE WHEN bytes_scanned > 0 THEN bytes_scanned ELSE 0 END) AS daily_bytes_scanned,\n SUM(bytes_scanned * percentage_scanned_from_cache) / NULLIF(SUM(CASE WHEN bytes_scanned > 0 THEN bytes_scanned ELSE 0 END), 0) AS daily_cache_hit_ratio,\n MAX_BY(query_text, start_time) AS latest_query_text,\n MAX_BY(user_name, start_time) AS latest_user_name\nFROM snowflake.account_usage.query_history qh\nWHERE start_time >= dateadd(day, -30, current_timestamp())\nGROUP BY 1, 2\nHAVING daily_bytes_scanned > 0\nORDER BY \n query_date DESC,\n daily_cache_hit_ratio DESC,\n daily_bytes_scanned DESC", 43 | "execution_count": null 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "870b69dd-aae0-4dd3-93f7-7adce1268159", 48 | "metadata": { 49 | "name": "md_dataframe", 50 | "collapsed": false, 51 | "resultHeight": 102 52 | }, 53 | "source": "## 2. Convert Table to a DataFrame\n\nNext, we'll convert the tables to a Pandas DataFrame.\n" 54 | }, 55 | { 56 | "cell_type": "code", 57 | "id": "4a5559a8-ef3a-40c3-a9d5-54602403adab", 58 | "metadata": { 59 | "language": "python", 60 | "name": "py_query_caching", 61 | "codeCollapsed": false, 62 | "resultHeight": 439, 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": "sql_query_caching.to_pandas()", 67 | "execution_count": null 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "e618ffe5-481f-4105-bc3f-f5e903b45e34", 72 | "metadata": { 73 | "name": "md_data_preparation", 74 | "collapsed": false, 75 | "resultHeight": 102 76 | }, 77 | "source": "## Data Preparation\n\nHere, we'll do some data preparation prior to visualization." 78 | }, 79 | { 80 | "cell_type": "code", 81 | "id": "a3f93f11-dd74-42f2-bd05-410bb66931a2", 82 | "metadata": { 83 | "language": "python", 84 | "name": "py_data_preparation", 85 | "resultHeight": 439, 86 | "collapsed": false, 87 | "codeCollapsed": false 88 | }, 89 | "outputs": [], 90 | "source": "df = py_query_caching.copy()\n\n# Convert QUERY_DATE to datetime\ndf['QUERY_DATE'] = pd.to_datetime(df['QUERY_DATE'])\n\n# Create WEEK_NUMBER column\ndf['WEEK_NUMBER'] = df['QUERY_DATE'].dt.isocalendar().week\n\n# Create MONTH_YEAR column\ndf['MONTH_YEAR'] = df['QUERY_DATE'].dt.strftime('%b %Y')\n\n# Group by\ngrouped_df = df.groupby('WAREHOUSE_NAME').agg({\n 'QUERY_PARAMETERIZED_HASH_COUNT': 'count',\n 'DAILY_EXECUTIONS': 'sum',\n 'AVG_EXECUTION_TIME': 'mean',\n 'TOTAL_EXECUTION_TIME': 'sum',\n 'DAILY_BYTES_SCANNED': 'sum',\n 'DAILY_CACHE_HIT_RATIO': 'mean'\n}).reset_index()\n\ngrouped_df", 91 | "execution_count": null 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "59b04137-ca95-4fb8-b216-133272349a78", 96 | "metadata": { 97 | "name": "md_bar_chart", 98 | "collapsed": false, 99 | "resultHeight": 201 100 | }, 101 | "source": "## 3. 
Visualize Bar Chart\n\nHere, we'll visualize the data via a bar chart for the columns:\n- Query count\n- Bytes scanned\n- Percent of bytes scanned\n" 102 | }, 103 | { 104 | "cell_type": "code", 105 | "id": "3b382b54-fd8a-49f5-8bc9-72ca420608ff", 106 | "metadata": { 107 | "language": "python", 108 | "name": "py_bar_chart", 109 | "resultHeight": 623, 110 | "codeCollapsed": false 111 | }, 112 | "outputs": [], 113 | "source": "import altair as alt\nimport pandas as pd\n\n# Create bar chart\nchart = alt.Chart(grouped_df).mark_bar().encode(\n y=alt.Y('WAREHOUSE_NAME:N', \n title='',\n axis=alt.Axis(\n labels=True,\n labelLimit=250,\n tickMinStep=1,\n labelOverlap=False,\n labelPadding=10\n ),\n sort='-x'),\n x=alt.X('DAILY_CACHE_HIT_RATIO:Q', \n title='Cache Hit Ratio'),\n color=alt.Color('WAREHOUSE_NAME:N', legend=None),\n tooltip=[\n alt.Tooltip('WAREHOUSE_NAME', title='Warehouse'),\n alt.Tooltip('DAILY_CACHE_HIT_RATIO', title='Cache Hit Ratio'),\n alt.Tooltip('DAILY_EXECUTIONS', title='Daily Executions'),\n alt.Tooltip('AVG_EXECUTION_TIME', title='Avg Execution Time (ms)')\n ]\n).properties(\n width=400,\n height=600,\n title='Cache Hit Ratio by Warehouse'\n).configure_axis(\n labelFontSize=12,\n titleFontSize=14\n).configure_title(\n fontSize=16,\n anchor='middle'\n)\n\n# Display the chart\nst.altair_chart(chart, use_container_width=True)", 114 | "execution_count": null 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "3c995961-473b-42be-b824-9c5dcb8ef041", 119 | "metadata": { 120 | "name": "md_heatmap", 121 | "collapsed": false, 122 | "resultHeight": 201 123 | }, 124 | "source": "## 4. Visualize as Heatmap\n\nHere, we'll visualize the data via a heatmap for the columns:\n- Query count\n- Bytes scanned\n- Percent of bytes scanned\n" 125 | }, 126 | { 127 | "cell_type": "code", 128 | "id": "02b09580-6a70-4769-a8b1-68fda0dc72bf", 129 | "metadata": { 130 | "language": "python", 131 | "name": "py_heatmap", 132 | "resultHeight": 623, 133 | "codeCollapsed": false, 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": "import pandas as pd\nimport altair as alt\n\n# Convert QUERY_DATE to datetime if it isn't already\ndf['QUERY_DATE'] = pd.to_datetime(df['QUERY_DATE'])\n\n# Format date as string for display\ndf['DATE'] = df['QUERY_DATE'].dt.strftime('%Y-%m-%d')\n\n# Aggregate data by date and warehouse\nagg_df = df.groupby(['DATE', 'WAREHOUSE_NAME'])['DAILY_CACHE_HIT_RATIO'].sum().reset_index()\n\n# Create the heatmap\nheatmap = alt.Chart(agg_df).mark_rect(stroke='black', strokeWidth=1).encode(\n x=alt.X('DATE:O',\n title='Date',\n axis=alt.Axis(\n labelAngle=90,\n labelOverlap=False,\n tickCount=10\n )),\n y=alt.Y('WAREHOUSE_NAME:N',\n title='',\n axis=alt.Axis(\n labels=True,\n labelLimit=250,\n tickMinStep=1,\n labelOverlap=False,\n labelPadding=10\n )),\n color=alt.Color('DAILY_CACHE_HIT_RATIO:Q',\n title='Cache Hit Ratio',\n scale=alt.Scale(scheme='blues')),\n tooltip=['DATE', 'WAREHOUSE_NAME', \n alt.Tooltip('DAILY_CACHE_HIT_RATIO:Q', format='.2%')]\n).properties(\n title=f'Daily Warehouse Cache Hit Ratio Heatmap',\n width=500,\n height=600\n)\n\n# Add configuration to make the chart more interactive\nheatmap = heatmap.configure_axis(\n grid=False\n).configure_view(\n strokeWidth=0\n)\n\n# Display or save the chart\nst.altair_chart(heatmap, use_container_width=True)", 138 | "execution_count": null 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "id": "b9e3e4da-4674-46aa-9e91-ed8697bfef5b", 143 | "metadata": { 144 | "name": "md_pro_tip", 145 | "collapsed": false, 
146 | "resultHeight": 134 147 | }, 148 | "source": "💡 Pro tip:\n\nWhen you see a low cache scan percentage for queries that repeatedly access the same data, you can significantly improve its performance by optimizing the cache usage. This is especially true for reports or dashboards that run similar queries throughout the day." 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "id": "eb3e9b67-6a6e-4218-b17a-3f8564a04d18", 153 | "metadata": { 154 | "name": "md_resources", 155 | "collapsed": false, 156 | "resultHeight": 268 157 | }, 158 | "source": "## Want to learn more?\n\n- Snowflake Docs on [Account Usage](https://docs.snowflake.com/en/sql-reference/account-usage) and [QUERY_HISTORY view](https://docs.snowflake.com/en/sql-reference/account-usage/query_history)\n- More about [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake)\n- For more inspiration on how to use Streamlit widgets in Notebooks, check out [Streamlit Docs](https://docs.streamlit.io/) and this list of what is currently supported inside [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake#label-notebooks-streamlit-support)\n- Check out the [Altair User Guide](https://altair-viz.github.io/user_guide/data.html) for further information on customizing Altair charts\n" 159 | } 160 | ] 161 | } -------------------------------------------------------------------------------- /Query_Caching_Effectiveness/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - altair=* 6 | - pandas=* 7 | -------------------------------------------------------------------------------- /Query_Cost_Monitoring/Query_Cost_Monitoring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "cc4fb15e-f9db-44eb-9f60-1b9589b755cb", 14 | "metadata": { 15 | "name": "md_title", 16 | "collapsed": false, 17 | "resultHeight": 336 18 | }, 19 | "source": "# Query Cost Monitoring\n\nA notebook that breaks down compute costs by individual query, allowing teams to identify high-cost operations.\n\nHere's our 4 step process:\n1. SQL query to retrieve query cost data\n2. Convert SQL table to a Pandas DataFrame\n3. Data preparation and filtering (using user input from Streamlit widgets)\n4. Data visualization and exploration" 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "42a7b143-0779-4706-affc-c214213f55c5", 24 | "metadata": { 25 | "name": "md_retrieve_data", 26 | "collapsed": false, 27 | "resultHeight": 231 28 | }, 29 | "source": "## 1. 
Retrieve Data\n\nTo gain insights on query costs, we'll write a SQL query to retrieve the `credits_used` data from the `snowflake.account_usage.metering_history` table and merging this with associated user, database, schema and warehouse information from the `snowflake.account_usage.query_history` table.\n" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "id": "d549f7ac-bbbd-41f4-9ee3-98284e587de1", 34 | "metadata": { 35 | "language": "sql", 36 | "name": "sql_data", 37 | "resultHeight": 511, 38 | "codeCollapsed": false, 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": "SELECT\n query_history.query_id,\n query_history.query_text,\n query_history.start_time,\n query_history.end_time,\n query_history.user_name,\n query_history.database_name,\n query_history.schema_name,\n query_history.warehouse_name,\n query_history.warehouse_size,\n metering_history.credits_used,\n execution_time/1000 as execution_time_s,\nFROM\n snowflake.account_usage.query_history\n JOIN snowflake.account_usage.metering_history ON query_history.start_time >= metering_history.start_time\n AND query_history.end_time <= metering_history.end_time\nWHERE\n query_history.start_time >= DATEADD (DAY, -7, CURRENT_TIMESTAMP())\nORDER BY\n query_history.query_id;", 43 | "execution_count": null 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "870b69dd-aae0-4dd3-93f7-7adce1268159", 48 | "metadata": { 49 | "name": "md_dataframe", 50 | "collapsed": false, 51 | "resultHeight": 102 52 | }, 53 | "source": "## 2. Convert Table to a DataFrame\n\nNext, we'll convert the table to a Pandas DataFrame.\n" 54 | }, 55 | { 56 | "cell_type": "code", 57 | "id": "4a5559a8-ef3a-40c3-a9d5-54602403adab", 58 | "metadata": { 59 | "language": "python", 60 | "name": "py_dataframe", 61 | "codeCollapsed": false, 62 | "resultHeight": 511, 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": "sql_data.to_pandas()", 67 | "execution_count": null 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "59b04137-ca95-4fb8-b216-133272349a78", 72 | "metadata": { 73 | "name": "md_data_preparation", 74 | "collapsed": false, 75 | "resultHeight": 195 76 | }, 77 | "source": "## 3. Create an Interactive Slider Widget & Data Preparation\n\nHere, we'll create an interactive slider for dynamically selecting the number of days to analyze. 
This would then trigger the filtering of the DataFrame to the specified number of days.\n\nNext, we'll reshape the data by calculating the frequency count by hour and task name, which will subsequently be used for creating the heatmap in the next step.\n" 78 | }, 79 | { 80 | "cell_type": "code", 81 | "id": "aeff0dbb-5a3d-4c15-bcc6-f19e5f2398ac", 82 | "metadata": { 83 | "language": "python", 84 | "name": "cell9", 85 | "resultHeight": 1246, 86 | "codeCollapsed": false, 87 | "collapsed": false 88 | }, 89 | "outputs": [], 90 | "source": "import pandas as pd\nimport streamlit as st\nimport altair as alt\n\n# Get data\ndf = py_dataframe.copy()\n\n# Create date filter slider\nst.subheader(\"Select time duration\")\n\ncol = st.columns(3)\n\nwith col[0]:\n days = st.slider('Select number of days to analyze', \n min_value=1, \n max_value=7, \n value=7, \n step=1)\nwith col[1]:\n var = st.selectbox(\"Select a variable\", ['WAREHOUSE_NAME', 'USER_NAME', 'WAREHOUSE_SIZE'])\nwith col[2]:\n metric = st.selectbox(\"Select a metric\", [\"COUNT\", \"TOTAL_CREDITS_USED\"])\n\n# Filter data according to day duration\ndf['START_TIME'] = pd.to_datetime(df['START_TIME'])\nlatest_date = df['START_TIME'].max()\ncutoff_date = latest_date - pd.Timedelta(days=days)\nfiltered_df = df[df['START_TIME'] > cutoff_date].copy()\n \n# Prepare data for heatmap\nfiltered_df['HOUR_OF_DAY'] = filtered_df['START_TIME'].dt.hour\nfiltered_df['HOUR_DISPLAY'] = filtered_df['HOUR_OF_DAY'].apply(lambda x: f\"{x:02d}:00\")\n \n# Calculate frequency count by hour and query\n#agg_df = filtered_df.groupby(['QUERY_ID', 'HOUR_DISPLAY', var]).size().reset_index(name='COUNT')\n\n# Calculate frequency count and sum of credits by hour and query\nagg_df = (filtered_df.groupby(['QUERY_ID', 'HOUR_DISPLAY', var])\n .agg(\n COUNT=('QUERY_ID', 'size'),\n TOTAL_CREDITS_USED=('CREDITS_USED', 'sum')\n )\n .reset_index()\n)\n\nst.warning(f\"Analyzing {var} data for the last {days} days!\")\n\n\n\n## Initialize the button state in session state\nif 'expanded_btn' not in st.session_state:\n st.session_state.expanded_btn = False\n\n## Callback function to toggle the state\ndef toggle_expand():\n st.session_state.expanded_btn = not st.session_state.expanded_btn\n\n## Create button with callback\nst.button(\n '⊕ Expand DataFrames' if not st.session_state.expanded_btn else '⊖ Collapse DataFrames',\n on_click=toggle_expand,\n type='secondary' if st.session_state.expanded_btn else 'primary'\n)\n\n## State conditional\nif st.session_state.expanded_btn:\n expand_value = True\nelse:\n expand_value = False\n\nwith st.expander(\"See Filtered DataFrame\", expanded=expand_value):\n st.dataframe(filtered_df.head(100))\nwith st.expander(\"See Heatmap DataFrame\", expanded=expand_value):\n st.dataframe(agg_df)\n", 91 | "execution_count": null 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "35f31e4e-95d5-4ee5-a146-b9e93dd9d570", 96 | "metadata": { 97 | "name": "md_heatmap", 98 | "collapsed": false, 99 | "resultHeight": 102 100 | }, 101 | "source": "## 4. Create a Heatmap for Visualizing Query Cost\n\nFinally, a heatmap, and stacked bar chart, and bubble chart are generated that will allow us to gain insights on query cost and frequency." 
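As an optional aside before the charts: the aggregated `agg_df` produced above can also be written back to a table so a scheduled task or dashboard can reuse it — a sketch only, with a hypothetical target table name:

```python
# Persist the hourly aggregation for downstream reuse (table name is hypothetical).
from snowflake.snowpark.context import get_active_session

session = get_active_session()
session.create_dataframe(agg_df).write.mode("overwrite").save_as_table(
    "QUERY_COST_HOURLY_AGG"
)
```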
102 | }, 103 | { 104 | "cell_type": "code", 105 | "id": "414edc5e-3597-478e-aac7-f787f68bb3b1", 106 | "metadata": { 107 | "language": "python", 108 | "name": "py_heatmap", 109 | "collapsed": false, 110 | "resultHeight": 366, 111 | "codeCollapsed": false 112 | }, 113 | "outputs": [], 114 | "source": "## Heatmap\nheatmap = alt.Chart(agg_df).mark_rect(stroke='black',strokeWidth=1).encode(\n x='HOUR_DISPLAY:O',\n #y='WAREHOUSE_NAME:N',\n y=alt.Y(f'{var}:N', \n title='',\n axis=alt.Axis(\n labels=True,\n labelLimit=250,\n tickMinStep=1,\n labelOverlap=False,\n labelPadding=10\n )),\n color=f'{metric}:Q',\n tooltip=['HOUR_DISPLAY', var, metric]\n).properties(\n title=f'Query Activity Heatmap by Hour and {var}'\n)\n\nst.altair_chart(heatmap, use_container_width=True)", 115 | "execution_count": null 116 | }, 117 | { 118 | "cell_type": "code", 119 | "id": "84ed25f3-03ef-495a-a12d-247970a29f4a", 120 | "metadata": { 121 | "language": "python", 122 | "name": "py_stacked_bar_chart", 123 | "codeCollapsed": false, 124 | "collapsed": false, 125 | "resultHeight": 423 126 | }, 127 | "outputs": [], 128 | "source": "## Stacked bar chart with time series\nbar_time = alt.Chart(agg_df).mark_bar().encode(\n x='HOUR_DISPLAY:O',\n y=f'{metric}:Q',\n color=alt.Color(f'{var}:N', legend=alt.Legend(orient='bottom')),\n tooltip=['HOUR_DISPLAY', var, metric]\n).properties(\n title=f'Query Activity by Hour and {var}',\n height=400\n)\n\nst.altair_chart(bar_time, use_container_width=True)\n", 129 | "execution_count": null 130 | }, 131 | { 132 | "cell_type": "code", 133 | "id": "0774909e-3ab5-48e4-92ea-c433488e96b7", 134 | "metadata": { 135 | "language": "python", 136 | "name": "py_bubble_plot", 137 | "collapsed": false, 138 | "resultHeight": 573, 139 | "codeCollapsed": false 140 | }, 141 | "outputs": [], 142 | "source": "## Bubble plot with size representing the metric\nbubble = alt.Chart(agg_df).mark_circle().encode(\n x='HOUR_DISPLAY:O',\n y=alt.Y(f'{var}:N', title=''),\n size=alt.Size(f'{metric}:Q', legend=alt.Legend(title='Query Count')),\n color=alt.Color(f'{var}:N', legend=None),\n tooltip=['HOUR_DISPLAY', var, metric]\n).properties(\n title=f'Query Distribution by Hour and {var}',\n height=550\n)\n\nst.altair_chart(bubble, use_container_width=True)", 143 | "execution_count": null 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "eb3e9b67-6a6e-4218-b17a-3f8564a04d18", 148 | "metadata": { 149 | "name": "md_resources", 150 | "collapsed": false, 151 | "resultHeight": 217 152 | }, 153 | "source": "## Want to learn more?\n\n- Snowflake Docs on [Account Usage](https://docs.snowflake.com/en/sql-reference/account-usage), [METERING_HISTORY view](https://docs.snowflake.com/en/sql-reference/account-usage/task_history) and [QUERY_HISTORY](https://docs.snowflake.com/en/sql-reference/account-usage/query_history)\n- More about [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake)\n- For more inspiration on how to use Streamlit widgets in Notebooks, check out [Streamlit Docs](https://docs.streamlit.io/) and this list of what is currently supported inside [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake#label-notebooks-streamlit-support)\n- Check out the [Altair User Guide](https://altair-viz.github.io/user_guide/data.html) for further information on customizing Altair charts\n" 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "6c11317d-7fd7-412d-aeae-cd131dd1530d", 158 | "metadata": { 159 | "name": 
"cell1", 160 | "collapsed": false 161 | }, 162 | "source": "" 163 | } 164 | ] 165 | } -------------------------------------------------------------------------------- /Query_Cost_Monitoring/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - altair=* 6 | - pandas=* 7 | -------------------------------------------------------------------------------- /Query_Performance_Insights/Automated_Query_Performance_Insights_in_Snowflake_Notebooks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "d43a3edd-7c40-4a96-a4c6-c46e52b415ed", 14 | "metadata": { 15 | "name": "md_title", 16 | "collapsed": false 17 | }, 18 | "source": "# Automated Query Performance Insights in Snowflake Notebooks\n\nIn this notebook, we'll provide SQL queries that you can use to analyze query history and gain insights on performance and bottlenecks.\n\nThe following 6 queries against the `ACCOUNT_USAGE` schema provide insight into the past performance of queries (examples 1-4), warehouses (example 5), and tasks (example 6)." 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "201438af-5d95-44b5-9582-ac165686ea47", 23 | "metadata": { 24 | "name": "md_1", 25 | "collapsed": false 26 | }, 27 | "source": "## 1. Top n longest-running queries\n\nThis query provides a listing of the top n (50 in the example below) longest-running queries in the last day. You can adjust the `DATEADD` function to focus on a shorter or longer period of time. Replace `STREAMLIT_DEMO_APPS` with the name of a warehouse." 28 | }, 29 | { 30 | "cell_type": "code", 31 | "id": "c695373e-ac74-4b62-a1f1-08206cbd5c81", 32 | "metadata": { 33 | "language": "sql", 34 | "name": "sql_1", 35 | "codeCollapsed": false, 36 | "collapsed": false 37 | }, 38 | "source": "SELECT query_id,\n ROW_NUMBER() OVER(ORDER BY partitions_scanned DESC) AS query_id_int,\n query_text,\n total_elapsed_time/1000 AS query_execution_time_seconds,\n partitions_scanned,\n partitions_total,\nFROM snowflake.account_usage.query_history Q\nWHERE warehouse_name = 'STREAMLIT_DEMO_APPS' AND TO_DATE(Q.start_time) > DATEADD(day,-1,TO_DATE(CURRENT_TIMESTAMP()))\n AND total_elapsed_time > 0 --only get queries that actually used compute\n AND error_code IS NULL\n AND partitions_scanned IS NOT NULL\nORDER BY total_elapsed_time desc\nLIMIT 50;", 39 | "execution_count": null, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "fbb8e757-c732-46d8-a929-e291f6b8fff7", 45 | "metadata": { 46 | "name": "md_2", 47 | "collapsed": false 48 | }, 49 | "source": "## 2. Queries organized by execution time over past month\n\nThis query groups queries for a given warehouse by buckets for execution time over the last month. These trends in query completion time can help inform decisions to resize warehouses or separate out some queries to another warehouse. Replace `STREAMLIT_DEMO_APPS` with the name of a warehouse." 
50 | }, 51 | { 52 | "cell_type": "code", 53 | "id": "07b6ef1f-36d3-4f94-a784-6a348f8214d6", 54 | "metadata": { 55 | "language": "sql", 56 | "name": "sql_2", 57 | "collapsed": false, 58 | "codeCollapsed": false 59 | }, 60 | "outputs": [], 61 | "source": "SELECT\n CASE\n WHEN Q.total_elapsed_time <= 1000 THEN 'Less than 1 second'\n WHEN Q.total_elapsed_time <= 60000 THEN '1 second to 1 minute'\n WHEN Q.total_elapsed_time <= 300000 THEN '1 minute to 5 minutes'\n ELSE 'more than 5 minutes'\n END AS BUCKETS,\n COUNT(query_id) AS number_of_queries\nFROM snowflake.account_usage.query_history Q\nWHERE TO_DATE(Q.START_TIME) > DATEADD(month,-1,TO_DATE(CURRENT_TIMESTAMP()))\n AND total_elapsed_time > 0\n AND warehouse_name = 'STREAMLIT_DEMO_APPS'\nGROUP BY 1;", 62 | "execution_count": null 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "fe72eeaf-21ab-491c-bf7b-9de506419512", 67 | "metadata": { 68 | "name": "md_3", 69 | "collapsed": false 70 | }, 71 | "source": "## 3. Find long running repeated queries\n\nYou can use the query hash (the value of the query_hash column in the ACCOUNT_USAGE QUERY_HISTORY view) to find patterns in query performance that might not be obvious. For example, although a query might not be excessively expensive during any single execution, a frequently repeated query could lead to high costs, based on the number of times the query runs.\n\nYou can use the query hash to identify the queries that you should focus on optimizing first. For example, the following query uses the value in the query_hash column to identify the query IDs for the 100 longest-running queries:" 72 | }, 73 | { 74 | "cell_type": "code", 75 | "id": "b8fe9d0d-3c06-4288-958d-44376364a0ae", 76 | "metadata": { 77 | "language": "sql", 78 | "name": "sql_3", 79 | "collapsed": false, 80 | "codeCollapsed": false 81 | }, 82 | "outputs": [], 83 | "source": "SELECT\n query_hash,\n COUNT(*),\n SUM(total_elapsed_time),\n ANY_VALUE(query_id)\n FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY\n WHERE warehouse_name = 'STREAMLIT_DEMO_APPS'\n AND DATE_TRUNC('day', start_time) >= CURRENT_DATE() - 7\n GROUP BY query_hash\n ORDER BY SUM(total_elapsed_time) DESC\n LIMIT 100;", 84 | "execution_count": null 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "98d2b8b5-ab49-4a15-bac1-fa026d3206aa", 89 | "metadata": { 90 | "name": "md_4", 91 | "collapsed": false 92 | }, 93 | "source": "## 4. Track the average performance of a query over time\n\nThe following statement computes the daily average total elapsed time for all queries that have a specific parameterized query hash (7f5c370a5cddc67060f266b8673a347b)." 94 | }, 95 | { 96 | "cell_type": "code", 97 | "id": "a37b360e-7c7e-4ff8-a81d-93c223498f15", 98 | "metadata": { 99 | "language": "sql", 100 | "name": "sql_4", 101 | "codeCollapsed": false, 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": "SELECT\n DATE_TRUNC('day', start_time),\n SUM(total_elapsed_time),\n ANY_VALUE(query_id)\n FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY\n WHERE query_parameterized_hash = '7f5c370a5cddc67060f266b8673a347b'\n AND DATE_TRUNC('day', start_time) >= CURRENT_DATE() - 30\n GROUP BY DATE_TRUNC('day', start_time);", 106 | "execution_count": null 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "8dce0934-ef0c-4bdb-a28a-25c1286f9789", 111 | "metadata": { 112 | "name": "md_5", 113 | "collapsed": false 114 | }, 115 | "source": "## 5. Total warehouse load\nThis query provides insight into the total load of a warehouse for executed and queued queries. 
These load values represent the ratio of the total execution time (in seconds) of all queries in a specific state in an interval by the total time (in seconds) for that interval.\n\nFor example, if 276 seconds was the total time for 4 queries in a 5 minute (300 second) interval, then the query load value is 276 / 300 = 0.92." 116 | }, 117 | { 118 | "cell_type": "code", 119 | "id": "24486435-31df-457e-9ce4-a55cce2824d1", 120 | "metadata": { 121 | "language": "sql", 122 | "name": "sql_5", 123 | "codeCollapsed": false, 124 | "collapsed": false 125 | }, 126 | "outputs": [], 127 | "source": "SELECT TO_DATE(start_time) AS date,\n warehouse_name,\n SUM(avg_running) AS sum_running,\n SUM(avg_queued_load) AS sum_queued\nFROM snowflake.account_usage.warehouse_load_history\nWHERE TO_DATE(start_time) >= DATEADD(month,-1,CURRENT_TIMESTAMP())\nGROUP BY 1,2\nHAVING SUM(avg_queued_load) >0;", 128 | "execution_count": null 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "e654c671-c5f4-40e2-9cb4-301a028e4b83", 133 | "metadata": { 134 | "name": "md_6", 135 | "collapsed": false 136 | }, 137 | "source": "## 6. Longest running tasks\nThis query lists the longest running tasks in the last day, which can indicate an opportunity to optimize the SQL being executed by the task." 138 | }, 139 | { 140 | "cell_type": "code", 141 | "id": "ff6c5cf8-7a65-460f-b95c-48e2559692b0", 142 | "metadata": { 143 | "language": "sql", 144 | "name": "sql_6", 145 | "codeCollapsed": false, 146 | "collapsed": false 147 | }, 148 | "outputs": [], 149 | "source": "SELECT DATEDIFF(seconds, query_start_time,completed_time) AS duration_seconds,*\nFROM snowflake.account_usage.task_history\nWHERE state = 'SUCCEEDED'\n AND query_start_time >= DATEADD (week, -1, CURRENT_TIMESTAMP())\nORDER BY duration_seconds DESC;", 150 | "execution_count": null 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "9989e783-5e01-4a59-aaee-cb71f05fd468", 155 | "metadata": { 156 | "name": "md_resources", 157 | "collapsed": false 158 | }, 159 | "source": "## Resources\n\nQueries used in this notebook is from the [Snowflake Docs](https://docs.snowflake.com/) on [Exploring execution times](https://docs.snowflake.com/en/user-guide/performance-query-exploring)" 160 | } 161 | ] 162 | } 163 | -------------------------------------------------------------------------------- /Query_Performance_Insights/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: [] 5 | -------------------------------------------------------------------------------- /Query_Performance_Insights_using_Streamlit/Build_an_Interactive_Query_Performance_App_with_Streamlit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "d43a3edd-7c40-4a96-a4c6-c46e52b415ed", 14 | "metadata": { 15 | "name": "md_title", 16 | "collapsed": false 17 | }, 18 | "source": "# Build an Interactive Query Performance App in Snowflake Notebooks using Streamlit\n\nIn this notebook, we'll create an interactive Streamlit app for analyzing query history to shed light on longest-running queries. These insights can help in further actions to optimize computation. 
\n" 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "201438af-5d95-44b5-9582-ac165686ea47", 23 | "metadata": { 24 | "name": "md_query", 25 | "collapsed": false 26 | }, 27 | "source": "## SQL Query: Top n longest-running queries\n\nThis query provides a listing of the top n (50 in the example below) longest-running queries in the last day. You can adjust the `DATEADD` function to focus on a shorter or longer period of time. Replace `STREAMLIT_DEMO_APPS` with the name of a warehouse." 28 | }, 29 | { 30 | "cell_type": "code", 31 | "id": "c695373e-ac74-4b62-a1f1-08206cbd5c81", 32 | "metadata": { 33 | "language": "sql", 34 | "name": "sql_query", 35 | "codeCollapsed": false, 36 | "collapsed": false 37 | }, 38 | "source": "SELECT query_id,\n ROW_NUMBER() OVER(ORDER BY partitions_scanned DESC) AS query_id_int,\n query_text,\n total_elapsed_time/1000 AS query_execution_time_seconds,\n partitions_scanned,\n partitions_total,\nFROM snowflake.account_usage.query_history Q\nWHERE warehouse_name = 'STREAMLIT_DEMO_APPS' AND TO_DATE(Q.start_time) > DATEADD(day,-1,TO_DATE(CURRENT_TIMESTAMP()))\n AND total_elapsed_time > 0 --only get queries that actually used compute\n AND error_code IS NULL\n AND partitions_scanned IS NOT NULL\nORDER BY total_elapsed_time desc\nLIMIT 50;", 39 | "execution_count": null, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "51f7f20c-f6d7-4e44-b22d-5409560ef0a3", 45 | "metadata": { 46 | "name": "md_app", 47 | "collapsed": false 48 | }, 49 | "source": "## Implementing the Interactive Query Performance App\n\nThe workflow is implemented using 5 Python libraries:\n- **Snowflake Snowpark**: Database connectivity to Snowflake\n- **Pandas**: Data wrangling\n- **Streamlit**: Web application framework\n- **Altair**: Data visualization\n- **NumPy**: Numerical computing\n\nUsers can provide the following input parameters:\n- Timeframes (day, week, month,\n- Number of rows to display, \n- Bin sizes for histograms\n- SQL commands to analyze\n\nThese input are used to retrieve and process data resulting in the generation of various visualizations and data analysis as follows:\n- Histogram of query execution time\n- Box plot of query execution time\n- Summary statistics" 50 | }, 51 | { 52 | "cell_type": "code", 53 | "id": "2bdb7d5a-f4dc-4eed-99bc-8726adfa5f8c", 54 | "metadata": { 55 | "language": "python", 56 | "name": "py_app", 57 | "collapsed": false, 58 | "codeCollapsed": false 59 | }, 60 | "outputs": [], 61 | "source": "from snowflake.snowpark.context import get_active_session\nimport pandas as pd\nimport streamlit as st\nimport altair as alt\nimport numpy as np\n\nst.title('Top n longest-running queries')\n\n# Input widgets\ncol = st.columns(3)\n\nwith col[0]:\n timeframe_option = st.selectbox('Select a timeframe', ('day', 'week', 'month'))\n\nwith col[1]:\n limit_option = st.slider('Display n rows', 10, 200, 100)\n\nwith col[2]:\n bin_option = st.slider('Bin size', 1, 30, 10)\n\nsql_command_option = st.multiselect('Select a SQL command to analyze', \n ['describe', 'execute', 'show', 'PUT', 'SELECT'],\n ['describe', 'show'])\n\n# Data retrieval\nsession = get_active_session()\ndf = session.sql(\n f\"\"\"\n SELECT query_id,\n ROW_NUMBER() OVER(ORDER BY partitions_scanned DESC) AS query_id_int,\n query_text,\n total_elapsed_time/1000 AS query_execution_time_seconds,\n partitions_scanned,\n partitions_total,\n FROM snowflake.account_usage.query_history Q\n WHERE warehouse_name = 'STREAMLIT_DEMO_APPS' AND TO_DATE(Q.start_time) > 
DATEADD({timeframe_option},-1,TO_DATE(CURRENT_TIMESTAMP()))\n AND total_elapsed_time > 0 --only get queries that actually used compute\n AND error_code IS NULL\n AND partitions_scanned IS NOT NULL\n ORDER BY total_elapsed_time desc\n LIMIT {limit_option};\n \"\"\"\n ).to_pandas()\n\ndf = df[df['QUERY_TEXT'].str.lower().str.startswith(tuple(commands.lower() for commands in sql_command_option))]\n\nst.title('Histogram of Query Execution Times')\n\n# Create a DataFrame for the histogram data\nhist, bin_edges = np.histogram(df['QUERY_EXECUTION_TIME_SECONDS'], bins=bin_option)\n\nhistogram_df = pd.DataFrame({\n 'bin_start': bin_edges[:-1],\n 'bin_end': bin_edges[1:],\n 'count': hist\n})\nhistogram_df['bin_label'] = histogram_df.apply(lambda row: f\"{row['bin_start']:.2f} - {row['bin_end']:.2f}\", axis=1)\n\n# Create plots\nhistogram_plot = alt.Chart(histogram_df).mark_bar().encode(\n x=alt.X('bin_label:N', sort=histogram_df['bin_label'].tolist(),\n axis=alt.Axis(title='QUERY_EXECUTION_TIME_SECONDS', labelAngle=90)),\n y=alt.Y('count:Q', axis=alt.Axis(title='Count')),\n tooltip=['bin_label', 'count']\n)\n\nbox_plot = alt.Chart(df).mark_boxplot(\n extent=\"min-max\",\n color='yellow'\n).encode(\n alt.X(\"QUERY_EXECUTION_TIME_SECONDS:Q\", scale=alt.Scale(zero=False))\n).properties(\n height=200\n)\n\nst.altair_chart(histogram_plot, use_container_width=True)\nst.altair_chart(box_plot, use_container_width=True)\n\n\n# Data display\nwith st.expander('Show data'):\n st.dataframe(df)\nwith st.expander('Show summary statistics'):\n st.write(df['QUERY_EXECUTION_TIME_SECONDS'].describe())", 62 | "execution_count": null 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "9989e783-5e01-4a59-aaee-cb71f05fd468", 67 | "metadata": { 68 | "name": "md_resources", 69 | "collapsed": false 70 | }, 71 | "source": "## Resources\n\nQueries used in this notebook is from the [Snowflake Docs](https://docs.snowflake.com/) on [Exploring execution times](https://docs.snowflake.com/en/user-guide/performance-query-exploring)\n\nFurther information on the use of Streamlit can be found at the [Streamlit Docs](https://docs.streamlit.io/)." 72 | } 73 | ] 74 | } 75 | -------------------------------------------------------------------------------- /Query_Performance_Insights_using_Streamlit/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - altair=* 6 | - numpy=* 7 | - pandas=* 8 | - snowflake-snowpark-python=* 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Snowflake Notebook Demos 2 | Snowflake Notebooks is your familiar, interactive development environment to perform Data Science, Data Engineering, and AI/ML workloads end-to-end in Snowflake. Write Python & SQL in the same interface. 3 | 4 | This repo contains a collection of Snowflake Notebook demos, tutorials, and examples. Browse each folder to access the notebook files associated with each demo. Here is a list of notebooks you can find in this repo. 5 | 6 | 7 | 10 | 18 | 19 | 20 | 21 | 24 | 35 | 36 | 37 | 38 | 41 | 50 | 51 | 52 | 53 | 56 | 64 | 65 | 66 | 69 | 81 | 82 | 83 | 86 | 95 | 96 |
*Demo notebooks by category: Getting Started · Data Administration · Data Science · Data Engineering · Machine Learning · Using Notebooks (category images and per-category notebook links are omitted in this extract).*
97 | 98 | 99 | ## Load demo notebooks to Snowflake 100 | 101 | The notebook files are available for download as `.ipynb` files. To load the demo notebooks into your Snowflake Notebook, follow these steps: 102 | 103 | 1. On Github, click into each folder containing the tutorial and the corresponding `.ipynb file`, such as [this](https://github.com/Snowflake-Labs/notebook-demo/blob/main/My%20First%20Notebook%20Project/My%20First%20Notebook%20Project.ipynb). Download the file by clicking on the `Download raw file` from the top right. 104 | 105 | 2. Go to the Snowflake web interface, [Snowsight](https://app.snowflake.com), on your browser. 106 | 107 | 3. Navigate to `Project` > `Notebooks` from the left menu bar. 108 | 109 | 3. Import the .ipynb file you've download into your Snowflake Notebook by using the `Import from .ipynb` button located on the top right of the Notebooks page. 110 | 111 | 4. Select the file from your local directory and press `Open`. 112 | 113 | 5. A `Create Notebook` dialog will show up. Select a database, schema, and warehouse for the Notebook and click `Create`. 114 | 115 | ## Resources 116 | 117 | Here are some resources to learn more about Snowflake Notebooks: 118 | 119 | * [Documentation](https://docs.snowflake.com/LIMITEDACCESS/snowsight-notebooks/ui-snowsight-notebooks-about) 120 | * [YouTube Playlist](https://www.youtube.com/playlist?list=PLavJpcg8cl1Efw8x_fBKmfA2AMwjUaeBI) 121 | * [Solution Center](https://developers.snowflake.com/solutions/?_sft_technology=notebooks) 122 | 123 | ## License 124 | 125 | All code and notebooks included in this repo is available with an Apache 2.0 license. 126 | 127 | ## Other links 128 | 129 | * Interested in developing and running interactive Streamlit apps in Snowflake? Check out the [Streamlit in Snowflake Demo Repo](https://github.com/Snowflake-Labs/snowflake-demo-streamlit/) to learn more! 130 | -------------------------------------------------------------------------------- /Reference cells and variables/Reference cells and variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d40f15d5-0f06-4c81-b4e6-a760771d44c2", 6 | "metadata": { 7 | "collapsed": false, 8 | "name": "cell1" 9 | }, 10 | "source": [ 11 | "# Reference cells and variables in Snowflake Notebooks" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "884f6e12-725b-4ae2-b9c9-5eaa4f4f964f", 17 | "metadata": { 18 | "collapsed": false, 19 | "name": "cell2" 20 | }, 21 | "source": [ 22 | "You can reference the results of previous cells in a cell in your notebook. This allows you to seamless switch between working in Python and SQL and reuse the results and variables.\n", 23 | "\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "1ad40569-c979-461e-a2a0-98449785ba2f", 29 | "metadata": { 30 | "collapsed": false, 31 | "name": "cell3" 32 | }, 33 | "source": [ 34 | "## Referencing SQL output in Python cells\n", 35 | "\n", 36 | "We can access the SQL results directly in Python and convert the results to a Snowpark or pandas dataframe.\n", 37 | "\n", 38 | "The cell reference is based on the cell name. 
Note that if you change the cell name, you will also need to update the subsequent cell reference accordingly.\n", 39 | "\n", 40 | "\n", 41 | "### Example 1: Access SQL results as Snowpark or Pandas Dataframes" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "3775908f-ca36-4846-8f38-5adca39217f2", 48 | "metadata": { 49 | "codeCollapsed": false, 50 | "language": "sql", 51 | "name": "cell4" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "-- assign Query Tag to Session. This helps with performance monitoring and troubleshooting\n", 56 | "ALTER SESSION SET query_tag = '{\"origin\":\"sf_sit-is\",\"name\":\"notebook_demo_pack\",\"version\":{\"major\":1, \"minor\":0},\"attributes\":{\"is_quickstart\":0, \"source\":\"sql\", \"vignette\":\"reference_cells\"}}';\n", 57 | "\n", 58 | "SELECT 'FRIDAY' as SNOWDAY, 0.2 as CHANCE_OF_SNOW\n", 59 | "UNION ALL\n", 60 | "SELECT 'SATURDAY',0.5\n", 61 | "UNION ALL \n", 62 | "SELECT 'SUNDAY', 0.9;" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "8d50cbf4-0c8d-4950-86cb-114990437ac9", 69 | "metadata": { 70 | "codeCollapsed": false, 71 | "language": "python", 72 | "name": "cell5" 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "snowpark_df = cell4.to_df()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "c695373e-ac74-4b62-a1f1-08206cbd5c81", 83 | "metadata": { 84 | "codeCollapsed": false, 85 | "language": "python", 86 | "name": "cell6" 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "pandas_df = cell4.to_pandas()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "585a54f7-5dd4-412a-9c42-89d5c5d5978c", 96 | "metadata": { 97 | "collapsed": false, 98 | "name": "cell7" 99 | }, 100 | "source": [ 101 | "## Referencing variables in SQL code\n", 102 | "\n", 103 | "You can use the Jinja syntax `{{..}}` to reference Python variables within your SQL queries as follows.\n", 104 | "\n", 105 | "### Example 2: Using Python variable value in a SQL query\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "e73b633a-57d4-436c-baae-960c92c9cef6", 112 | "metadata": { 113 | "codeCollapsed": false, 114 | "collapsed": false, 115 | "language": "sql", 116 | "name": "cell8" 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "-- Create a dataset of countries\n", 121 | "CREATE OR REPLACE TABLE countries (\n", 122 | " country_name VARCHAR(100)\n", 123 | ");\n", 124 | "\n", 125 | "INSERT INTO countries (country_name) VALUES\n", 126 | " ('USA'),('Canada'),('United Kingdom'),('Germany'),('France'),\n", 127 | " ('Australia'),('Japan'),('China'),('India'),('Brazil');" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "e7a6f119-4f67-4ef5-a35f-117a7f502475", 134 | "metadata": { 135 | "codeCollapsed": false, 136 | "language": "python", 137 | "name": "cell9" 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "c = \"'USA'\"" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "60a59077-a4b1-4699-81a5-645addd8ad6d", 148 | "metadata": { 149 | "codeCollapsed": false, 150 | "language": "sql", 151 | "name": "cell10" 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "-- Filter to record where country is USA\n", 156 | "SELECT * FROM countries WHERE COUNTRY_NAME = {{c}}" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "id": "decf8b5e-e804-439d-a186-3a329da12563", 162 | "metadata": { 163 | "name": "cell11" 164 | 
}, 165 | "source": [ 166 | "### Example 3: Using Python dataframe in a SQL query" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "9b49d972-3966-4fa6-9457-f028b06484a3", 173 | "metadata": { 174 | "codeCollapsed": false, 175 | "language": "sql", 176 | "name": "cell12" 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "-- Create dataset with columns PRODUCT_ID, RATING, PRICE\n", 181 | "SELECT CONCAT('SNOW-',UNIFORM(1000,9999, RANDOM())) AS PRODUCT_ID, \n", 182 | " ABS(NORMAL(5, 3, RANDOM())) AS RATING, \n", 183 | " ABS(NORMAL(750, 200::FLOAT, RANDOM())) AS PRICE\n", 184 | "FROM TABLE(GENERATOR(ROWCOUNT => 100));" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "id": "b7040f85-0ab8-4bdb-a36e-33599b79ea54", 191 | "metadata": { 192 | "codeCollapsed": false, 193 | "language": "sql", 194 | "name": "cell13" 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "-- Filter to products where price is greater than 500\n", 199 | "SELECT * FROM {{cell12}} where PRICE > 500" 200 | ] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Streamlit Notebook", 206 | "name": "streamlit" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 5 211 | } 212 | -------------------------------------------------------------------------------- /Role_Based_Access_Auditing_with_Streamlit/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - altair=* 6 | - pandas=* 7 | -------------------------------------------------------------------------------- /Scheduled_Query_Execution_Report/Scheduled_Query_Execution_Report.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "cc4fb15e-f9db-44eb-9f60-1b9589b755cb", 14 | "metadata": { 15 | "name": "md_title", 16 | "collapsed": false, 17 | "resultHeight": 285 18 | }, 19 | "source": "# Scheduled Query Execution Report\n\nA notebook to report on failed or long-running scheduled queries, providing insights into reliability issues.\n\nHere's a breakdown of the steps:\n1. Retrieve Data\n2. Convert Table to a DataFrame\n3. Create an Interactive Slider Widget & Data Preparation\n4. Create a Heatmap for Visualizing Scheduled Query Execution" 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "42a7b143-0779-4706-affc-c214213f55c5", 24 | "metadata": { 25 | "name": "md_retrieve_data", 26 | "collapsed": false, 27 | "resultHeight": 170 28 | }, 29 | "source": "## 1. Retrieve Data\n\nFirstly, we'll write an SQL query to retrieve the execution history for scheduled queries, along with their status, timing metrics, and execution status. \n\nWe're obtaining this from the `snowflake.account_usage.task_history` table." 
30 | }, 31 | { 32 | "cell_type": "code", 33 | "id": "39f7713b-dd7a-41a2-872e-cc534c6dc4f6", 34 | "metadata": { 35 | "language": "sql", 36 | "name": "sql_data", 37 | "resultHeight": 439, 38 | "collapsed": false, 39 | "codeCollapsed": false 40 | }, 41 | "outputs": [], 42 | "source": "SELECT \n name,\n database_name,\n query_id,\n query_text,\n schema_name,\n scheduled_time,\n query_start_time,\n completed_time,\n DATEDIFF('second', query_start_time, completed_time) as execution_time_seconds,\n state,\n error_code,\n error_message,\nFROM snowflake.account_usage.task_history\nWHERE scheduled_time >= DATEADD(days, -1, CURRENT_TIMESTAMP())\nORDER BY scheduled_time DESC;", 43 | "execution_count": null 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "870b69dd-aae0-4dd3-93f7-7adce1268159", 48 | "metadata": { 49 | "name": "md_dataframe", 50 | "collapsed": false, 51 | "resultHeight": 102 52 | }, 53 | "source": "## 2. Convert Table to a DataFrame\n\nNext, we'll convert the table to a Pandas DataFrame." 54 | }, 55 | { 56 | "cell_type": "code", 57 | "id": "4a5559a8-ef3a-40c3-a9d5-54602403adab", 58 | "metadata": { 59 | "language": "python", 60 | "name": "py_dataframe", 61 | "codeCollapsed": false, 62 | "resultHeight": 439, 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": "sql_data.to_pandas()", 67 | "execution_count": null 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "59b04137-ca95-4fb8-b216-133272349a78", 72 | "metadata": { 73 | "name": "md_data_preparation", 74 | "collapsed": false, 75 | "resultHeight": 195 76 | }, 77 | "source": "## 3. Create an Interactive Slider Widget & Data Preparation\n\nHere, we'll create an interactive slider for dynamically selecting the number of days to analyze. This would then trigger the filtering of the DataFrame to the specified number of days.\n\nNext, we'll reshape the data by calculating the frequency count by hour and task name, which will subsequently be used for creating the heatmap in the next step." 78 | }, 79 | { 80 | "cell_type": "code", 81 | "id": "ba8fa564-d7d5-4d1c-9f6b-400f9c05ecae", 82 | "metadata": { 83 | "language": "python", 84 | "name": "py_data_preparation", 85 | "codeCollapsed": false, 86 | "resultHeight": 216 87 | }, 88 | "outputs": [], 89 | "source": "import pandas as pd\nimport streamlit as st\nimport altair as alt\n\n# Create date filter slider\nst.subheader(\"Select time duration\")\ndays = st.slider('Select number of days to analyze', \n min_value=10, \n max_value=90, \n value=30, \n step=10)\n \n# Filter data according to day duration\nlatest_date = pd.to_datetime(df['SCHEDULED_TIME']).max()\ncutoff_date = latest_date - pd.Timedelta(days=days)\nfiltered_df = df[pd.to_datetime(df['SCHEDULED_TIME']) > cutoff_date].copy()\n \n# Prepare data for heatmap\nfiltered_df['HOUR_OF_DAY'] = pd.to_datetime(filtered_df['SCHEDULED_TIME']).dt.hour\nfiltered_df['HOUR_DISPLAY'] = filtered_df['HOUR_OF_DAY'].apply(lambda x: f\"{x:02d}:00\")\n \n# Calculate frequency count by hour and task name\nagg_df = filtered_df.groupby(['NAME', 'HOUR_DISPLAY', 'STATE']).size().reset_index(name='COUNT')\n\nst.warning(f\"Analyzing data for the last {days} days!\")", 90 | "execution_count": null 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "id": "35f31e4e-95d5-4ee5-a146-b9e93dd9d570", 95 | "metadata": { 96 | "name": "md_heatmap", 97 | "collapsed": false, 98 | "resultHeight": 128 99 | }, 100 | "source": "## 4. 
Create a Heatmap for Visualizing Scheduled Query Execution\n\nFinally, a heatmap and summary statistics table are generated that will allow us to gain insights on the task name and state (e.g. `SUCCEEDED`, `FAILED`, `SKIPPED`)." 101 | }, 102 | { 103 | "cell_type": "code", 104 | "id": "e3049001-f3ba-4b66-ba54-c9f02f551992", 105 | "metadata": { 106 | "language": "python", 107 | "name": "py_heatmap", 108 | "codeCollapsed": false, 109 | "resultHeight": 791 110 | }, 111 | "outputs": [], 112 | "source": "# Create heatmap\nchart = alt.Chart(agg_df).mark_rect(\n stroke='black',\n strokeWidth=1\n).encode(\n x=alt.X('HOUR_DISPLAY:O', \n title='Hour of Day',\n axis=alt.Axis(\n labels=True,\n tickMinStep=1,\n labelOverlap=False\n )),\n y=alt.Y('NAME:N', \n title='',\n axis=alt.Axis(\n labels=True,\n labelLimit=200,\n tickMinStep=1,\n labelOverlap=False,\n labelPadding=10\n )),\n color=alt.Color('COUNT:Q', \n title='Number of Executions'),\n row=alt.Row('STATE:N', \n title='Task State',\n header=alt.Header(labelAlign='left')),\n tooltip=[\n alt.Tooltip('NAME', title='Task Name'),\n alt.Tooltip('HOUR_DISPLAY', title='Hour'),\n alt.Tooltip('STATE', title='State'),\n alt.Tooltip('COUNT', title='Number of Executions')\n ]\n).properties(\n height=100,\n width=450\n).configure_view(\n stroke=None,\n continuousWidth=300\n).configure_axis(\n labelFontSize=10\n)\n\n# Display the chart\nst.subheader(f'Task Execution Frequency by State ({days} Days)')\nst.altair_chart(chart)\n\n# Optional: Display summary statistics\nst.subheader(\"Summary Statistics\")\nsummary_df = filtered_df.groupby('NAME').agg({\n 'STATE': lambda x: pd.Series(x).value_counts().to_dict()\n}).reset_index()\n\n# Format the state counts as separate columns\nstate_counts = pd.json_normalize(summary_df['STATE']).fillna(0).astype(int)\nsummary_df = pd.concat([summary_df['NAME'], state_counts], axis=1)\n\nst.dataframe(summary_df)", 113 | "execution_count": null 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "eb3e9b67-6a6e-4218-b17a-3f8564a04d18", 118 | "metadata": { 119 | "name": "md_resources", 120 | "collapsed": false, 121 | "resultHeight": 217 122 | }, 123 | "source": "## Want to learn more?\n\n- Snowflake Docs on [Account Usage](https://docs.snowflake.com/en/sql-reference/account-usage) and [TASK_HISTORY view](https://docs.snowflake.com/en/sql-reference/account-usage/task_history)\n- More about [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake)\n- For more inspiration on how to use Streamlit widgets in Notebooks, check out [Streamlit Docs](https://docs.streamlit.io/) and this list of what is currently supported inside [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake#label-notebooks-streamlit-support)\n- Check out the [Altair User Guide](https://altair-viz.github.io/user_guide/data.html) for further information on customizing Altair charts" 124 | } 125 | ] 126 | } -------------------------------------------------------------------------------- /Scheduled_Query_Execution_Report/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - altair=* 6 | - pandas=* 7 | -------------------------------------------------------------------------------- /Schema_Change_Tracker/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - 
snowflake 4 | dependencies: 5 | - altair=* 6 | - pandas=* 7 | -------------------------------------------------------------------------------- /Snowflake_Notebooks_Summit_2024_Demo/aileen_summit_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "30fcf7ae-e7f3-4a88-8afc-6568831d1c2a", 14 | "metadata": { 15 | "name": "Title", 16 | "collapsed": false, 17 | "resultHeight": 333 18 | }, 19 | "source": "# :date: Send :orange[Daily Digest] of Fresh Foods Customer Reviews to :orange[Slack] \n\n## Features\n:gray[In this demo, we'll cover the following features:]\n- :gray[Calling Snowflake Cortex functions]\n- :gray[Integrating with external endpoints, i.e. Slack APIs]\n- :gray[Scheduling the notebook to run daily]\n- :gray[Keeping version control with Git]\n- :green[**BONUS**] :gray[- Run one notebook from another :knot: :knot: :knot:]" 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "754480e1-8983-4b6c-8ba7-270e9dc5994f", 24 | "metadata": { 25 | "name": "Step_1_Get_data", 26 | "collapsed": false, 27 | "resultHeight": 60 28 | }, 29 | "source": "## Step :one: - Get the customer reviews data :speech_balloon:" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "id": "465f4adb-3571-483b-90da-cd3e576b9435", 34 | "metadata": { 35 | "language": "sql", 36 | "name": "Get_data", 37 | "collapsed": false, 38 | "codeCollapsed": false 39 | }, 40 | "outputs": [], 41 | "source": "USE SCHEMA PUBLIC.PUBLIC;\nSELECT * FROM FRESH_FOODS_REVIEWS;", 42 | "execution_count": null 43 | }, 44 | { 45 | "cell_type": "code", 46 | "id": "89f98a73-ef13-4a4e-a8c6-7ed8bf620930", 47 | "metadata": { 48 | "language": "python", 49 | "name": "Set_review_date", 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": "from datetime import date\nimport streamlit as st\n\nreview_date = date(2024, 6, 4) # change to `date.today()` to always grab the current date \nst.write(review_date)", 54 | "execution_count": null 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "d3530f1e-55dd-43d9-9e09-0c0797116102", 59 | "metadata": { 60 | "name": "Step_2_Cortex", 61 | "collapsed": false, 62 | "resultHeight": 377 63 | }, 64 | "source": "## Step :two: - Ask Snowflake Cortex to generate the daily digest :mega:\nSnowflake Cortex is a fully-managed service that enables access to industry-leading large language models (LLMs).\n- COMPLETE: Given a prompt, returns a response that completes the prompt. This function accepts either a single prompt or a conversation with multiple prompts and responses.\n\n- EMBED_TEXT_768: Given a piece of text, returns a vector embedding that represents that text.\n\n- EXTRACT_ANSWER: Given a question and unstructured data, returns the answer to the question if it can be found in the data.\n\n- SENTIMENT: Returns a sentiment score, from -1 to 1, representing the detected positive or negative sentiment of the given text.\n\n- SUMMARIZE: Returns a summary of the given text.\n\n- TRANSLATE: Translates given text from any supported language to any other." 
65 | }, 66 | { 67 | "cell_type": "code", 68 | "id": "58a6bf2f-34df-452d-946f-ba416b07118d", 69 | "metadata": { 70 | "language": "sql", 71 | "name": "Cortex_SUMMARIZE", 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": "WITH CUSTOMER_REVIEWS AS(\n SELECT LISTAGG(DISTINCT REVIEW) AS REVIEWS \n FROM {{Get_data}} \n WHERE to_date(DATE) = '{{review_date}}' )\n\nSELECT SNOWFLAKE.CORTEX.SUMMARIZE(REVIEWS) FROM CUSTOMER_REVIEWS;", 76 | "execution_count": null 77 | }, 78 | { 79 | "cell_type": "code", 80 | "id": "eea93bfd-ed59-4478-9931-b145261dab5b", 81 | "metadata": { 82 | "language": "python", 83 | "name": "Summary", 84 | "collapsed": false 85 | }, 86 | "outputs": [], 87 | "source": "summary_text = Cortex_SUMMARIZE.to_pandas().iloc[0]['SNOWFLAKE.CORTEX.SUMMARIZE(REVIEWS)']\nst.write(summary_text)", 88 | "execution_count": null 89 | }, 90 | { 91 | "cell_type": "code", 92 | "id": "4849cc86-d8b4-4b7c-a4b2-f73174798593", 93 | "metadata": { 94 | "language": "sql", 95 | "name": "Daily_avg_score", 96 | "collapsed": false 97 | }, 98 | "outputs": [], 99 | "source": "SELECT AVG(SNOWFLAKE.CORTEX.SENTIMENT(REVIEW)) AS AVERAGE_RATING FROM FRESH_FOODS_REVIEWS WHERE DATE = '{{review_date}}';", 100 | "execution_count": null 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "id": "c61883bc-ff05-4627-9558-681383d477f6", 105 | "metadata": { 106 | "name": "Step_3_Slack", 107 | "collapsed": false, 108 | "resultHeight": 60 109 | }, 110 | "source": "## Step :three: - Send the summary and sentiment to Slack :tada:\n" 111 | }, 112 | { 113 | "cell_type": "code", 114 | "id": "f69f5fcf-f470-48a6-a688-259440c95741", 115 | "metadata": { 116 | "language": "python", 117 | "name": "Send_to_Slack", 118 | "collapsed": false, 119 | "codeCollapsed": false 120 | }, 121 | "outputs": [], 122 | "source": "import requests\nimport numpy as np\n\n\nheaders = {\n 'Content-Type': 'Content-type: application/json',\n}\n\n# Extract Daily_avg_score contents\nsentiment_score = str(np.round(Daily_avg_score.to_pandas().values[0][0], 2))\n\n\ndata = {\n\t\"blocks\": [\n\t\t{\n\t\t\t\"type\": \"section\",\n\t\t\t\"text\": {\n\t\t\t\t\"type\": \"mrkdwn\",\n\t\t\t\t\"text\": f\":mega: *Daily summary | Sentiment score: {sentiment_score} | {review_date}*\"\n\t\t\t}\n\t\t},\n\t\t{\n\t\t\t\"type\": \"section\",\n\t\t\t\"text\": {\n\t\t\t\t\"type\": \"mrkdwn\",\n\t\t\t\t\"text\": summary_text\n\t\t\t}\n\t\t},\n\t\t{\n\t\t\t\"type\": \"divider\"\n\t\t},\n\t\t{\n\t\t\t\"type\": \"context\",\n\t\t\t\"elements\": [\n\t\t\t\t{\n\t\t\t\t\t\"type\": \"mrkdwn\",\n\t\t\t\t\t\"text\": \"\"\n\t\t\t\t}\n\t\t\t]\n\t\t}\n\t]\n}\n\nresponse = requests.post(\n 'https://hooks.slack.com/services/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', \n headers=headers, \n json=data)\n\nif response.status_code == 200:\n st.write('✅ Daily summary sent to Slack')", 123 | "execution_count": null 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "89b1c2bd-043b-4313-a20c-91a927e4dbd6", 128 | "metadata": { 129 | "name": "Step_4_Schedule", 130 | "collapsed": false, 131 | "resultHeight": 60 132 | }, 133 | "source": "## Step :four: - Schedule the notebook to send daily updates automatically" 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "8780c297-a747-44f9-8f94-ae9a3084814d", 138 | "metadata": { 139 | "name": "Git_integration", 140 | "collapsed": false, 141 | "resultHeight": 538 142 | }, 143 | "source": "## Let's keep track of code changes!\n- :rainbow[GitHub], :orange[GitLab], :blue[BitBucket], :violet[Azure 
DevOps]\n\n![](https://pngimg.com/uploads/github/github_PNG23.png)" 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "a1089358-dc72-4c1b-bb20-29d86e6ecdd2", 148 | "metadata": { 149 | "name": "Bonus_Chain_notebooks", 150 | "collapsed": false, 151 | "resultHeight": 60 152 | }, 153 | "source": "## Bonus - :chains: Chain multiple notebooks together " 154 | }, 155 | { 156 | "cell_type": "code", 157 | "id": "440692da-0080-4352-87ee-37e94d24027f", 158 | "metadata": { 159 | "language": "sql", 160 | "name": "Run_2nd_notebook", 161 | "collapsed": false, 162 | "codeCollapsed": false 163 | }, 164 | "outputs": [], 165 | "source": "EXECUTE NOTEBOOK PUBLIC.PUBLIC.AILEEN_SUMMIT_DEEP_ANALYSIS_2()", 166 | "execution_count": null 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "id": "97229677-6288-414c-906f-9e74ee1d31de", 171 | "metadata": { 172 | "name": "cell1", 173 | "collapsed": false, 174 | "resultHeight": 176 175 | }, 176 | "source": "## You can also:\n- ### Wrap EXECUTE NOTEBOOK in business logic and call it from a Python cell :bulb:\n- ### Integrate with other orchestration tools :keyboard:" 177 | }, 178 | { 179 | "cell_type": "code", 180 | "id": "3157f79a-f841-4be8-9a50-de312a474723", 181 | "metadata": { 182 | "language": "python", 183 | "name": "Run_on_condition", 184 | "collapsed": false, 185 | "codeCollapsed": false 186 | }, 187 | "outputs": [], 188 | "source": "from snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n\nsentiment_score_flt = np.round(Daily_avg_score.to_pandas().values[0][0], 2)\n \nif sentiment_score_flt < 0.9:\n st.markdown(\"\"\"\n :rotating_light: Sentiment is below threshold! \n \n Kick off the 2nd notebook Deep Analysis.\"\"\")\n session.sql(\"EXECUTE NOTEBOOK PUBLIC.PUBLIC.AILEEN_SUMMIT_DEEP_ANALYSIS_2()\").collect()\nelse:\n st.write(\":sunflower: Sentiment is good. 
Do nothing.\")", 189 | "execution_count": null 190 | } 191 | ] 192 | } 193 | -------------------------------------------------------------------------------- /Snowflake_Semantic_View/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: [] 5 | -------------------------------------------------------------------------------- /Snowflake_Trail_Alerts_Notifications/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - snowflake=* 6 | - snowflake-ml-python=* 7 | - snowflake-snowpark-python=* 8 | -------------------------------------------------------------------------------- /Snowflake_Trail_Alerts_Notifications/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Snowflake_Trail_Alerts_Notifications/screenshot.png -------------------------------------------------------------------------------- /Streamlit_Zero_To_Hero_Machine_Learning_App/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - streamlit=1.35.0 6 | - snowflake-snowpark-python 7 | - scikit-learn=1.3.0 8 | - pandas=2.0.3 9 | - numpy=1.24.3 10 | -------------------------------------------------------------------------------- /Telco Churn Data Analysis/Telco Churn Data Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "de537cfd", 6 | "metadata": {}, 7 | "source": [ 8 | "## This repo has been moved\n", 9 | "\n", 10 | "Visit [this Github repo](https://github.com/Snowflake-Labs/sfguide-data-analysis-churn-prediction-in-snowflake-notebooks/) to see the full quickstart source code." 
11 | ] 12 | } 13 | ], 14 | "metadata": { 15 | "kernelspec": { 16 | "display_name": "Python 3 (ipykernel)", 17 | "language": "python", 18 | "name": "python3" 19 | }, 20 | "language_info": { 21 | "codemirror_mode": { 22 | "name": "ipython", 23 | "version": 3 24 | }, 25 | "file_extension": ".py", 26 | "mimetype": "text/x-python", 27 | "name": "python", 28 | "nbconvert_exporter": "python", 29 | "pygments_lexer": "ipython3", 30 | "version": "3.11.5" 31 | } 32 | }, 33 | "nbformat": 4, 34 | "nbformat_minor": 5 35 | } 36 | -------------------------------------------------------------------------------- /Telco Churn Data Analysis/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - imbalanced-learn=0.11.0 6 | - snowflake-ml-python=1.3.1 7 | -------------------------------------------------------------------------------- /Visual Data Stories with Snowflake Notebooks/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - matplotlib=3.7.2 6 | - plotly=5.19.0 7 | -------------------------------------------------------------------------------- /Visual Data Stories with Snowflake Notebooks/snowflake-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/snowflake-demo-notebooks/982169ee826e4eb851e964275f7afe6539727574/Visual Data Stories with Snowflake Notebooks/snowflake-logo.png -------------------------------------------------------------------------------- /Warehouse_Utilization_with_Streamlit/Warehouse_Utilization_with_Streamlit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "Streamlit Notebook", 5 | "name": "streamlit" 6 | } 7 | }, 8 | "nbformat_minor": 5, 9 | "nbformat": 4, 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "id": "cc4fb15e-f9db-44eb-9f60-1b9589b755cb", 14 | "metadata": { 15 | "name": "md_title", 16 | "collapsed": false 17 | }, 18 | "source": "# Analyze Warehouse Utilization in Snowflake Notebooks with Streamlit\n\nA notebook that generates a heatmap of warehouse usage patterns to identify peak hours that can help with cost optimization.\n\nHere's what we're implementing to investigate the tables:\n1. Retrieve warehouse utilization data\n2. Convert table to a DataFrame\n3. Create an interactive slider widget\n4. Create a Heatmap for visualizing warehouse usage patterns" 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "42a7b143-0779-4706-affc-c214213f55c5", 23 | "metadata": { 24 | "name": "md_retrieve_data", 25 | "collapsed": false 26 | }, 27 | "source": "## 1. Retrieve warehouse utilization data\n\nFirstly, we'll write a SQL query to retrieve warehouse utilization data." 
28 | }, 29 | { 30 | "cell_type": "code", 31 | "id": "e17f14a5-ea50-4a1d-bc15-c64a6447d0a8", 32 | "metadata": { 33 | "language": "sql", 34 | "name": "sql_warehouse_data", 35 | "codeCollapsed": false, 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": "SELECT \n DATE(start_time) AS usage_date,\n HOUR(start_time) AS hour_of_day,\n warehouse_name,\n avg_running,\n avg_queued_load,\n start_time,\n end_time\nFROM snowflake.account_usage.warehouse_load_history\nWHERE start_time >= DATEADD(month, -1, CURRENT_TIMESTAMP())\nORDER BY warehouse_name, start_time;", 40 | "execution_count": null 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "b2ef4485-566e-4b11-bb5a-8085c9bc0c97", 45 | "metadata": { 46 | "name": "md_dataframe", 47 | "collapsed": false 48 | }, 49 | "source": "## 2. Convert table to a DataFrame\n\nNext, we'll convert the table to a Pandas DataFrame." 50 | }, 51 | { 52 | "cell_type": "code", 53 | "id": "014ceccb-9447-43c9-ad8f-a91a80722de1", 54 | "metadata": { 55 | "language": "python", 56 | "name": "py_dataframe", 57 | "collapsed": false, 58 | "codeCollapsed": false 59 | }, 60 | "outputs": [], 61 | "source": "sql_warehouse_data.to_pandas()", 62 | "execution_count": null 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "d4027f90-ae2a-41e7-8a09-5c088b3ab3bf", 67 | "metadata": { 68 | "name": "md_", 69 | "collapsed": false 70 | }, 71 | "source": "## 3. Create an Interactive slider widget\n\nLet's create an interactive slider using Streamlit. This would allow users to select the number of days to analyze, which would filter the DataFrame. \n\nFinally, we'll calculate the total warehouse load (`TOTAL_LOAD`) and format the hour display (`HOUR_DISPLAY`) for each record." 72 | }, 73 | { 74 | "cell_type": "code", 75 | "id": "137f2fc5-c5df-4dd4-b223-0e0690b6f8a6", 76 | "metadata": { 77 | "language": "python", 78 | "name": "py_data_preparation", 79 | "codeCollapsed": false, 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": "import pandas as pd\nimport streamlit as st\n\n# Get data\ndf = py_dataframe.copy()\n\n# Create date filter slider\ndays = st.slider('Select number of days to analyze', \n min_value=10, \n max_value=90, \n value=30, \n step=10)\n\n# Filter data based on selected days and create a copy\nlatest_date = pd.to_datetime(df['USAGE_DATE']).max()\ncutoff_date = latest_date - pd.Timedelta(days=days)\nfiltered_df = df[pd.to_datetime(df['USAGE_DATE']) > cutoff_date].copy()\n\n# Prepare data and create heatmap\n#filtered_df.loc[:, 'TOTAL_LOAD'] = filtered_df['AVG_RUNNING'] + filtered_df['AVG_QUEUED_LOAD']\n#filtered_df.loc[:, 'HOUR_DISPLAY'] = filtered_df['HOUR_OF_DAY'].apply(lambda x: f\"{x:02d}:00\")\nfiltered_df['TOTAL_LOAD'] = filtered_df['AVG_RUNNING'] + filtered_df['AVG_QUEUED_LOAD']\nfiltered_df['HOUR_DISPLAY'] = filtered_df['HOUR_OF_DAY'].apply(lambda x: f\"{x:02d}:00\")\n\nst.warning(f\"You've selected {days} days to analyze!\")\nfiltered_df", 84 | "execution_count": null 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "84929a0b-de27-4655-93dc-fd15bac9f3e5", 89 | "metadata": { 90 | "name": "md_heatmap", 91 | "collapsed": false 92 | }, 93 | "source": "## 4. Create a Heatmap for visualizing warehouse usage patterns\n\nFinally, we're create a heatmap using Altair. The heatmap shows the warehouse usage pattern across different hours of the day. Color intensity represents the total load and interactive tooltips showing detailed metrics for each cell." 
94 | }, 95 | { 96 | "cell_type": "code", 97 | "id": "f84a45e7-288f-400c-8a99-badb37a13707", 98 | "metadata": { 99 | "language": "python", 100 | "name": "py_heatmap", 101 | "codeCollapsed": false, 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": "import altair as alt\nimport streamlit as st\n\nchart = alt.Chart(filtered_df).mark_rect(\n stroke='black',\n strokeWidth=1\n).encode(\n x=alt.X('HOUR_DISPLAY:O', \n title='Hour of Day',\n axis=alt.Axis(\n labels=True,\n tickMinStep=1,\n labelOverlap=False\n )),\n y=alt.Y('WAREHOUSE_NAME:N', \n title='Warehouse Name',\n axis=alt.Axis(\n labels=True,\n labelLimit=200,\n tickMinStep=1,\n labelOverlap=False,\n labelPadding=10\n )),\n color=alt.Color('TOTAL_LOAD:Q', title='Total Load'),\n tooltip=['WAREHOUSE_NAME', 'HOUR_DISPLAY', 'TOTAL_LOAD', \n 'AVG_RUNNING', 'AVG_QUEUED_LOAD']\n).properties(\n #width=700,\n #height=450,\n title=f'Warehouse Usage Patterns ({days} Days)'\n).configure_view(\n stroke=None,\n continuousHeight=400\n).configure_axis(\n labelFontSize=10\n)\n\n# Display the chart\nst.altair_chart(chart, use_container_width=True)", 106 | "execution_count": null 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "f6e54924-57e2-4dfb-8bf1-bad9b7fb635d", 111 | "metadata": { 112 | "name": "md_resources", 113 | "collapsed": false 114 | }, 115 | "source": "## Want to learn more?\n\n- Snowflake Docs on [Account Usage](https://docs.snowflake.com/en/sql-reference/account-usage) and [WAREHOUSE_LOAD_HISTORY view](https://docs.snowflake.com/en/sql-reference/account-usage/warehouse_load_history)\n- More about [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake)\n- For more inspiration on how to use Streamlit widgets in Notebooks, check out [Streamlit Docs](https://docs.streamlit.io/) and this list of what is currently supported inside [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-use-with-snowflake#label-notebooks-streamlit-support)\n- Check out the [Altair User Guide](https://altair-viz.github.io/user_guide/data.html) for further information on customizing Altair charts" 116 | } 117 | ] 118 | } 119 | -------------------------------------------------------------------------------- /Warehouse_Utilization_with_Streamlit/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - altair=* 6 | - pandas=* 7 | -------------------------------------------------------------------------------- /Working with Git/Working with Git.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "38d31fbc-6666-4495-a2b1-d716ffe24329", 6 | "metadata": { 7 | "collapsed": false, 8 | "name": "cell1" 9 | }, 10 | "source": [ 11 | "In this example, we will demonstrate how you can easily go from prototyping for development purposes to production with Git integration.\n", 12 | "\n", 13 | "We will show an example of a simple data pipeline with one query. By changing the `MODE` variable to `DEV` or `PROD` with different warehouse and schema configurations.\n", 14 | "\n", 15 | "For `DEV`, we will be using an extra small warehouse on a sample of the TPCH data.\n", 16 | "For `PROD`, we will be using a large warehouse on a sample of the TPCH data that is 100X the size of the DEV sample." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "3775908f-ca36-4846-8f38-5adca39217f2", 23 | "metadata": { 24 | "codeCollapsed": false, 25 | "collapsed": false, 26 | "language": "python", 27 | "name": "cell2" 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "MODE = \"DEV\" # Parameter to control whether to run in DEV or PROD mode\n", 32 | "\n", 33 | "if MODE == \"DEV\":\n", 34 | " # For development, use XSMALL warehouse on TPCH data with scale factor of 1\n", 35 | " warehouse_name = \"GIT_EXAMPLE_DEV_WH\"\n", 36 | " schema_name = \"TPCH_SF1\"\n", 37 | " size = 'XSMALL'\n", 38 | "elif MODE == \"PROD\": \n", 39 | " # For production, use LARGE warehouse on TPCH data with scale factor of 100\n", 40 | " warehouse_name = \"GIT_EXAMPLE_PROD_WH\"\n", 41 | " schema_name = \"TPCH_SF100\"\n", 42 | " size = 'LARGE'" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "01bd1a4d-1715-4c10-8fdc-08be7b115be5", 48 | "metadata": { 49 | "name": "cell3" 50 | }, 51 | "source": [ 52 | "Let's create and use a warehouse with the specified name and size." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "55bb9c45-e1e4-49ba-a7db-e5eb671ad13a", 59 | "metadata": { 60 | "codeCollapsed": false, 61 | "collapsed": false, 62 | "language": "sql", 63 | "name": "cell4" 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "-- Create warehouse with specified name and size\n", 68 | "CREATE OR REPLACE WAREHOUSE {{warehouse_name}} WITH WAREHOUSE_SIZE= {{size}};" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "2b1f4b91-7988-432b-afe1-cb599eea5cc6", 75 | "metadata": { 76 | "collapsed": false, 77 | "language": "sql", 78 | "name": "cell5" 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "-- Use specified warehouse for subsequent query\n", 83 | "USE WAREHOUSE {{warehouse_name}};" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "f330162f-b59e-467d-bc4e-5c297993c4ee", 89 | "metadata": { 90 | "collapsed": false, 91 | "name": "cell6" 92 | }, 93 | "source": [ 94 | "Use the TPC-H Sample dataset with differing scale factor. \n", 95 | "- Note: Sample data sets are provided in a database named SNOWFLAKE_SAMPLE_DATA that has been shared with your account from the Snowflake SFC_SAMPLES account. If you do not see the database, you can create it yourself. Refer to [Using the Sample Database](https://docs.snowflake.com/en/user-guide/sample-data-using)." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "edb15abf-6061-4e29-9d45-85b0cc806e71", 102 | "metadata": { 103 | "codeCollapsed": false, 104 | "collapsed": false, 105 | "language": "sql", 106 | "name": "cell7" 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "USE SCHEMA SNOWFLAKE_SAMPLE_DATA.{{schema_name}}; " 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "024892ff-b2df-4a4d-9308-1760751b4dae", 116 | "metadata": { 117 | "collapsed": false, 118 | "name": "cell8" 119 | }, 120 | "source": [ 121 | "Check out the number of rows in the `LINEITEM` table." 
122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "e73a5b30-fdcc-4dd6-9619-f19a5c31e769", 128 | "metadata": { 129 | "codeCollapsed": false, 130 | "collapsed": false, 131 | "language": "sql", 132 | "name": "cell9" 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "SELECT COUNT(*) FROM LINEITEM;" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "115c9b33-f508-4385-806d-20bada66fe18", 142 | "metadata": { 143 | "collapsed": false, 144 | "name": "cell10" 145 | }, 146 | "source": [ 147 | "Now let's run a query on this dataset:\n", 148 | "- The query lists totals for extended price, discounted extended price, discounted extended price plus tax, average quantity, average extended price, and average discount. These aggregates are grouped by RETURNFLAG and LINESTATUS, and listed in ascending order of RETURNFLAG and LINESTATUS. A count of the number of line items in each group is included." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "8d50cbf4-0c8d-4950-86cb-114990437ac9", 155 | "metadata": { 156 | "codeCollapsed": false, 157 | "collapsed": false, 158 | "language": "sql", 159 | "name": "cell11" 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "select\n", 164 | " l_returnflag,\n", 165 | " l_linestatus,\n", 166 | " sum(l_quantity) as sum_qty,\n", 167 | " sum(l_extendedprice) as sum_base_price,\n", 168 | " sum(l_extendedprice * (1-l_discount)) as sum_disc_price,\n", 169 | " sum(l_extendedprice * (1-l_discount) * (1+l_tax)) as sum_charge,\n", 170 | " avg(l_quantity) as avg_qty,\n", 171 | " avg(l_extendedprice) as avg_price,\n", 172 | " avg(l_discount) as avg_disc,\n", 173 | " count(*) as count_order\n", 174 | " from\n", 175 | " lineitem\n", 176 | " where\n", 177 | " l_shipdate <= dateadd(day, -90, to_date('1998-12-01'))\n", 178 | " group by\n", 179 | " l_returnflag,\n", 180 | " l_linestatus\n", 181 | " order by\n", 182 | " l_returnflag,\n", 183 | " l_linestatus;" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "170637df-6e8b-498a-8f2a-fda1a41c21ca", 189 | "metadata": { 190 | "collapsed": false, 191 | "name": "cell12" 192 | }, 193 | "source": [ 194 | "Using the cell referencing, we get the query ID and history of the query we just ran." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "c49eb85b-6956-4da6-949f-1939c6a1dcc4", 201 | "metadata": { 202 | "codeCollapsed": false, 203 | "collapsed": false, 204 | "language": "python", 205 | "name": "cell13" 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "# Get query ID of the referenced cell\n", 210 | "query_id = cell11.result_scan_sql().split(\"'\")[1]" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "dfd22f9f-44ef-4a3f-99e6-7c774b02eea7", 217 | "metadata": { 218 | "codeCollapsed": false, 219 | "collapsed": false, 220 | "language": "sql", 221 | "name": "cell14" 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "select * from table(information_schema.query_history_by_warehouse('{{warehouse_name}}')) \n", 226 | "where query_id = '{{query_id}}';" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "id": "ef4d7fcb-9729-4409-8bce-7a7081b98e87", 232 | "metadata": { 233 | "name": "cell15" 234 | }, 235 | "source": [ 236 | "Finally, we compile all of this information into a report to document the run information." 
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "id": "9b718981-9577-4996-b212-0cf7ffb4f23b", 243 | "metadata": { 244 | "codeCollapsed": false, 245 | "collapsed": false, 246 | "language": "python", 247 | "name": "cell16" 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "import streamlit as st\n", 252 | "from datetime import datetime\n", 253 | "st.header(f\"[{MODE}] Run Report\")\n", 254 | "st.markdown(f\"Generated on: {datetime.now()}\")\n", 255 | "\n", 256 | "st.markdown(f\"### System Information\")\n", 257 | "# Print session information\n", 258 | "from snowflake.snowpark.context import get_active_session\n", 259 | "session = get_active_session()\n", 260 | "# Add a query tag to the session. This helps with troubleshooting and performance monitoring.\n", 261 | "session.query_tag = {\"origin\":\"sf_sit-is\", \n", 262 | " \"name\":\"notebook_demo_pack\", \n", 263 | " \"version\":{\"major\":1, \"minor\":0},\n", 264 | " \"attributes\":{\"is_quickstart\":1, \"source\":\"notebook\", \"vignette\":\"working_with_git\"}}\n", 265 | "st.markdown(f\"**Database:** {session.get_current_database()[1:-1]}\")\n", 266 | "st.markdown(f\"**Schema:** {session.get_current_schema()[1:-1]}\")\n", 267 | "st.markdown(f\"**Warehouse:** {session.get_current_warehouse()[1:-1]}\")\n", 268 | "\n", 269 | "st.markdown(f\"### Query Information\")\n", 270 | "# Print session information\n", 271 | "st.markdown(f\"**Query ID:** {query_id}\")\n", 272 | "result_info = cell14.to_pandas()\n", 273 | "st.markdown(\"**Query Text:**\")\n", 274 | "st.code(result_info[\"QUERY_TEXT\"].values[0],language='sql',line_numbers=True)\n", 275 | "st.markdown(\"**Runtime information:**\")\n", 276 | "st.dataframe(result_info[['START_TIME','END_TIME','TOTAL_ELAPSED_TIME']])" 277 | ] 278 | } 279 | ], 280 | "metadata": { 281 | "kernelspec": { 282 | "display_name": "Streamlit Notebook", 283 | "name": "streamlit" 284 | } 285 | }, 286 | "nbformat": 4, 287 | "nbformat_minor": 5 288 | } 289 | -------------------------------------------------------------------------------- /Working with Git/environment.yml: -------------------------------------------------------------------------------- 1 | name: app_environment 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - snowflake=0.8.0 -------------------------------------------------------------------------------- /Working with Git/git_setup.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE SECRET git_secret_example 2 | TYPE = password 3 | USERNAME = '' 4 | PASSWORD = ''; 5 | 6 | CREATE OR REPLACE API INTEGRATION git_api_integration_example 7 | API_PROVIDER = git_https_api 8 | API_ALLOWED_PREFIXES = ('https://github.com/') 9 | ALLOWED_AUTHENTICATION_SECRETS = (git_secret_example) 10 | ENABLED = TRUE; 11 | 12 | DROP SECRET git_secret_example; 13 | DROP API INTEGRATION git_api_integration_example; -------------------------------------------------------------------------------- /config.toml: -------------------------------------------------------------------------------- 1 | default_connection_name = default 2 | --------------------------------------------------------------------------------
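For reference, below is a minimal sketch of what a fuller `config.toml` for a Snowflake CLI connection could look like. This is a hypothetical illustration, not a file from this repo: the connection name `default` matches the `default_connection_name` above, but the account, user, password, and warehouse values are placeholders you would replace with your own. Note that TOML string values, including `default_connection_name`, should be quoted.

# Hypothetical sketch only -- replace the placeholder values with your own connection details.
default_connection_name = "default"      # TOML strings must be quoted

[connections.default]
account = "<your_account_identifier>"    # placeholder
user = "<your_username>"                 # placeholder
password = "<your_password>"             # placeholder; prefer a secrets store over plain text
warehouse = "<your_warehouse>"           # optional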