├── .devcontainer ├── Dockerfile ├── Microsoft.PowerShell_profile.ps1 ├── config.fish ├── devcontainer.json └── requirements.txt ├── .gitattributes ├── .github └── workflows │ ├── ruff.yml │ ├── sqlfluff.yml │ └── yamllint-ci.yml ├── .gitignore ├── .sqlfluff ├── .sqlfluffignore ├── .yamllint ├── CI_Examples ├── python-ci.yml ├── python-pr.yml ├── sqlfluff-ci.yml ├── sqlfluff-pr.yml ├── yamllint-ci.yml └── yamllint-pr.yml ├── Docker ├── Dockerfile ├── Populate_SQL_Server_Docker_Container.py ├── docker-compose.yml └── requirements.txt ├── Fivetran └── disable_tables_with_zero_rows_fivetran_api.py ├── Python ├── Snowflake_Insert_Statements.py ├── Snowpark_Backload_API_Data.py ├── Snowpark_Create_Stored_Procedure.py ├── Snowpark_Example_Backload_SQL_Server_Data.py ├── Stack.ipynb ├── compare_two_lists_for_differences.py ├── connecting_to_snowflake_using_python.py ├── connecting_to_sql_server_using_python.py ├── determine_sql_field_length.py ├── load_json_to_snowflake.py ├── parse_xml_compare_differences.py ├── pull_records_for_all_sql_tables.py ├── read_sql_server_write_snowflake.py ├── sql_insert_statement_from_csv.py └── sql_style_join_csv.py ├── README.md ├── SQL ├── Load_CSV_to_Snowflake │ ├── PUT.sql │ ├── Snowflake_Worksheet_Load_CSV.sql │ └── snowsql.sh ├── Snowflake_Account_Setup.sql ├── Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql ├── Snowflake_Basic_CDC_Pipeline_Using_Streams_Tasks.sql ├── Snowflake_Clean_Staging_Area.sql ├── Snowflake_Cloning.sql ├── Snowflake_Data_Pipeline_From_Internal_Stage.sql ├── Snowflake_Find_Duplicates.sql ├── Snowflake_Find_Missing_Dates.sql ├── Snowflake_Flatten_JSON_Example.sql ├── Snowflake_ForLoop_GrantPermissions.sql ├── Snowflake_Merge_Into_Example.sql ├── Snowflake_Python_Stored_Procedure_Example.sql ├── Snowflake_Shorten_Huge_Union_Queries.sql └── Snowflake_Time_Travel.sql ├── Shell ├── Create_gitignore_and_add_lines.sh ├── Microsoft.PowerShell_profile.ps1 ├── Pass_secret_at_runtime_to_py_script.ps1 ├── Search_specific_branch_name.ps1 ├── Search_specific_branch_name.sh ├── create_gitignore_and_add_lines.ps1 ├── git_mv_multiple_files.ps1 ├── run_all_python_files_in_dir.ps1 └── run_groovy_script_in_Docker.sh ├── SnowSQL_CICD ├── build.yml ├── deploy.yml ├── snowsql.yml └── sqlfluff_pr_check.yml ├── dbt ├── dbt_python_model_example.py └── filter_dbt_catalog_query_snowflake.sql └── requirements.txt /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/go/.devcontainer/base.Dockerfile 2 | 3 | # [Choice] Go version (use -bullseye variants on local arm64/Apple Silicon): 1, 1.16, 1.17, 1-bullseye, 1.16-bullseye, 1.17-bullseye, 1-buster, 1.16-buster, 1.17-buster 4 | ARG VARIANT=1-bullseye 5 | FROM mcr.microsoft.com/vscode/devcontainers/go:0-${VARIANT} 6 | 7 | # [Choice] Node.js version: none, lts/*, 16, 14, 12, 10 8 | ARG NODE_VERSION="none" 9 | RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . 
/usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi 10 | 11 | # Install powershell 12 | ARG PS_VERSION="7.2.1" 13 | # powershell-7.3.0-linux-x64.tar.gz 14 | # powershell-7.3.0-linux-arm64.tar.gz 15 | RUN ARCH="$(dpkg --print-architecture)"; \ 16 | if [ "${ARCH}" = "amd64" ]; then \ 17 | PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-x64.tar.gz"; \ 18 | elif [ "${ARCH}" = "arm64" ]; then \ 19 | PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-arm64.tar.gz"; \ 20 | elif [ "${ARCH}" = "armhf" ]; then \ 21 | PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-arm32.tar.gz"; \ 22 | fi; \ 23 | wget https://github.com/PowerShell/PowerShell/releases/download/$PS_BIN -O pwsh.tar.gz; \ 24 | mkdir /usr/local/pwsh && \ 25 | tar Cxvfz /usr/local/pwsh pwsh.tar.gz && \ 26 | rm pwsh.tar.gz 27 | 28 | ENV PATH=$PATH:/usr/local/pwsh 29 | 30 | RUN echo 'deb http://download.opensuse.org/repositories/shells:/fish:/release:/3/Debian_11/ /' | tee /etc/apt/sources.list.d/shells:fish:release:3.list; \ 31 | curl -fsSL https://download.opensuse.org/repositories/shells:fish:release:3/Debian_11/Release.key | gpg --dearmor | tee /etc/apt/trusted.gpg.d/shells_fish_release_3.gpg > /dev/null; \ 32 | apt-get update && export DEBIAN_FRONTEND=noninteractive \ 33 | && apt-get install -y --no-install-recommends \ 34 | fish \ 35 | tmux \ 36 | fzf \ 37 | && apt-get clean 38 | 39 | ARG USERNAME=vscode 40 | 41 | # Download the oh-my-posh binary 42 | RUN mkdir /home/${USERNAME}/bin; \ 43 | wget https://github.com/JanDeDobbeleer/oh-my-posh/releases/latest/download/posh-linux-$(dpkg --print-architecture) -O /home/${USERNAME}/bin/oh-my-posh; \ 44 | chmod +x /home/${USERNAME}/bin/oh-my-posh; \ 45 | chown ${USERNAME}: /home/${USERNAME}/bin; 46 | 47 | # NOTE: devcontainers are Linux-only at this time but when 48 | # Windows or Darwin is supported someone will need to improve 49 | # the code logic above. 
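# TIP (optional, not part of the original setup): for reproducible builds the
# oh-my-posh download above could point at a tagged release instead of "latest", e.g.
# https://github.com/JanDeDobbeleer/oh-my-posh/releases/download/vX.Y.Z/posh-linux-$(dpkg --print-architecture)
# where vX.Y.Z is a placeholder for whichever release you pin.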
50 | 51 | # Setup a neat little PowerShell experience 52 | RUN pwsh -Command Install-Module posh-git -Scope AllUsers -Force; \ 53 | pwsh -Command Install-Module z -Scope AllUsers -Force; \ 54 | pwsh -Command Install-Module PSFzf -Scope AllUsers -Force; \ 55 | pwsh -Command Install-Module Terminal-Icons -Scope AllUsers -Force; 56 | 57 | # add the oh-my-posh path to the PATH variable 58 | ENV PATH "$PATH:/home/${USERNAME}/bin" 59 | 60 | # Add vscode default dir to the PATH variable 61 | ENV PATH "$PATH:/home/vscode/.local/bin" 62 | 63 | # Can be used to override the devcontainer prompt default theme: 64 | ENV POSH_THEME="https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/night-owl.omp.json" 65 | 66 | # Deploy oh-my-posh prompt to Powershell: 67 | COPY Microsoft.PowerShell_profile.ps1 /home/${USERNAME}/.config/powershell/Microsoft.PowerShell_profile.ps1 68 | 69 | # Deploy oh-my-posh prompt to Fish: 70 | COPY config.fish /home/${USERNAME}/.config/fish/config.fish 71 | 72 | # Everything runs as root during build time, so we want 73 | # to make sure the vscode user can edit these paths too: 74 | RUN chmod 777 -R /home/${USERNAME}/.config 75 | 76 | # Override vscode's own Bash prompt with oh-my-posh: 77 | RUN sed -i 's/^__bash_prompt$/#&/' /home/${USERNAME}/.bashrc && \ 78 | echo "eval \"\$(oh-my-posh init bash --config $POSH_THEME)\"" >> /home/${USERNAME}/.bashrc 79 | 80 | # Override vscode's own ZSH prompt with oh-my-posh: 81 | RUN echo "eval \"\$(oh-my-posh init zsh --config $POSH_THEME)\"" >> /home/${USERNAME}/.zshrc 82 | 83 | # Set container timezone: 84 | ARG TZ="UTC" 85 | RUN ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime 86 | 87 | # Required for Python - Confluent Kafka on M1 Silicon 88 | RUN apt update && apt -y install software-properties-common gcc 89 | RUN git clone https://github.com/edenhill/librdkafka 90 | RUN cd librdkafka && ./configure && make && make install && ldconfig 91 | 92 | # [Optional] Uncomment the next line to use go get to install anything else you need 93 | # RUN go get -x github.com/JanDeDobbeleer/battery 94 | 95 | # [Optional] Uncomment this line to install global node packages. 96 | # RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g " 2>&1 -------------------------------------------------------------------------------- /.devcontainer/Microsoft.PowerShell_profile.ps1: -------------------------------------------------------------------------------- 1 | Import-Module posh-git 2 | Import-Module PSFzf -ArgumentList 'Ctrl+t', 'Ctrl+r' 3 | Import-Module z 4 | Import-Module Terminal-Icons 5 | 6 | Set-PSReadlineKeyHandler -Key Tab -Function MenuComplete 7 | 8 | $env:POSH_GIT_ENABLED=$true 9 | oh-my-posh init pwsh --config $env:POSH_THEME | Invoke-Expression 10 | 11 | # NOTE: You can override the above env var from the devcontainer.json "args" under the "build" key. 
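# For example (illustrative snippet only - keep the values in sync with your own devcontainer.json):
#   "build": {
#     "dockerfile": "Dockerfile",
#     "args": {
#       "POSH_THEME": "https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/night-owl.omp.json"
#     }
#   }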
12 | function PassGen { 13 | param ( 14 | [int]$Length = 20 15 | ) 16 | 17 | $ValidCharacters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_-+=' 18 | $Password = '' 19 | 20 | for ($i = 0; $i -lt $Length; $i++) { 21 | $RandomIndex = Get-Random -Minimum 0 -Maximum $ValidCharacters.Length 22 | $Password += $ValidCharacters[$RandomIndex] 23 | } 24 | 25 | return $Password 26 | } 27 | 28 | Set-Alias -Name pg -Value PassGen 29 | # Aliases 30 | Set-Alias -Name ac -Value Add-Content -------------------------------------------------------------------------------- /.devcontainer/config.fish: -------------------------------------------------------------------------------- 1 | # Activate oh-my-posh prompt: 2 | oh-my-posh init fish --config $POSH_THEME | source 3 | 4 | # NOTE: You can override the above env vars from the devcontainer.json "args" under the "build" key. -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/go 3 | { 4 | "name": "oh-my-posh", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "args": { 8 | // Update the VARIANT arg to pick a version of Go: 1, 1.16, 1.17 9 | // Append -bullseye or -buster to pin to an OS version. 10 | // Use -bullseye variants on local arm64/Apple Silicon. 11 | "VARIANT": "1.19-bullseye", 12 | // Options: 13 | 14 | "POSH_THEME": "https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/clean-detailed.omp.json", 15 | 16 | // Override me with your own timezone: 17 | "TZ": "America/Moncton", 18 | // Use one of the "TZ database name" entries from: 19 | // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones 20 | 21 | "NODE_VERSION": "lts/*", 22 | //Powershell version 23 | "PS_VERSION": "7.2.1" 24 | } 25 | }, 26 | "runArgs": ["--cap-add=SYS_PTRACE", 27 | "--security-opt", 28 | "seccomp=unconfined" 29 | ], 30 | 31 | "features": { 32 | "ghcr.io/devcontainers/features/azure-cli:1": { 33 | "version": "latest" 34 | }, 35 | "ghcr.io/devcontainers/features/python:1": { 36 | "version": "3.8" 37 | }, 38 | "ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {}, 39 | "ghcr.io/devcontainers-contrib/features/terraform-asdf:2": {}, 40 | "ghcr.io/devcontainers-contrib/features/yamllint:2": {}, 41 | "ghcr.io/devcontainers/features/docker-in-docker:2": {}, 42 | "ghcr.io/devcontainers/features/docker-outside-of-docker:1": {}, 43 | "ghcr.io/devcontainers/features/github-cli:1": {}, 44 | "ghcr.io/devcontainers-contrib/features/spark-sdkman:2": { 45 | "jdkVersion": "11" 46 | }, 47 | "ghcr.io/dhoeric/features/google-cloud-cli:1": { 48 | "version": "latest" 49 | } 50 | }, 51 | 52 | // Set *default* container specific settings.json values on container create. 
53 | "customizations": { 54 | "vscode": { 55 | "settings": { 56 | "go.toolsManagement.checkForUpdates": "local", 57 | "go.useLanguageServer": true, 58 | "go.gopath": "/go", 59 | "go.goroot": "/usr/local/go", 60 | "terminal.integrated.profiles.linux": { 61 | "bash": { 62 | "path": "bash" 63 | }, 64 | "zsh": { 65 | "path": "zsh" 66 | }, 67 | "fish": { 68 | "path": "fish" 69 | }, 70 | "tmux": { 71 | "path": "tmux", 72 | "icon": "terminal-tmux" 73 | }, 74 | "pwsh": { 75 | "path": "pwsh", 76 | "icon": "terminal-powershell" 77 | } 78 | }, 79 | "terminal.integrated.defaultProfile.linux": "pwsh", 80 | "terminal.integrated.defaultProfile.windows": "pwsh", 81 | "terminal.integrated.defaultProfile.osx": "pwsh", 82 | "tasks.statusbar.default.hide": true, 83 | "terminal.integrated.tabs.defaultIcon": "terminal-powershell", 84 | "terminal.integrated.tabs.defaultColor": "terminal.ansiBlue", 85 | "workbench.colorTheme": "GitHub Dark Dimmed", 86 | "workbench.iconTheme": "material-icon-theme" 87 | }, 88 | 89 | // Add the IDs of extensions you want installed when the container is created. 90 | "extensions": [ 91 | "ms-mssql.mssql", 92 | "snowflake.snowflake-vsc", 93 | "golang.go", 94 | "ms-vscode.powershell", 95 | "ms-python.python", 96 | "ms-python.vscode-pylance", 97 | "redhat.vscode-yaml", 98 | "ms-vscode-remote.remote-containers", 99 | "ms-toolsai.jupyter", 100 | "eamodio.gitlens", 101 | "yzhang.markdown-all-in-one", 102 | "davidanson.vscode-markdownlint", 103 | "editorconfig.editorconfig", 104 | "esbenp.prettier-vscode", 105 | "github.vscode-pull-request-github", 106 | "akamud.vscode-theme-onedark", 107 | "PKief.material-icon-theme", 108 | "GitHub.github-vscode-theme", 109 | "actboy168.tasks", 110 | "bastienboutonnet.vscode-dbt", 111 | "innoverio.vscode-dbt-power-user", 112 | "redhat.vscode-xml", 113 | "adpyke.vscode-sql-formatter", 114 | "inferrinizzard.prettier-sql-vscode", 115 | "github.vscode-github-actions", 116 | "ms-python.black-formatter" 117 | ] 118 | } 119 | }, 120 | 121 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 122 | // "forwardPorts": [3000], 123 | 124 | // Use 'postCreateCommand' to run commands after the container is created. 125 | "postCreateCommand": "pip3 install --user -r .devcontainer/requirements.txt --use-pep517", 126 | 127 | // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 
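// (i.e. commenting out the "remoteUser" line below runs the container as root rather than the non-root "vscode" user)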
128 | "remoteUser": "vscode" 129 | } 130 | -------------------------------------------------------------------------------- /.devcontainer/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.5.2 2 | prefect==2.7.7 3 | prefect-sqlalchemy==0.2.2 4 | prefect-gcp[cloud_storage]==0.2.4 5 | protobuf 6 | pyarrow==10.0.1 7 | pandas-gbq==0.18.1 8 | psycopg2-binary==2.9.5 9 | sqlalchemy==1.4.46 10 | ipykernel 11 | polars 12 | dbt-core 13 | dbt-bigquery 14 | dbt-postgres 15 | dbt-snowflake 16 | pyspark 17 | # confluent-kafka==1.9.2 18 | snowflake-snowpark-python 19 | scikit-learn 20 | ruff 21 | sqlfluff -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.sql linguist-detectable=true 2 | *.yml linguist-detectable=true 3 | *.yml linguist-language=YAML 4 | *.ipynb linguist-detectable=false -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff Testing 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v2 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.x 17 | 18 | - name: Install dependencies 19 | run: pip install ruff 20 | 21 | - name: Test Ruff installation 22 | run: ruff --version 23 | 24 | - name: Run ruff 25 | run: ruff check ./Python/ 26 | -------------------------------------------------------------------------------- /.github/workflows/sqlfluff.yml: -------------------------------------------------------------------------------- 1 | name: SQLFluff Testing 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v2 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.x 17 | 18 | - name: Install dependencies 19 | run: pip install sqlfluff 20 | 21 | - name: Run SQLFluff 22 | run: git ls-files | grep \.sql | sqlfluff lint --dialect snowflake 23 | -------------------------------------------------------------------------------- /.github/workflows/yamllint-ci.yml: -------------------------------------------------------------------------------- 1 | name: yamllint testing 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v2 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.x 17 | 18 | - name: Install yamllint 19 | run: pip install yamllint 20 | 21 | - name: Run yamllint 22 | run: git ls-files | grep \.yml | yamllint . 
23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode/ 3 | /.ruff_cache 4 | /.ipynb_checkpoints 5 | Snowflake_Azure_Blob_Auto_ingest_Snowpipe.sql 6 | -------------------------------------------------------------------------------- /.sqlfluff: -------------------------------------------------------------------------------- 1 | [sqlfluff] 2 | 3 | # Supported dialects https://docs.sqlfluff.com/en/stable/dialects.html 4 | # Or run 'sqlfluff dialects' 5 | dialect = snowflake 6 | 7 | # One of [raw|jinja|python|placeholder] 8 | templater = jinja 9 | 10 | # Comma separated list of rules to exclude, or None 11 | # See https://docs.sqlfluff.com/en/stable/configuration.html#enabling-and-disabling-rules 12 | # AM04 (ambiguous.column_count) and ST06 (structure.column_order) are 13 | # two of the more controversial rules included to illustrate usage. 14 | exclude_rules = ambiguous.column_count, structure.column_order 15 | warnings = LT05 16 | 17 | # The standard max_line_length is 80 in line with the convention of 18 | # other tools and several style guides. Many projects however prefer 19 | # something a little longer. 20 | # Set to zero or negative to disable checks. 21 | max_line_length = 120 22 | 23 | # CPU processes to use while linting. 24 | # The default is "single threaded" to allow easy debugging, but this 25 | # is often undesirable at scale. 26 | # If positive, just implies number of processes. 27 | # If negative or zero, implies number_of_cpus - specified_number. 28 | # e.g. -1 means use all processors but one. 0 means all cpus. 29 | processes = -1 30 | 31 | # If using the dbt templater, we recommend setting the project dir. 32 | ; [sqlfluff:templater:dbt] 33 | ; project_dir = ./ 34 | 35 | [sqlfluff:indentation] 36 | # While implicit indents are not enabled by default. Many of the 37 | # SQLFluff maintainers do use them in their projects. 38 | allow_implicit_indents = true 39 | 40 | # The default configuration for aliasing rules is "consistent" 41 | # which will auto-detect the setting from the rest of the file. This 42 | # is less desirable in a new project and you may find this (slightly 43 | # more strict) setting more useful. 44 | [sqlfluff:rules:aliasing.table] 45 | aliasing = explicit 46 | [sqlfluff:rules:aliasing.column] 47 | aliasing = explicit 48 | [sqlfluff:rules:aliasing.length] 49 | min_alias_length = 3 50 | 51 | # The default configuration for capitalisation rules is "consistent" 52 | # which will auto-detect the setting from the rest of the file. This 53 | # is less desirable in a new project and you may find this (slightly 54 | # more strict) setting more useful. 55 | # Typically we find users rely on syntax highlighting rather than 56 | # capitalisation to distinguish between keywords and identifiers. 57 | # Clearly, if your organisation has already settled on uppercase 58 | # formatting for any of these syntax elements then set them to "upper". 
59 | # See https://stackoverflow.com/questions/608196/why-should-i-capitalize-my-sql-keywords-is-there-a-good-reason 60 | [sqlfluff:rules:capitalisation.keywords] 61 | capitalisation_policy = lower 62 | [sqlfluff:rules:capitalisation.identifiers] 63 | capitalisation_policy = lower 64 | [sqlfluff:rules:capitalisation.functions] 65 | extended_capitalisation_policy = lower 66 | [sqlfluff:rules:capitalisation.literals] 67 | capitalisation_policy = lower 68 | [sqlfluff:rules:capitalisation.types] 69 | extended_capitalisation_policy = lower 70 | -------------------------------------------------------------------------------- /.sqlfluffignore: -------------------------------------------------------------------------------- 1 | # SQLFluff doesn't work well with Snowflake loops, functions 2 | # or Python stored procedures. Ignoring those files here 3 | Snowflake_ForLoop_GrantPermissions.sql 4 | Snowflake_Python_Stored_Procedure_Example.sql 5 | Snowflake_Shorten_Huge_Union_Queries.sql 6 | Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql 7 | PUT.sql 8 | Snowflake_Time_Travel.sql 9 | -------------------------------------------------------------------------------- /.yamllint: -------------------------------------------------------------------------------- 1 | yaml-files: 2 | - '*.yml' 3 | - '*.yaml' 4 | - '.yamllint' 5 | 6 | rules: 7 | braces: enable 8 | brackets: enable 9 | colons: enable 10 | commas: enable 11 | comments: 12 | level: warning 13 | comments-indentation: 14 | level: warning 15 | document-end: disable 16 | document-start: disable 17 | empty-lines: enable 18 | empty-values: disable 19 | hyphens: enable 20 | indentation: enable 21 | key-duplicates: enable 22 | key-ordering: disable 23 | new-line-at-end-of-file: enable 24 | new-lines: disable 25 | octal-values: disable 26 | quoted-strings: disable 27 | trailing-spaces: enable 28 | truthy: 29 | level: warning 30 | # 120 chars should be enough, but don't fail if a line is longer 31 | line-length: 32 | max: 120 33 | level: warning 34 | -------------------------------------------------------------------------------- /CI_Examples/python-ci.yml: -------------------------------------------------------------------------------- 1 | name: Python Continuous Integration 2 | 3 | parameters: 4 | - name: jobName 5 | default: 'PythonCI' 6 | - name: jobDisplay 7 | default: 'Lint .py files with Ruff' 8 | 9 | trigger: 10 | branches: 11 | include: 12 | - '*' 13 | exclude: 14 | - main 15 | 16 | pool: 17 | vmImage: 'ubuntu-latest' 18 | 19 | jobs: 20 | - job: ${{ parameters.jobName }} 21 | timeoutInMinutes: 10 22 | displayName: ${{ parameters.jobDisplay }} 23 | 24 | workspace: 25 | clean: outputs 26 | 27 | steps: 28 | # Checkout repo 29 | - checkout: self 30 | fetchDepth: 0 31 | clean: true 32 | 33 | # List Pipeline directory and Build Source Version 34 | - script: | 35 | ls -R $(System.DefaultWorkingDirectory) 36 | displayName: List directory contents 37 | 38 | - script: | 39 | echo "Build.SourceVersion: $(Build.SourceVersion)" 40 | displayName: Print Build.SourceVersion 41 | 42 | # Install Ruff 43 | - script: | 44 | pip install ruff 45 | displayName: Install Ruff 46 | 47 | # Test Ruff Installation and list all .py files in repo 48 | - script: | 49 | ruff --version 50 | echo "All changes in this commit:" 51 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.py$' || 52 | echo "No Python files changed." 
53 | displayName: Test Ruff Install, List all .py files 54 | 55 | # Lint Python 56 | - script: | 57 | changed=( $(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.py$') ) 58 | if [[ "${#changed[@]}" -gt 0 ]]; then 59 | failed=false 60 | for filename in "${changed[@]}"; do 61 | if [[ -f "$filename" ]]; then 62 | echo "linting $filename" 63 | ruff check "$filename" || failed=true 64 | else 65 | echo "File not found: $filename" 66 | fi 67 | done 68 | if [[ $failed == true ]]; then 69 | exit 1 70 | fi 71 | fi 72 | displayName: Lint .py files 73 | -------------------------------------------------------------------------------- /CI_Examples/python-pr.yml: -------------------------------------------------------------------------------- 1 | name: Python Pull Request Check 2 | 3 | parameters: 4 | - name: jobName 5 | default: 'PythonCI' 6 | - name: jobDisplay 7 | default: 'Lint repo with Ruff + run all unit tests' 8 | 9 | trigger: 10 | branches: 11 | include: 12 | - main 13 | 14 | pool: 15 | vmImage: 'ubuntu-latest' 16 | 17 | jobs: 18 | - job: ${{ parameters.jobName }} 19 | timeoutInMinutes: 30 20 | displayName: ${{ parameters.jobDisplay }} 21 | 22 | workspace: 23 | clean: outputs 24 | 25 | steps: 26 | # Checkout repo 27 | - checkout: self 28 | fetchDepth: 1 29 | clean: true 30 | 31 | # Install Ruff 32 | - script: | 33 | pip install pytest pytest-azurepipelines pytest-cov ruff 34 | displayName: Install Pytest, Pytest Code Coverage and Ruff 35 | 36 | # Test Ruff Installation and list all files in repo 37 | - script: | 38 | echo "Ruff Version:" && ruff --version 39 | echo "Pytest Version:" && pytest --version 40 | echo "Pytest Coverage Version:" && pytest-cov --version 41 | echo "Pytest Azure Pipelines Version:" && pytest-azurepipelines --version 42 | git ls-files | grep '\.py$' 43 | displayName: Test Installs, List all files for CI 44 | 45 | # Lint SQL 46 | - script: | 47 | git ls-files | grep '\.py$' | ruff check . 48 | displayName: Analyzing the code with Ruff 49 | continueOnError: true 50 | 51 | - script: | 52 | pytest tests/ --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-report=xml 53 | displayName: Run all Python unit tests 54 | condition: always() 55 | continueOnError: false 56 | 57 | - task: PublishCodeCoverageResults@1 58 | inputs: 59 | codeCoverageTool: Cobertura 60 | summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml' 61 | -------------------------------------------------------------------------------- /CI_Examples/sqlfluff-ci.yml: -------------------------------------------------------------------------------- 1 | # Azure CI pipeline that lints new/modified SQL files after every push to a git repository. 
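# Roughly equivalent local check before pushing (assumes your changes are already committed):
#   git diff-tree --no-commit-id --name-only -r HEAD | grep '\.sql$' | xargs -r sqlfluff lint --dialect snowflake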
2 | # See the SQLFluff GitHub for more info: [SQLFluff](https://github.com/sqlfluff/sqlfluff) 3 | name: SQLFluff Continuous Integration 4 | 5 | parameters: 6 | - name: jobName 7 | default: 'SQLFluffCI' 8 | - name: jobDisplay 9 | default: 'Lint .sql files with SQLFluff' 10 | 11 | trigger: 12 | branches: 13 | include: 14 | - '*' 15 | exclude: 16 | - main 17 | 18 | pool: 19 | vmImage: 'ubuntu-latest' 20 | 21 | jobs: 22 | - job: ${{ parameters.jobName }} 23 | timeoutInMinutes: 10 24 | displayName: ${{ parameters.jobDisplay }} 25 | 26 | workspace: 27 | clean: outputs 28 | 29 | steps: 30 | # Checkout repo 31 | - checkout: self 32 | fetchDepth: 0 33 | clean: true 34 | 35 | # List Pipeline directory and Build Source Version 36 | - script: | 37 | ls -R $(System.DefaultWorkingDirectory) 38 | displayName: List directory contents 39 | 40 | - script: | 41 | echo "Build.SourceVersion: $(Build.SourceVersion)" 42 | displayName: Print Build.SourceVersion 43 | 44 | # Install SQLFluff 45 | - script: | 46 | pip install sqlfluff 47 | displayName: Install SQLFluff 48 | 49 | # Test SQLFluff Installation and list all .sql files in repo 50 | - script: | 51 | sqlfluff --version 52 | echo "All changes in this commit:" 53 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.sql$' || 54 | echo "No SQL files changed." 55 | displayName: Test SQLFluff Install, List all .sql files 56 | 57 | # Lint SQL 58 | - script: | 59 | changed=( $(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.sql$') ) 60 | if [[ "${#changed[@]}" -gt 0 ]]; then 61 | failed=false 62 | for filename in "${changed[@]}"; do 63 | if [[ -f "$filename" ]]; then 64 | echo "linting $filename" 65 | sqlfluff lint "$filename" --dialect snowflake || failed=true 66 | else 67 | echo "File not found: $filename" 68 | fi 69 | done 70 | if [[ $failed == true ]]; then 71 | exit 1 72 | fi 73 | fi 74 | displayName: Lint .sql files 75 | -------------------------------------------------------------------------------- /CI_Examples/sqlfluff-pr.yml: -------------------------------------------------------------------------------- 1 | # Azure CI pipeline that lints all SQL files during a PR. 
2 | # See the SQLFluff GitHub for more info: [SQLFluff](https://github.com/sqlfluff/sqlfluff) 3 | name: SQLFluff PR Check 4 | 5 | parameters: 6 | - name: jobName 7 | default: 'SQLFluffPR' 8 | - name: jobDisplay 9 | default: 'Lint repo with SQLFluff' 10 | 11 | trigger: 12 | branches: 13 | include: 14 | - main 15 | 16 | pool: 17 | vmImage: 'ubuntu-latest' 18 | 19 | jobs: 20 | - job: ${{ parameters.jobName }} 21 | timeoutInMinutes: 30 22 | displayName: ${{ parameters.jobDisplay }} 23 | 24 | workspace: 25 | clean: outputs 26 | 27 | steps: 28 | # Checkout repo 29 | - checkout: self 30 | fetchDepth: 1 31 | clean: true 32 | 33 | # Install SQLFluff 34 | - script: | 35 | pip install sqlfluff 36 | displayName: Download and Install SQLFluff 37 | 38 | # Test SQLFluff Installation and list all files in repo 39 | - script: | 40 | sqlfluff --version 41 | git ls-files | grep \.sql 42 | displayName: Test SQLFluff Install, List all files for CI 43 | 44 | # Lint SQL 45 | - script: | 46 | git ls-files | grep \.sql | sqlfluff lint --dialect snowflake 47 | displayName: Analyzing the code with SQLFluff 48 | -------------------------------------------------------------------------------- /CI_Examples/yamllint-ci.yml: -------------------------------------------------------------------------------- 1 | # Azure CI pipeline that lints YAML files in the dbt repository. 2 | # See the yamllint GitHub for more info: [yamllint](https://github.com/adrienverge/yamllint) 3 | name: yamllint Continuous Integration 4 | 5 | parameters: 6 | - name: jobName 7 | default: 'YAMLLintCI' 8 | - name: jobDisplay 9 | default: 'Lint .yml files with YAMLLint' 10 | 11 | trigger: 12 | branches: 13 | include: 14 | - '*' 15 | exclude: 16 | - main 17 | 18 | pool: 19 | vmImage: 'ubuntu-latest' 20 | 21 | jobs: 22 | - job: ${{ parameters.jobName}} 23 | timeoutInMinutes: 10 24 | displayName: ${{ parameters.jobDisplay }} 25 | 26 | workspace: 27 | clean: outputs 28 | 29 | steps: 30 | # Checkout repo 31 | - checkout: self 32 | fetchDepth: 0 33 | clean: true 34 | 35 | # List Pipeline directory and Build Source Version 36 | - script: | 37 | ls -R $(System.DefaultWorkingDirectory) 38 | displayName: List directory contents 39 | 40 | - script: | 41 | echo "Build.SourceVersion: $(Build.SourceVersion)" 42 | displayName: Print Build.SourceVersion 43 | 44 | # Install yamllint 45 | - script: | 46 | pip install yamllint 47 | displayName: Install yamllint 48 | 49 | # Test yamllint Installation and list all .yml files in repo 50 | - script: | 51 | yamllint --version 52 | echo "All changes in this commit:" 53 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.yml$' || 54 | echo "No YAML files changed." 55 | displayName: Test yamllint Install, List all .yml files 56 | 57 | # Lint YAML 58 | - script: | 59 | changed=( $(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.yml$') ) 60 | if [[ "${#changed[@]}" -gt 0 ]]; then 61 | failed=false 62 | for filename in "${changed[@]}"; do 63 | if [[ -f "$filename" ]]; then 64 | echo "linting $filename" 65 | yamllint "$filename" || failed=true 66 | else 67 | echo "File not found: $filename" 68 | fi 69 | done 70 | if [[ $failed == true ]]; then 71 | exit 1 72 | fi 73 | fi 74 | displayName: Lint .yml files 75 | -------------------------------------------------------------------------------- /CI_Examples/yamllint-pr.yml: -------------------------------------------------------------------------------- 1 | # Azure CI pipeline that lints all YAML files during a PR. 
2 | # See the yamllint GitHub for more info: [yamllint](https://github.com/adrienverge/yamllint) 3 | name: yamllint PR Check 4 | 5 | parameters: 6 | - name: jobName 7 | default: 'yamllintPR' 8 | - name: jobDisplay 9 | default: 'Lint .yml files with yamllint' 10 | 11 | trigger: 12 | branches: 13 | include: 14 | - main 15 | 16 | pool: 17 | vmImage: 'ubuntu-latest' 18 | 19 | jobs: 20 | - job: ${{ parameters.jobName}} 21 | timeoutInMinutes: 10 22 | displayName: ${{ parameters.jobDisplay }} 23 | 24 | workspace: 25 | clean: outputs 26 | 27 | steps: 28 | # Checkout repo 29 | - checkout: self 30 | fetchDepth: 1 31 | clean: true 32 | 33 | # Install yamllint 34 | - script: | 35 | pip install yamllint 36 | displayName: Download yamllint 37 | 38 | # Test yamllint installation and list all .yml files in the repo 39 | - script: | 40 | yamllint --version 41 | git ls-files | grep \.yml 42 | displayName: Test yamllint Install, list all .yml files 43 | 44 | # Lint YAML 45 | - script: | 46 | git ls-files | grep \.yml | yamllint . 47 | displayName: Lint .yml files 48 | -------------------------------------------------------------------------------- /Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # I wrote this Dockerfile to run the Python script inside of my container, but it doesn't work because of our Windows auth at work 2 | FROM python:3.8-slim 3 | 4 | ENV DEBIAN_FRONTEND="noninteractive"\ 5 | ACCEPT_EULA="y" 6 | 7 | # install system dependencies 8 | # Microsoft SQL Server Prerequisites 9 | RUN apt-get update -y \ 10 | && apt-get install -y gcc curl gnupg build-essential\ 11 | unixodbc unixodbc-dev tdsodbc freetds-common freetds-bin freetds-dev\ 12 | && curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - \ 13 | && curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list \ 14 | && apt-get update \ 15 | && apt-get install -y --no-install-recommends locales apt-transport-https\ 16 | && apt-get -y --no-install-recommends install msodbcsql18 unixodbc-dev 17 | 18 | WORKDIR /usr/src/app 19 | 20 | COPY requirements.txt ./ 21 | 22 | RUN pip install --no-cache-dir -r requirements.txt 23 | 24 | COPY . . 25 | 26 | CMD [ "python", "./SQL_Server_ForLoop.py" ] 27 | -------------------------------------------------------------------------------- /Docker/Populate_SQL_Server_Docker_Container.py: -------------------------------------------------------------------------------- 1 | """Title: Populate SQL Server Docker Container with production data 2 | By: Martin Palkovic 3 | Date: 2022-07-25 4 | Description: Recently I had a need for a small, lightweight SQL Server development 5 | environment where I could play around with data and not impact anything in production. 6 | This python script was my solution - it iteratively creates and populates tables 7 | in a test database that resides within a docker container. 8 | 9 | Due to our Windows auth at work, I couldn't get this to run in a docker-compose 10 | file (i.e within the container). 
The solution is to run docker-compose to initialize 11 | SQL Server in the container, and then run this script locally 12 | 13 | Exec in shell: 14 | cd your/file/location 15 | docker-compose up 16 | python3 Populate_SQL_Server_Docker_Container.py 17 | """ 18 | 19 | #import modules 20 | import pandas as pd 21 | from sqlalchemy.engine import URL 22 | from sqlalchemy import create_engine 23 | 24 | #server credentials - prod 25 | prod_server = 'prod_server' 26 | prod_db = 'prod_db' 27 | 28 | #server credentials - docker 29 | docker_server = 'localhost' 30 | docker_db = 'test_db' 31 | username = 'sa' 32 | password = 'Your-Strong!Password@Here%' 33 | #------------------------- 34 | driver = 'SQL Server' 35 | schema = 'dbo' 36 | 37 | def sqlalchemy_cnxn(driver, server, db): 38 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}" 39 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection}) 40 | engine = create_engine(url) 41 | return engine 42 | 43 | # SQLAlchemy for Prod 44 | prod_engine = sqlalchemy_cnxn(driver, prod_server, prod_db) 45 | 46 | # SQLAlchemy for Docker 47 | docker_engine = sqlalchemy_cnxn(driver, docker_server, docker_db) 48 | 49 | docker_engine.execute(''' 50 | if not exists (select 1 from sys.databases where name = N'test_db') 51 | create database test_db; 52 | ''' 53 | ) 54 | 55 | """create a list of each table in the database, 56 | and remove table names from the list that contain numbers 57 | (i.e duplicates/backups with dates on the end) 58 | If you only want certain tables, you can manipulate this list however you like. 59 | Only table names on this list will be queried from your prod database in the 60 | for loop below""" 61 | prod_tables = [table for table in prod_engine.table_names()] 62 | prod_tables = [i for i in prod_tables if not any(char.isdigit() for char in i)] 63 | 64 | # This block is needed to connect to the db now that we have created it 65 | docker_engine = sqlalchemy_cnxn(driver, docker_server, docker_db) 66 | 67 | """iterate over each table to populate the Docker container 68 | Note that this takes ~1 min per 50 tables""" 69 | for table in prod_tables: 70 | try: 71 | #read 72 | query = f'select top 1000 * from {prod_db}.{schema}.{table}' 73 | results = prod_engine.execute(query) 74 | df_sql = pd.read_sql(query, prod_engine) 75 | 76 | #write 77 | df_sql.to_sql(f'{table}', schema= f'{schema}', 78 | con = docker_engine, chunksize=1, 79 | index=False, if_exists='replace') 80 | except Exception: 81 | print(f'failed to insert {table} to docker container') -------------------------------------------------------------------------------- /Docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.9' 2 | 3 | services: 4 | SQL-Server: 5 | image: mcr.microsoft.com/mssql/server:2022-latest 6 | container_name: SQL_Server_Dev_Environment 7 | restart: unless-stopped 8 | ports: 9 | - "1433:1433" 10 | environment: 11 | - ACCEPT_EULA=Y 12 | - SA_PASSWORD=-Your-Strong!Password@Here% 13 | 14 | # I cant actually get this to work due to our Windows auth/active directory situation at Cooke... 
15 | # i.e from within the container, my script doesn't know how to authenticate to our production SQL server 16 | # python: 17 | # container_name: SQL_Server_Python_Script 18 | # build: ./ 19 | # command: python3 ./SQL_Server_ForLoop.py 20 | -------------------------------------------------------------------------------- /Docker/requirements.txt: -------------------------------------------------------------------------------- 1 | pyodbc 2 | sqlalchemy 3 | pandas 4 | numpy -------------------------------------------------------------------------------- /Fivetran/disable_tables_with_zero_rows_fivetran_api.py: -------------------------------------------------------------------------------- 1 | """ Import Modules """ 2 | import os 3 | import json 4 | import requests 5 | import pandas as pd 6 | 7 | from sqlalchemy.engine import URL 8 | from sqlalchemy import create_engine 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | # Retrieve Fivetran secrets 14 | fivetran_key = os.getenv("FIVETRAN_KEY") 15 | fivetran_secret = os.getenv("FIVETRAN_SECRET") 16 | 17 | # -------------------------------------------- 18 | """ Retrieve list of Fivetran connector IDs""" 19 | 20 | # Define API variables 21 | group_id = "my_fivetran_group_id" 22 | url = "https://api.fivetran.com/v1/groups/" + group_id + "/connectors" 23 | headers = {"Accept": "application/json"} 24 | 25 | # API GET request 26 | response = requests.get(url, headers=headers, auth=(fivetran_key, fivetran_secret)) 27 | data = response.json() 28 | 29 | # Save Fivetran connector list to file 30 | with open("fivetran_connector_list.json", "w") as file: 31 | json.dump(data, file, indent=4) 32 | 33 | # Create a dictionary containing the database name(key) and connector ID(value) 34 | connector_id_dict = { 35 | item["schema"].upper() 36 | if item["schema"] != "db_name_you_want_capitalized" 37 | else item["schema"].capitalize(): item["id"] 38 | for item in data["data"]["items"] 39 | } 40 | 41 | print( 42 | f"""Dictionary of connector ID's for Fivetran databases: 43 | {connector_id_dict} \n""" 44 | ) 45 | 46 | # ------------------------------------------------------------------ 47 | """ Establish SQL Server Connection""" 48 | 49 | # Define variables 50 | driver = "SQL Server" 51 | server = "my_server" 52 | 53 | # Define connection function 54 | def sqlalchemy_cnxn(driver, server, db): 55 | """ Function for connecting to SQL Server via SQLAlchemy """ 56 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}" 57 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection}) 58 | engine = create_engine(url) 59 | return engine 60 | 61 | # ------------------------------------------------------------ 62 | """ Loop over list of databases/connector IDs to retrive tables 63 | with 0 rows from SQL server, and call a PATCH request with the Fivetran API 64 | to disable tables with 0 rows for that connector""" 65 | 66 | for database in connector_id_dict.keys(): 67 | engine = sqlalchemy_cnxn(driver, server, database) 68 | 69 | print(f"successfully connected to {server}.{database}!\n") 70 | print() # new line 71 | 72 | # Query the sys schema for the database to get tables with 0 rows of data 73 | query = f""" 74 | SELECT 75 | t.NAME AS TableName, 76 | p.rows AS RowCounts 77 | FROM {database}.sys.tables AS t 78 | 79 | INNER JOIN {database}.sys.partitions AS p 80 | ON t.object_id = p.OBJECT_ID 81 | 82 | WHERE 83 | t.NAME NOT LIKE 'dt%' 84 | AND t.is_ms_shipped = 0 85 | AND p.rows = 0 86 | 87 | GROUP BY 88 | t.Name, p.Rows 89 | 90 | ORDER BY 
91 | t.Name 92 | """ 93 | 94 | # load results of query to Pandas dataframe 95 | df = pd.read_sql(query, engine) 96 | 97 | print(f"tables with 0 rows of data in {database} database: {len(df)}\n") 98 | 99 | tables_to_unsync = df["TableName"].tolist() 100 | 101 | # Create a JSON payload of tables to disable 102 | tables_payload = {table_name: {"enabled": False} for table_name in tables_to_unsync} 103 | payload = {"enabled": True, "tables": tables_payload} 104 | 105 | # For testing, if needed 106 | # with open(f"{database}_payload.json", "w") as file: 107 | # json.dump(payload, file, indent = 4) 108 | 109 | # # ###################################### 110 | """ Fivetran API Call to disable tables""" 111 | 112 | connector_id = connector_id_dict[database] 113 | print(f"Connector ID for {database}: {connector_id}\n") 114 | 115 | schema_name = "dbo" 116 | url = ( 117 | "https://api.fivetran.com/v1/connectors/" 118 | + connector_id 119 | + "/schemas/" 120 | + schema_name 121 | ) 122 | 123 | headers = {"Content-Type": "application/json", "Accept": "application/json"} 124 | 125 | """Fivetran API call - comment this block if you are testing the script""" 126 | response = requests.patch(url, 127 | json = payload, 128 | headers = headers, 129 | auth = (fivetran_key, fivetran_secret)) 130 | 131 | data = response.json() 132 | print(f"Successfully called the Fivetran API for the {connector_id} connector!\n") 133 | 134 | # For testing, if needed 135 | # with open('fivetran_api_response.json', 'w') as file: 136 | # file.write(str(data)) 137 | # print(f"Successfully saved logs to file!\n") 138 | 139 | # break #LEAVE THIS IN IF YOU ARE TESTING 140 | -------------------------------------------------------------------------------- /Python/Snowflake_Insert_Statements.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | df = pd.read_csv(r"your/file/here.csv") 5 | df = df.replace({np.nan: "NULL"}) 6 | 7 | print("successfully read csv!\n") 8 | 9 | 10 | def sql_insert_statement_from_dataframe(source, target): 11 | print("insert into " + target + "(" + str(", ".join(source.columns)) + ") values ") 12 | for i, x in source.iterrows(): 13 | values = x.values 14 | formatted_values = [] 15 | for val in values: 16 | if val == "NULL": 17 | formatted_values.append(val) 18 | else: 19 | formatted_values.append("'" + str(val) + "'") 20 | if i == len(source) - 1: 21 | print("(" + str(", ".join(formatted_values)) + ");") 22 | else: 23 | print("(" + str(", ".join(formatted_values)) + "),") 24 | 25 | 26 | sql_insert_statement_from_dataframe(df, "my_db.my_schema.my_table") 27 | -------------------------------------------------------------------------------- /Python/Snowpark_Backload_API_Data.py: -------------------------------------------------------------------------------- 1 | # **********************************************************************# 2 | # Title: Backload API data using Snowpark Python 3 | # By: Martin Palkovic 4 | # Date: 2022-11-18 5 | # Description: Here is another Snowpark example, where you can loop through 6 | # an API call and insert the JSON response for each days worth of data 7 | # into a VARIANT table in Snowflake 8 | # *********************************************************************# 9 | 10 | # Import modules 11 | import os 12 | import json 13 | import requests 14 | 15 | from datetime import date, timedelta 16 | from snowflake.snowpark import Session 17 | 18 | from dotenv import load_dotenv 19 | 20 | load_dotenv() 
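# The .env file is expected to provide SNOWFLAKE_ACCT, SNOWFLAKE_USER, SNOWFLAKE_PASSWORD,
# SNOWFLAKE_ROLE and MY_API_KEY (see the os.getenv calls below). The VARIANT landing table
# referred to as MY_TABLE would look roughly like this (illustrative DDL, not created by this script):
#   CREATE TABLE IF NOT EXISTS MY_TABLE (JSON_DATA VARIANT, INSERT_DATE TIMESTAMP_LTZ);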
21 | 22 | # Establish Snowflake Connection using Snowpark 23 | account = os.getenv("SNOWFLAKE_ACCT") 24 | user = os.getenv("SNOWFLAKE_USER") 25 | password = os.getenv("SNOWFLAKE_PASSWORD") 26 | role = os.getenv("SNOWFLAKE_ROLE") 27 | role = "SYSADMIN" 28 | warehouse = "MY_WH" 29 | database = "DEV" 30 | schema = "MY_SCHEMA" 31 | target_table = "MY_TABLE" 32 | 33 | api_key = os.getenv("MY_API_KEY") 34 | 35 | 36 | def snowpark_cnxn(account, user, password, role, warehouse, database, schema): 37 | connection_parameters = { 38 | "account": account, 39 | "user": user, 40 | "password": password, 41 | "role": role, 42 | "warehouse": warehouse, 43 | "database": database, 44 | "schema": schema, 45 | } 46 | session = Session.builder.configs(connection_parameters).create() 47 | return session 48 | 49 | 50 | session = snowpark_cnxn(account, user, password, role, warehouse, database, schema) 51 | 52 | print( 53 | session.sql( 54 | "SELECT CURRENT_WAREHOUSE(), CURRENT_DATABASE(), CURRENT_SCHEMA()" 55 | ).collect() 56 | ) 57 | 58 | # API variables 59 | headers = {"APIKey": f"{api_key}"} 60 | 61 | 62 | # Define a function so we can loop over a date range 63 | def daterange(start_date, end_date): 64 | for n in range(int((end_date - start_date).days)): 65 | yield start_date + timedelta(n) 66 | 67 | 68 | start_date = date(2019, 1, 1) 69 | end_date = date(2022, 11, 18) 70 | 71 | # Loop through 4 years worth of API data, insert into Snowflake VARIANT table 72 | for dates in daterange(start_date, end_date): 73 | url = f"https://api.mywebsite.com/api/data?&startDate={date}&endDate={date}" 74 | response = requests.request("GET", url, headers=headers) 75 | 76 | formatted_json = json.loads(response.text) 77 | formatted_json = json.dumps(formatted_json, indent=4) 78 | 79 | # insert to Snowflake 80 | session.sql( 81 | f"""INSERT INTO {target_table} (JSON_DATA, INSERT_DATE) 82 | SELECT PARSE_JSON('{formatted_json}'), 83 | CURRENT_TIMESTAMP();""" 84 | ).collect() 85 | -------------------------------------------------------------------------------- /Python/Snowpark_Create_Stored_Procedure.py: -------------------------------------------------------------------------------- 1 | # This only runs on a Python 3.8 environment 2 | 3 | # import modules 4 | import os 5 | import snowflake 6 | import pandas as pd 7 | 8 | from snowflake.snowpark import Session 9 | from snowflake.snowpark.types import StringType 10 | 11 | from dotenv import load_dotenv 12 | 13 | load_dotenv() 14 | 15 | # Establish Snowflake Connection 16 | account = os.getenv("SNOWFLAKE_ACCT") 17 | user = os.getenv("SNOWFLAKE_USER") 18 | password = os.getenv("SNOWFLAKE_PASSWORD") 19 | role = os.getenv("SNOWFLAKE_ROLE") 20 | warehouse = "REPORTING_WH" 21 | database = "STAGING_DEV" 22 | schema = "MISC" 23 | 24 | 25 | def snowpark_cnxn(account, user, password, role, warehouse, database, schema): 26 | connection_parameters = { 27 | "account": account, 28 | "user": user, 29 | "password": password, 30 | "role": role, 31 | "warehouse": warehouse, 32 | "database": database, 33 | "schema": schema, 34 | } 35 | session = Session.builder.configs(connection_parameters).create() 36 | return session 37 | 38 | 39 | print("Connecting to Snowpark...\n") 40 | session = snowpark_cnxn(account, user, password, role, warehouse, database, schema) 41 | 42 | print( 43 | session.sql( 44 | "select current_warehouse(), current_database(), current_schema()" 45 | ).collect(), 46 | "\n", 47 | ) 48 | print("Connected!\n") 49 | 50 | session.sql( 51 | """create or replace table 52 | mytable(amount 
number comment 'fake amounts for testing', 53 | fruits string comment 'fake types of fruit for testing')""" 54 | ).show() 55 | 56 | session.sql("""create or replace table mytable2 like mytable""").show() 57 | 58 | session.sql( 59 | """insert into mytable values (1, 'apple'), 60 | (2, 'orange'), 61 | (5, 'grape'), 62 | (7, 'cantelope'), 63 | (9, 'pineapple'), 64 | (17, 'banana'), 65 | (21, 'tangerine')""" 66 | ).show() 67 | 68 | session.sql( 69 | """insert into mytable2 values (1, 'apple'), 70 | (3, 'orange'), 71 | (5, 'grape'), 72 | (7, 'strawberry'), 73 | (10, 'pineapple'), 74 | (17, 'banana'), 75 | (22, 'raspberry')""" 76 | ).show() 77 | 78 | 79 | def print_differences( 80 | session: snowflake.snowpark.Session, 81 | table1: str, 82 | table2: str, 83 | field1: str, 84 | field2: str, 85 | ): 86 | # read the tables into a snowpark dataframe 87 | table1 = session.table(table1) 88 | table2 = session.table(table2) 89 | 90 | # convert to pandas 91 | df1 = table1.to_pandas() 92 | df2 = table2.to_pandas() 93 | 94 | # convert the the fields of interest from each table to a list 95 | list1 = df1[field1].to_list() 96 | list2 = df2[field2].to_list() 97 | 98 | return ", ".join(item for item in list1 if item not in list2) 99 | 100 | 101 | session.add_packages("snowflake-snowpark-python") 102 | 103 | print("Registering Stored Procedure with Snowflake...\n") 104 | 105 | session.sproc.register( 106 | func=print_differences, 107 | return_type=StringType(), 108 | input_types=[StringType(), StringType(), StringType(), StringType()], 109 | is_permanent=True, 110 | name="PRINT_DIFFERENCES", 111 | replace=True, 112 | stage_location="@UDF_STAGE", 113 | ) 114 | 115 | print("Stored Procedure registered with Snowflake!\n") 116 | 117 | # You can return the results on one line using the sql() method: 118 | """session.sql('''call print_differences('MYTABLE', 119 | 'MYTABLE2', 120 | 'FRUITS', 121 | 'FRUITS')''').show()""" 122 | 123 | # Call stored procedure, print results as dataframe 124 | x = session.call("print_differences", "MYTABLE", "MYTABLE2", "FRUITS", "FRUITS") 125 | print(x, "\n") 126 | 127 | df = pd.DataFrame({"Differences": x.split(",")}) 128 | print(df) 129 | -------------------------------------------------------------------------------- /Python/Snowpark_Example_Backload_SQL_Server_Data.py: -------------------------------------------------------------------------------- 1 | # **********************************************************************# 2 | # Title: Basic Snowpark Example for backloading data to Snowflake 3 | # By: Martin Palkovic 4 | # Date: 2022-11-18 5 | # Description: Recently I needed to backload some exchange rate data into Snowflake from 6 | # SQL Server, and was excited because I got to test out Snowpark! It is a really nice 7 | # way to interact with Snowflake using Python. 
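# The flow below: read the source table from SQL Server via SQLAlchemy, reshape it in
# pandas, stage it as a temporary Snowpark table, then either INSERT OVERWRITE or
# MERGE it into the target Snowflake table.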
8 | # *********************************************************************# 9 | 10 | # Import modules 11 | import os 12 | from sqlalchemy.engine import URL 13 | from sqlalchemy import create_engine 14 | 15 | import pandas as pd 16 | 17 | from snowflake.snowpark import Session 18 | 19 | from dotenv import load_dotenv 20 | 21 | load_dotenv() 22 | 23 | # Establish SQL Server Connection 24 | driver = "SQL Server" 25 | server = "my_server" 26 | database = "my_db" 27 | schema = "dbo" 28 | table = "Daily_Exchange_Rates" 29 | 30 | 31 | # Define connection function 32 | def sqlalchemy_cnxn(driver, server, db): 33 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}" 34 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection}) 35 | engine = create_engine(url) 36 | return engine 37 | 38 | 39 | engine = sqlalchemy_cnxn(driver, server, database) 40 | 41 | # If you're not performing any data transformation at the 42 | # SQL Server level, this is a great way to parameterize column names 43 | columns = f"""SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS 44 | WHERE TABLE_NAME LIKE N'{table}'""" 45 | 46 | df_cols = pd.read_sql(columns, engine) 47 | columns = ", ".join(df_cols["COLUMN_NAME"].to_list()) 48 | 49 | query = f"""SELECT {columns} FROM {database}.{schema}.{table}""" 50 | 51 | # load query to dataframe 52 | df_fx = pd.read_sql(query, engine) 53 | print("Total records from SQL Server:", len(df_fx)) 54 | 55 | # -------------------------------------------- 56 | 57 | # Establish Snowpark Connection 58 | account = os.getenv("SNOWFLAKE_ACCT") 59 | user = os.getenv("SNOWFLAKE_USER") 60 | password = os.getenv("SNOWFLAKE_PASSWORD") 61 | role = os.getenv("SNOWFLAKE_ROLE") 62 | warehouse = "REPORTING_WH" 63 | database = "DEV" 64 | schema = "MY_SCHEMA" 65 | target_table = "CURRENCY_EXCHANGE_RATES" 66 | temp_table = "FX_RATE_TEMP" 67 | 68 | 69 | def snowpark_cnxn(account, user, password, role, warehouse, database, schema): 70 | connection_parameters = { 71 | "account": account, 72 | "user": user, 73 | "password": password, 74 | "role": role, 75 | "warehouse": warehouse, 76 | "database": database, 77 | "schema": schema, 78 | } 79 | session = Session.builder.configs(connection_parameters).create() 80 | return session 81 | 82 | 83 | session = snowpark_cnxn(account, user, password, role, warehouse, database, schema) 84 | 85 | print( 86 | session.sql( 87 | "select current_warehouse(), current_database(), current_schema()" 88 | ).collect() 89 | ) 90 | 91 | # --------------------------------------------------------------------- 92 | 93 | # Transform the data (if needed) to match the format that is required for Snowflake 94 | # In my case, the data in the source data did not match what I needed 95 | # for Snowflake. 
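# For illustration (sample values only, not real data): the source exposes a combined
# currency pair such as EXGTBLID_TRANSFORMED = 'USD-CAD' alongside EXCHDATE and XCHGRATE,
# while the target expects separate FROM_CURRENCY, TO_CURRENCY, EFFECTIVE_START,
# EFFECTIVE_STOP, RATE and STAGE_DATE columns; hence the reshaping below.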
96 | 97 | df_sf = pd.DataFrame() 98 | 99 | df_sf[["FROM_CURRENCY", "TO_CURRENCY"]] = df_fx["EXGTBLID_TRANSFORMED"].str.split( 100 | "-", 1, expand=True 101 | ) 102 | df_sf = df_sf[ 103 | df_sf["TO_CURRENCY"].str.contains("|".join(["AVG", "BUY", "SELL", "ALL"])) is False 104 | ] # drops rows that contain junk data 105 | 106 | df_sf["EFFECTIVE_START"] = df_fx["EXCHDATE"].dt.strftime("%Y-%m-%d %H:%m:%s.%S") 107 | df_sf["EFFECTIVE_STOP"] = ( 108 | df_fx["EXCHDATE"] + pd.DateOffset(days=7, hours=23, minutes=59) 109 | ).dt.strftime("%Y-%m-%d %H:%m:%s.%S") 110 | 111 | df_sf["RATE"] = df_fx["XCHGRATE"] 112 | 113 | # Get current datetime 114 | df_sf["STAGE_DATE"] = pd.Timestamp.now() 115 | df_sf["STAGE_DATE"] = df_sf["STAGE_DATE"].dt.strftime("%Y-%m-%d %H:%m:%s.%S") 116 | 117 | # strip all whitespace from every field 118 | df_sf = df_sf.apply(lambda x: x.str.strip() if x.dtype == "object" else x) 119 | print("Total records after transformations:", len(df_sf)) 120 | 121 | columns = ", ".join(df_sf.columns) 122 | 123 | # Create Snowpark DataFrame 124 | df = session.create_dataframe(df_sf) 125 | 126 | df.write.mode("overwrite").save_as_table( 127 | f"{temp_table}", column_order="name", table_type="temporary" 128 | ) 129 | 130 | session.sql(f"SELECT COUNT(*) FROM {temp_table}").collect() 131 | 132 | # OPTION 1: Overwrite + insert new data 133 | session.sql( 134 | f"""INSERT OVERWRITE INTO {target_table} ({columns}) 135 | SELECT {columns} FROM {temp_table}""" 136 | ).collect() 137 | 138 | # ------------------------------------------------------------- 139 | 140 | # OPTION 2: Incremental load 141 | session.sql( 142 | f"""MERGE INTO {target_table} Dest 143 | USING ( 144 | SELECT {columns} FROM {temp_table} 145 | QUALIFY ROW_NUMBER() OVER ( 146 | PARTITION BY MY_KEY 147 | ORDER BY DATE ASC) = 1 148 | ) Source 149 | ON Dest.MY_KEY = Source.MY_KEY 150 | AND Dest.FROM_CURRENCY = Source.FROM_CURRENCY 151 | AND Dest.TO_CURRENCY = Source.TO_CURRENCY 152 | WHEN MATCHED THEN UPDATE 153 | SET Dest.FROM_CURRENCY = Source.FROM_CURRENCY 154 | , Dest.TO_CURRENCY = Source.TO_CURRENCY 155 | , Dest.DATE = Source.DATE 156 | , Dest.RATE = Source.RATE 157 | , Dest.STAGE_DATE = Source.STAGE_DATE 158 | 159 | WHEN NOT MATCHED THEN INSERT( 160 | FROM_CURRENCY 161 | , TO_CURRENCY 162 | , DATE 163 | , RATE 164 | , STAGE_DATE 165 | ) 166 | VALUES( 167 | Source.FROM_CURRENCY 168 | , Source.TO_CURRENCY 169 | , Source.EFFECTIVE_START 170 | , Source.RATE 171 | , Source.STAGE_DATE 172 | ) 173 | """ 174 | ).collect() 175 | -------------------------------------------------------------------------------- /Python/Stack.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MartyC-137/Data-Engineering/d5c850def89f2cc2f28b88b713313486ab20a9e7/Python/Stack.ipynb -------------------------------------------------------------------------------- /Python/compare_two_lists_for_differences.py: -------------------------------------------------------------------------------- 1 | """Compare two lists for differences 2 | By: Martin Palkovic 3 | Date: 2022-02-09""" 4 | # ------------------------------ 5 | # a common work task is to compare two database ID fields 6 | # against each other to determine which records exist 7 | # in one table but not another. This operation can take 10+ 8 | # minutes to run in SQl and is syntactically heavy, but is 9 | # fast and easy in Python. 
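# An alternative sketch using sets (assumes the IDs are hashable; not part of the original snippet):
# in_list1_only = set(list1) - set(list2)      # items in list1 but not in list2
# in_either_not_both = set(list1) ^ set(list2) # items unique to either table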
10 | 11 | 12 | # Copy and paste your fields below 13 | # to identify records that are unique to one of the tables 14 | 15 | list1 = ["red", "blue", "yellow", 7, 25] # copy and paste your values into here 16 | list2 = ["yellow", 7, "blue", 1, 5.4] 17 | 18 | # returns items that are in list1 but not in list2 19 | list_difference = [item for item in list1 if item not in list2] 20 | print(list_difference) 21 | -------------------------------------------------------------------------------- /Python/connecting_to_snowflake_using_python.py: -------------------------------------------------------------------------------- 1 | """ Import Modules """ 2 | import os 3 | from dotenv import load_dotenv 4 | from snowflake import connector 5 | # import pandas as pd 6 | 7 | load_dotenv() 8 | 9 | # establish connection to Snowflake using .env file 10 | connection = connector.connect( 11 | user=os.getenv("SNOWFLAKE_USER"), 12 | password=os.getenv("SNOWFLAKE_PASSWORD"), 13 | account=os.getenv("SNOWFLAKE_ACCT"), 14 | role=os.getenv("SNOWFLAKE_ROLE"), 15 | warehouse="REPORTING_WH", 16 | ) 17 | 18 | # sample SQL query, paste whatever you'd like in here 19 | sql_query = "select * from database.schema.table limit 10;" 20 | 21 | # execute the query 22 | cursor = connection.cursor() 23 | cursor.execute(sql_query) 24 | 25 | # load the data in to Pandas 26 | df = cursor.fetch_pandas_all() 27 | df.head() 28 | -------------------------------------------------------------------------------- /Python/connecting_to_sql_server_using_python.py: -------------------------------------------------------------------------------- 1 | # import modules 2 | import pyodbc 3 | import pandas as pd 4 | 5 | # set all rows and columns visible 6 | # pd.set_option('display.max_columns', None) 7 | # pd.set_option('display.max_rows', None) 8 | 9 | 10 | # server credentials 11 | server = "server" 12 | database = "database" 13 | 14 | # sql connection - uses AD to authenticate 15 | cnxn = pyodbc.connect( 16 | Trusted_Connection="Yes", Driver="{SQL Server}", Server=server, Database=database 17 | ) 18 | cursor = cnxn.cursor() 19 | 20 | # stick your query inside the triple quotes 21 | query = """select top 10 * from database.dbo.table""" 22 | 23 | # load query to dataframe 24 | df_sql = pd.read_sql(query, cnxn) 25 | df_sql.head() 26 | -------------------------------------------------------------------------------- /Python/determine_sql_field_length.py: -------------------------------------------------------------------------------- 1 | """Determing the maximum Length of a field for database table design 2 | By: Martin Palkovic 3 | Date: 2022-02-04 4 | 5 | When building ETL/Integration jobs to Snowflake (or building any SQL table), 6 | you need to designate how many characters are allowed in a field. 
I like to use 7 | Python to quantitatively answer this question rather than manually counting or 8 | guessing how many characters to allow in a varchar field """ 9 | 10 | #import modules 11 | import pyodbc 12 | import pandas as pd 13 | 14 | #set all rows and columns visible 15 | #pd.set_option('display.max_columns', None) 16 | #pd.set_option('display.max_rows', None) 17 | 18 | #server credentials 19 | server = 'server' 20 | database = 'database' 21 | 22 | #sql connection 23 | cnxn = pyodbc.connect( 24 | Trusted_Connection= 'Yes', 25 | Driver= '{SQL Server}', 26 | Server= server, 27 | Database= database 28 | ) 29 | cursor = cnxn.cursor() 30 | 31 | """stick your query inside the triple quotes""" 32 | 33 | query = """SELECT * FROM """ 34 | 35 | #load query to dataframe 36 | df_sql = pd.read_sql(query, cnxn) 37 | df_sql.head() 38 | 39 | """Example""" 40 | #Field of Interest 41 | foi = 'Item_Key' 42 | print('{} maximum record length ='.format(foi), 43 | max(df_sql[foi].astype(str).map(len)), 'characters') 44 | # Output: Item_Key maximum record length = 19 characters 45 | 46 | #Or run a for loop to get values for every column: 47 | for c in df_sql.columns: 48 | print('{} maximum record length ='.format(c), 49 | max(df_sql[c].astype(str).map(len)), 'characters', 50 | 'data type = {}'.format(df_sql[c].dtype)) 51 | 52 | #object == varchar 53 | """ 54 | Company maximum record length = 18 characters , data type = object 55 | Company_Key maximum record length = 4 characters , data type = object 56 | Site_Key maximum record length = 4 characters , data type = object 57 | Item_Key maximum record length = 19 characters , data type = object 58 | Item_Description maximum record length = 100 characters , data type = object 59 | Species maximum record length = 15 characters , data type = object 60 | Standard_Cost maximum record length = 8 characters , data type = float64 61 | Current_Cost maximum record length = 8 characters , data type = float64 62 | Category maximum record length = 16 characters , data type = object 63 | Sub_Category maximum record length = 22 characters , data type = object 64 | Size maximum record length = 8 characters , data type = object 65 | Grade maximum record length = 7 characters , data type = object 66 | Country_Of_Origin maximum record length = 15 characters , data type = object 67 | Pallet maximum record length = 10 characters , data type = object 68 | Bin maximum record length = 15 characters , data type = object 69 | Order_Allocation maximum record length = 15 characters , data type = object 70 | Production_Date maximum record length = 10 characters , data type = datetime64[ns] 71 | Production_Age maximum record length = 4 characters , data type = int64 72 | Lot_Date maximum record length = 10 characters , data type = datetime64[ns] 73 | Lot_Age maximum record length = 7 characters , data type = float64 74 | Weight maximum record length = 18 characters , data type = float64 75 | Cases maximum record length = 9 characters , data type = float64 76 | """ -------------------------------------------------------------------------------- /Python/load_json_to_snowflake.py: -------------------------------------------------------------------------------- 1 | """Example script to load multiple JSONs to a named Snowflake staging area, 2 | then copy the JSONs into a Snowflake table 3 | By: Martin Palkovic 4 | Date: 2022-07-28 5 | Description: Sometimes in a dev environment, 6 | I need to manipulate a JSON file to see the effect those changes 7 | will have on my data pipeline. 
Here's a quick script I wrote 8 | to batch load json files into Snowflake, after I've altered some of the fields 9 | """ 10 | 11 | import os 12 | from snowflake import connector 13 | 14 | from dotenv import load_dotenv 15 | 16 | load_dotenv() 17 | 18 | # folder containing your json files 19 | root = r"C:\Directory\containing\JSON\files" 20 | 21 | # Connect to your Snowflake account 22 | cnxn = connector.connect( 23 | user=os.getenv("SNOWFLAKE_USER"), 24 | password=os.getenv("SNOWFLAKE_PASSWORD"), 25 | account=os.getenv("SNOWFLAKE_ACCT"), 26 | role=os.getenv("SNOWFLAKE_ROLE"), 27 | warehouse="REPORTING_WH", 28 | ) 29 | 30 | cursor = cnxn.cursor() 31 | cursor.execute("create or replace stage MY_STAGE;") 32 | cursor.execute("use role SYSADMIN;") 33 | 34 | for file in os.listdir(root): 35 | full_path = os.path.join(root, file) 36 | cursor.execute(f"put file://{full_path} @MY_STAGE;") 37 | 38 | copy_statement = file + ".gz" 39 | cursor.execute( 40 | f"""copy into EXAMPLE_TABLE (JSON_DATA, INSERT DATE) 41 | from (select t.$1, 42 | current_timestamp() 43 | from @MY_STAGE/{copy_statement} t) 44 | file_format = (type = JSON);""" 45 | ) 46 | cursor.close() 47 | cnxn.close() 48 | -------------------------------------------------------------------------------- /Python/parse_xml_compare_differences.py: -------------------------------------------------------------------------------- 1 | """ 2 | Name: Parse XML, extract a field, compare that field to a field from a csv for diffs 3 | By: Martin Palkovic 4 | Date: 2022-08-18 5 | Description: 6 | """ 7 | 8 | # Import Modules 9 | import pandas as pd 10 | 11 | # Paste your xml here 12 | xml = """ 13 | 14 | 15 | 16 | 17 | 1 18 | warehouse1 19 | 1 20 | 127 21 | 9.16 22 | 08/16/2022 15:38:55 23 | 24 | 25 | 26 | 27 | 2 28 | warehouse2 29 | 2 30 | 450 31 | 13.3 32 | 08/17/2022 15:39:26 33 | 34 | 35 | 36 | 37 | """ 38 | 39 | # Parse XML 40 | df = pd.read_xml(xml, xpath=".//Property") 41 | 42 | # Extract only the columns we need from the XML 43 | df_pallet = df.loc[df["name"] == "Pallet"] 44 | 45 | # Read CSV 46 | df_csv = pd.read_csv(r"your_csv_here.csv") 47 | 48 | # Convert values to Python list, cast to integer 49 | pallet = df_pallet["Property"].tolist() 50 | pallet = [int(i) for i in pallet] 51 | csv = df_csv["Pallet"].tolist() 52 | 53 | # Compare differences 54 | print([i for i in pallet if i not in csv]) 55 | -------------------------------------------------------------------------------- /Python/pull_records_for_all_sql_tables.py: -------------------------------------------------------------------------------- 1 | """Title: Data Pull for all views in SQL database 2 | By: Martin Palkovic 3 | Date: 2022-11-08 4 | Description: Script to loop through every view in my_db and pull 100 records. 
5 | The Business Analyst for a project at work asked for the structure of 6 | each my_db table, this was the fastest way to do it 7 | """ 8 | 9 | # import modules 10 | from sqlalchemy.engine import URL 11 | from sqlalchemy import create_engine 12 | 13 | import pandas as pd 14 | 15 | # SQL Server Connection - uses Active Directory to authenticate 16 | driver = "SQL Server" 17 | server = "my_server" 18 | database = "my_db" 19 | schema = "dbo" 20 | 21 | 22 | # Define connection function 23 | def sqlalchemy_cnxn(driver, server, db): 24 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}" 25 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection}) 26 | engine = create_engine(url) 27 | return engine 28 | 29 | 30 | engine = sqlalchemy_cnxn(driver, server, database) 31 | 32 | list_of_views = "SELECT name FROM sys.views" 33 | 34 | my_server_views = pd.read_sql(list_of_views, engine) 35 | list_of_sql_views = sorted(my_server_views["name"].to_list()) 36 | list_of_sql_views = [ 37 | x for x in list_of_sql_views if x != "DailySensorReadings" 38 | ] 39 | # I had one table with 50M + rows that was causing performance issues, I removed it here 40 | 41 | for view in list_of_sql_views: 42 | try: 43 | query = f"SELECT TOP 100 * FROM {database}.{schema}.{view}" 44 | results = engine.execute(query) 45 | df = pd.read_sql(query, engine) 46 | if len(df) > 0: 47 | df.to_csv(f"{view}.csv") 48 | else: 49 | pass 50 | except Exception: 51 | print(f"failed to generate data for view {view}") 52 | -------------------------------------------------------------------------------- /Python/read_sql_server_write_snowflake.py: -------------------------------------------------------------------------------- 1 | """Script to read data from SQL Server and write it to Snowflake 2 | By: Martin Palkovic 3 | Date: 2022-09-14 4 | Description: For a work task, I needed to add some historical exchange rate data 5 | to Snowflake for analytical reporting. This data existed on SQL server, so I wrote this 6 | Python script to read the data from SQL Server, transform it, and load it into 7 | Snowflake. I've modified this as a minimum reproducable example for the purposes of my 8 | project portfolio. 
9 | """ 10 | 11 | # Step 1: Read data from SQL Server 12 | 13 | # import modules 14 | import os 15 | import pyodbc 16 | import pandas as pd 17 | 18 | from snowflake import connector 19 | from dotenv import load_dotenv 20 | load_dotenv() 21 | 22 | # set all rows and columns visible 23 | # pd.set_option('display.max_columns', None) 24 | # pd.set_option('display.max_rows', None) 25 | 26 | # server credentials 27 | server = "my_server" 28 | database = "my_database" 29 | 30 | # sql connection 31 | cnxn = pyodbc.connect( 32 | Trusted_Connection="Yes", Driver="{SQL Server}", Server=server, Database=database 33 | ) 34 | cursor = cnxn.cursor() 35 | 36 | # stick your query inside the triple quotes 37 | query = """select * from DATABASE.SCHEMA.EXCHANGERATES 38 | where EXCHDATE > '2021-09-03' and EXCHDATE < '2021-09-09' 39 | order by EXCHDATE asc""" 40 | 41 | # load query to dataframe 42 | df_fx = pd.read_sql(query, cnxn) 43 | print(df_fx.dtypes) 44 | 45 | # -------------------------------------------------------- 46 | 47 | # Step 2: Create a dataframe that matches the Snowflake table we are inserting to 48 | df_sf = pd.DataFrame() 49 | 50 | # Create the from and to currency columns 51 | df_sf[["FROM_CURRENCY", "TO_CURRENCY"]] = df_fx["EXCHANGE_ID"].str.split( 52 | "-", n=1, expand=True 53 | ) 54 | df_sf = df_sf[ 55 | ~df_sf["TO_CURRENCY"].str.contains("AVG") 56 | ] # drops rows that show avg - there are some GBP AVG 57 | 58 | # Create the start and stop date columns 59 | df_sf["EFFECTIVE_START"] = df_fx["EXCHDATE"].dt.strftime("%Y-%m-%d %H:%M:%S.%f") 60 | df_sf["EFFECTIVE_STOP"] = ( 61 | df_fx["EXCHDATE"] + pd.DateOffset(days=7, hours=23, minutes=59) 62 | ).dt.strftime("%Y-%m-%d %H:%M:%S.%f") 63 | 64 | # Exchange Rate 65 | df_sf["RATE"] = df_fx["XCHGRATE"] 66 | 67 | # Get current datetime 68 | df_sf["STAGE_DATE"] = pd.Timestamp.now() 69 | 70 | # strip all whitespace from every field 71 | df_sf = df_sf.apply(lambda x: x.str.strip() if x.dtype == "object" else x) 72 | 73 | # diagnostic check...number of rows, data types etc.
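# --- Optional aside (not part of the original script) ----------------------
# A minimal sketch of a reusable sanity check before writing to Snowflake:
# it raises if the transformed frame is empty or missing expected columns.
# The function name and the default column list are illustrative assumptions
# only, and the helper is not called in the steps below.
def validate_frame(frame, required_columns=("FROM_CURRENCY", "TO_CURRENCY", "RATE")):
    """Raise ValueError if the transformed frame is empty or missing columns."""
    missing = [col for col in required_columns if col not in frame.columns]
    if frame.empty or missing:
        raise ValueError(
            f"Frame not ready to load; empty={frame.empty}, missing={missing}"
        )
# ----------------------------------------------------------------------------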
74 | print("Number of rows:", len(df_sf)) 75 | print(df_sf.dtypes) 76 | # print(df_sf.head()) 77 | df_sf.to_csv("FXRates.csv", header=False, index=False) 78 | 79 | # ------------------------------------------------------------------ 80 | # Step 3: Write data to Snowflake 81 | # Establish connection to Cooke Snowflake 82 | cnxn = connector.connect( 83 | user=os.getenv("SNOWFLAKE_USER"), 84 | password=os.getenv("SNOWFLAKE_PASSWORD"), 85 | account=os.getenv("SNOWFLAKE_ACCT"), 86 | role=os.getenv("SNOWFLAKE_ROLE"), 87 | warehouse="REPORTING_WH", 88 | ) 89 | # assign csv to variable 90 | csv = r"\FXRates.csv.csv" 91 | staged_file = os.path.basename(csv) + ".gz" 92 | 93 | # execute write operations 94 | cursor = cnxn.cursor() 95 | cursor.execute("use database STAGING_DEV;") 96 | cursor.execute("use schema MY_SCHEMA;") 97 | cursor.execute("create or replace stage FX_RATES;") 98 | cursor.execute(f"put file://{csv} @FX_RATES;") 99 | cursor.execute( 100 | f"""copy into CURRENCY_EXCHANGE_RATES(FROM_CURRENCY, 101 | TO_CURRENCY, 102 | EFFECTIVE_START, 103 | EFFECTIVE_STOP, 104 | RATE, 105 | STAGE_DATE) 106 | from @FX_RATES/{staged_file} 107 | file_format = (type = CSV)""" 108 | ) 109 | cursor.execute('rm @MY_SCHEMA.FX_RATES pattern = ".*FX_RATES.*";') 110 | 111 | cursor.close() 112 | cnxn.close() 113 | -------------------------------------------------------------------------------- /Python/sql_insert_statement_from_csv.py: -------------------------------------------------------------------------------- 1 | """Generate a SQL insert statement from a csv file 2 | By: Martin Palkovic 3 | Date: 2022-03-14""" 4 | 5 | import pandas as pd 6 | 7 | # Filepath for the csv 8 | df = pd.read_csv("my_file.csv") 9 | 10 | # In my case I only wanted after row 1022 11 | df = df.iloc[1022:] 12 | 13 | # There are some weird unicode characters in the excel sheet I received, 14 | # I removed them with this for loop: 15 | for column in df.columns: 16 | df[column] = df[column].str.split().str.join(" ") 17 | 18 | 19 | # Define Function 20 | def sql_insert_statement_from_dataframe(source, target): 21 | """This function generates a SQL insert statement""" 22 | for index, row in source.iterrows(): 23 | # full insert statement: 24 | print( 25 | "insert into " 26 | + target 27 | + "(" 28 | + str(", ".join(source.columns)) 29 | + ") values " 30 | + str(tuple(row.values)) 31 | + ";" 32 | ) 33 | 34 | 35 | # Execute Function 36 | sql_insert_statement_from_dataframe(df, "database.schema.table") 37 | """ 38 | #Full insert statement: 39 | insert into database.schema.table(code, 40 | expense_type, 41 | acct, 42 | company) 43 | values ('02113', 44 | 'Accounts Receivable, 45 | Other', 46 | '35400', 47 | 'An_Awesome_Company'); 48 | 49 | insert into database.schema.table(code, 50 | expense_type, 51 | acct, 52 | company) 53 | values ('02114', 54 | 'Accounts Payable', 55 | '36500', 56 | 'A_Different_Company'); 57 | insert into database.schema.table(code, 58 | expense_type, 59 | acct, 60 | company) values ('02115', 61 | 'Donations', 62 | '12220', 63 | 'Another_Company'); 64 | 65 | #just the values: 66 | ('02113', 'Accounts Receivable, Other', '35400', 'An_Awesome_Company'), 67 | ('02114', 'Accounts Payable', '36500', 'A_Different_Company'), 68 | ('02115', 'Donations', '12220', 'Another_Company'), 69 | """ 70 | -------------------------------------------------------------------------------- /Python/sql_style_join_csv.py: -------------------------------------------------------------------------------- 1 | """Performing a SQL style join on two csv 
files 2 | By: Martin Palkovic 3 | Date: 2022-02-11 4 | 5 | Description: The inventory team is producing Excel sheets on a weekly basis 6 | and would like to move comments from one sheet to another. Inventory goes out, 7 | new inventory comes in, and they want the comments transfered on items that are 8 | still in stock. I wasn't sure how to do this in SQL without making new tables 9 | in the database and decided to use Python. 10 | 11 | Note that this program is specific to a workflow I do for the Inventory team, 12 | and you cant really make a one size fits all program for this task since you 13 | need to specify which fields you want to join. But hopefully it will give you 14 | an idea of how to do this if you encounter a similar task 15 | """ 16 | 17 | import os 18 | import pandas as pd 19 | 20 | old_csv = input("Enter filepath for the old csv: ") 21 | while not os.path.isfile(old_csv): 22 | print("Error: that is not a valid file, try again...") 23 | old_csv = input("Enter filepath for the old csv: ") 24 | 25 | new_csv = input("Enter filepath for the new csv: ") 26 | while not os.path.isfile(new_csv): 27 | print("Error: that is not a valid file, try again...") 28 | new_csv = input("Enter filepath for the new csv: ") 29 | 30 | try: 31 | df_old = pd.read_csv(old_csv, low_memory=False) 32 | df_new = pd.read_csv(new_csv, low_memory=False) 33 | 34 | # makes all column names lower case, ensuring they meet the join criteria 35 | # i.e if the user capitalizes one of the column names one week but not the next, 36 | # it doesn't matter with this block of code 37 | df_old.columns = map(str.lower, df_old.columns) 38 | df_new.columns = map(str.lower, df_new.columns) 39 | 40 | # removes any whitespace from the column names 41 | df_old = df_old.rename(columns=lambda x: x.strip()) 42 | df_new = df_new.rename(columns=lambda x: x.strip()) 43 | 44 | df_old = df_old.loc[:, df_old.columns.isin(["columns_you_want_to_keep"])] 45 | df_old = df_old.reset_index(drop=True) 46 | 47 | df_new = df_new.loc[:, ~df_new.columns.isin(["columns_you_want_to_keep"])] 48 | df_new = df_new.reset_index(drop=True) 49 | 50 | df = pd.merge( 51 | df_new, 52 | df_old.drop_duplicates(subset=["pallet"]), 53 | how="left", 54 | on=["pallet"], 55 | suffixes=("", "_drop"), 56 | ) 57 | 58 | df = df.drop([c for c in df.columns if "drop" in c], axis=1) 59 | df.columns = map(str.capitalize, df.columns) 60 | 61 | file_name = input("Enter your file name (dont add the .csv extension): ") 62 | df.to_csv("{}.csv".format(file_name)) 63 | 64 | except BaseException as exception: 65 | print(f"An exception occurred: {exception}") 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Portfolio 2 | 3 | 6 | 7 | [![Ruff](https://github.com/MartyC-137/Data-Engineering/actions/workflows/ruff.yml/badge.svg)](https://github.com/MartyC-137/Data-Engineering/actions/workflows/ruff.yml) 8 | [![SQLFluff](https://github.com/MartyC-137/Data-Engineering/actions/workflows/sqlfluff.yml/badge.svg)](https://github.com/MartyC-137/Data-Engineering/actions/workflows/sqlfluff.yml) 9 | 10 | --- 11 | 12 | ### Introduction 13 | 14 | This repository contains numerous work examples of code I use in my day to day work as a data engineer, all of which has been modified as minimum reproducible examples. My favourite tools are Snowflake, Python, and dbt, and I also have an interest in DevOps as it pertains to data engineering. 15 | 16 |
17 | Python  18 | Snowflake  19 | dbt  20 |
21 | 22 | [![Linkedin Badge](https://img.shields.io/badge/-Martin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/mpalkovic/) 23 | [![Resume Badge](https://img.shields.io/badge/-Resume-blue?style=flat&logo=Resume&logoColor=white)](https://my.visualcv.com/martin-palkovic/) 24 | 25 | ### Table of Contents 26 | * [Python Examples](https://github.com/MartyC-137/Data-Engineering/tree/main/Python) 27 | - [Snowpark example - backload data from SQL Server](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Snowpark_Example_Backload_Data.py) 28 | - [Snowpark example - backload data from API](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Snowpark_Backload_API_Data.py) 29 | - [Automated SQL insert statements from a CSV file](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Generate_SQL_Insert_Statements_From_CSV.py) 30 | - [Extract data from SQL Server, transform, and load to Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Read_SQLServer_Write_Snowflake.py) 31 | - [Batch load JSON files to Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/LoadJSONToSnowflake.py) 32 | - [SQL Server data Pull - 100 Records from every view in a database](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Pull_records_for_all_SQL_tables_in_db.py) 33 | * [SQL Examples](https://github.com/MartyC-137/Data-Engineering/tree/main/SQL) 34 | - [Only grant permissions on tables with > 0 rows of data - Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_ForLoop_GrantPermissions.sql) 35 | - [Auto Ingest Snowpipe from Azure Blob to Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql) 36 | - [Shorten large union queries using Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Shorten_Huge_Union_Queries.sql) 37 | - [Basic Snowflake CDC Pipeline using Streams and Tasks](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Basic_CDC_Pipeline_Using_Streams_Tasks.sql) 38 | - [Find missing dates in a date field - Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Find_Missing_Dates.sql) 39 | - [Snowflake data pipeline from internal stage](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Data_Pipeline_From_Internal_Stage.sql) 40 | * [Snowflake CI/CD using Azure Pipelines - SQLFluff testing, build and deploy using SnowSQL](https://github.com/MartyC-137/Data-Engineering/tree/main/SnowSQL_CICD) 41 | * [SQLFluff and yamllint pipelines for a dbt project](https://github.com/MartyC-137/Data-Engineering/tree/main/CI_Examples) 42 | 43 | --- 44 | 45 | ### Usage 46 | 47 | ```bash 48 | # Clone the repository 49 | $ git clone https://github.com/MartyC-137/Data-Engineering.git 50 | 51 | # Connect to the repository 52 | $ cd Data-Engineering 53 | ``` 54 | -------------------------------------------------------------------------------- /SQL/Load_CSV_to_Snowflake/PUT.sql: -------------------------------------------------------------------------------- 1 | !set variable_substitution=true; 2 | put file://&{csv_path} @~&{stage} auto_compress=true; 3 | -------------------------------------------------------------------------------- /SQL/Load_CSV_to_Snowflake/Snowflake_Worksheet_Load_CSV.sql: -------------------------------------------------------------------------------- 1 | /*****************************************************/ 2 | -- 
Worksheet: Loading a local csv to a Snowflake table 3 | -- Date: 2022-12-08 4 | /*****************************************************/ 5 | 6 | /* Set session variables 7 | Enter the relevant database, schema, table and file format names here 8 | */ 9 | set role_name = 'sysadmin'; 10 | set wh = 'reporting_wh'; 11 | set db = 'my_new_db'; 12 | set sch = 'my_schema'; 13 | set table_name = 'my_table'; 14 | set fileformat = 'my_file_format'; 15 | set stage_name = 'my_stage'; 16 | 17 | /* initialize session */ 18 | -- role, warehouse 19 | use role identifier($role_name); 20 | use warehouse identifier($wh); 21 | 22 | -- database 23 | create database if not exists identifier($db); 24 | use database identifier($db); 25 | 26 | -- schema 27 | create schema if not exists identifier($sch); 28 | use schema identifier($sch); 29 | 30 | -- file format 31 | create file format if not exists identifier($fileformat) 32 | type = csv 33 | field_delimiter = ',' 34 | empty_field_as_null = true 35 | skip_header = 1 36 | comment = 'file format for loading csv files to Snowflake'; 37 | 38 | -- stage 39 | create stage if not exists identifier($stage_name) 40 | file_format = $fileformat; --this may need to be typed out 41 | show stages; 42 | 43 | -- table; 44 | create table if not exists identifier($table_name) ( 45 | field1 varchar, 46 | field2 number 47 | ); 48 | 49 | /* the PUT command must be executed in the SnowSQL CLI! 50 | See the following documentation on this topic: 51 | https://docs.snowflake.com/en/user-guide/snowsql-install-config.html 52 | https://docs.snowflake.com/en/user-guide/data-load-internal-tutorial.html 53 | 54 | download link: https://developers.snowflake.com/snowsql/ 55 | put file://c:\your\filepath\here\my_file.csv; 56 | */ 57 | 58 | /* confirm that the PUT command worked */ 59 | list @my_stage; 60 | 61 | copy into identifier($table_name) 62 | from @my_stage/my_file.csv.gz --variables dont work in conjunction with the @ argument 63 | file_format = (format_name = $fileformat) 64 | on_error = 'skip_file'; 65 | 66 | -- confirm the COPY INTO command worked 67 | select * from identifier($table_name); 68 | -------------------------------------------------------------------------------- /SQL/Load_CSV_to_Snowflake/snowsql.sh: -------------------------------------------------------------------------------- 1 | snowsql -c dev -s my_schema -f PUT.sql -D csv_path=your_csv_path\your_csv.csv -D stage=my_stage -------------------------------------------------------------------------------- /SQL/Snowflake_Account_Setup.sql: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | -- Script: Account Setup in Snowflake 3 | -- CreateBy: Martin Palkovic 4 | -- Create date: 2022-11-01 5 | -- Description: Script to set up a warehouse, 6 | -- role and user with basic privileges 7 | /******************************************************************************/ 8 | 9 | /* Set session variables*/ 10 | set role_name = 'my_role'; 11 | set user_name = 'my_user'; 12 | set wh_name = 'my_warehouse'; 13 | set db_name = 'my_db'; 14 | 15 | /* Create warehouse for service account */ 16 | use role sysadmin; 17 | create or replace warehouse identifier($wh_name) 18 | warehouse_size = xsmall 19 | auto_suspend = 60 20 | auto_resume = true 21 | min_cluster_count = 1 22 | max_cluster_count = 5 23 | scaling_policy = standard 24 | comment = 'Warehouse for service account to query the Snowflake API'; 25 | 26 | /* Create role 
*/ 27 | use role securityadmin; 28 | create or replace role identifier($role_name) 29 | comment = 'Default role for service account my_user'; 30 | 31 | /* Create user */ 32 | use role accountadmin; 33 | create or replace user identifier($user_name) 34 | login_name = $user_name 35 | display_name = $user_name 36 | password = '********************' 37 | must_change_password = false 38 | default_role = $role_name 39 | default_warehouse = $wh_name 40 | comment = 'Service account for application to query the Snowflake API'; 41 | 42 | /* grant account permissions */ 43 | grant role identifier($role_name) to user identifier($user_name); 44 | grant usage on warehouse identifier($wh_name) to role identifier($role_name); 45 | grant usage on database identifier($db_name) to role identifier($role_name); 46 | grant usage on all schemas in database identifier($db_name) to role identifier( 47 | $role_name 48 | ); 49 | grant select on all tables in database identifier($db_name) to role identifier( 50 | $role_name 51 | ); 52 | 53 | /* Future Grants */ 54 | grant select on future tables in database identifier( 55 | $db_name 56 | ) to role identifier($role_name); 57 | grant usage on future schemas in database identifier( 58 | $db_name 59 | ) to role identifier($role_name); 60 | 61 | /* Confirm access is correct */ 62 | show grants to role identifier($role_name); 63 | 64 | show grants of role identifier($role_name); 65 | show grants to user identifier($user_name); 66 | -------------------------------------------------------------------------------- /SQL/Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql: -------------------------------------------------------------------------------- 1 | /**********************************************************************/ 2 | -- Title: Azure Blob Snowpipe setup 3 | -- By: Martin Palkovic 4 | -- Date: 2022-11-09 5 | -- Description: Snowflake set up of an auto-ingest snowpipe from Azure Blob Storage to Snowflake table. 6 | -- Documentation: https://docs.snowflake.com/en/user-guide/data-load-snowpipe-auto-azure.html 7 | /*********************************************************************/ 8 | 9 | /* Set session variables */ 10 | set session_role = 'sysadmin'; 11 | set session_warehouse = 'reporting_wh'; 12 | set session_database = 'dev'; 13 | set session_table = 'my_table'; 14 | set project_name = 'MY_PROJECT'; 15 | set storage_loc = 'azure://your_blob_account_here.blob.core.windows.net/my_project'; 16 | set tenant_id = 'a123b4c5-1234-123a-a12b-1a23b45678c9'; -- example tenant id from Snowflake docs 17 | 18 | /* Initialize Environment */ 19 | use role identifier($session_role); 20 | use warehouse identifier($session_warehouse); 21 | use database identifier($session_database); 22 | 23 | create schema if not exists identifier($project_name); 24 | use schema identifier($project_name); 25 | 26 | /* Create storage integration for Snowflake to connect to Azure Blob. 27 | See the 'Configuring Secure Access to Cloud Storage' section in the url above*/ 28 | create storage integration if not exists identifier($project_name) 29 | type = external_stage 30 | storage_provider = 'AZURE' 31 | enabled = true 32 | azure_tenant_id = $tenant_id 33 | storage_allowed_locations = ($storage_loc) 34 | comment = 'Storage Integration for moving my_project data into Snowflake'; 35 | 36 | /* The output of this command is needed for setup in the Azure Portal */ 37 | desc storage integration identifier($project_name); 38 | 39 | /* Create notification integration to connect Snowflake to Azure Event Grid. 
40 | See Step 2 of 'Configuring Automation With Azure Event Grid'*/ 41 | create notification integration if not exists identifier($project_name) 42 | enabled = true 43 | type = queue 44 | notification_provider = azure_storage_queue 45 | azure_storage_queue_primary_uri = '' 46 | azure_tenant_id = $tenant_id 47 | comment = 'Notification Integration for moving my_project data into Snowflake'; 48 | 49 | /* The output of this command is needed for setup in the Azure Portal */ 50 | desc notification integration identifier($project_name); 51 | 52 | /* Create a Snowflake stage */ 53 | create stage if not exists identifier($project_name) 54 | url = $storage_loc 55 | storage_integration = $project_name 56 | comment = 'Staging area for my_project data, between Azure Blob and Snowflake'; 57 | 58 | -- show stages; 59 | 60 | /* Create a Snowpipe that will be notified via Azure Event Grid 61 | when a file is added to the Azure Blob instance specified above*/ 62 | create pipe if not exists identifier($project_name) 63 | auto_ingest = true 64 | integration = $project_name 65 | as 66 | copy into $session_table 67 | from @$project_name 68 | file_format = (type = 'csv') 69 | comment = 'Auto Ingest Snowpipe for moving data from Azure Blob to Snowflake. When a file is added to 70 | Azure Blob, this Snowpipe will automatically trigger'; 71 | -------------------------------------------------------------------------------- /SQL/Snowflake_Basic_CDC_Pipeline_Using_Streams_Tasks.sql: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | -- Script: Basic CDC Pipeline using Streams and Tasks in Snowflake 3 | -- CreateBy: Martin Palkovic 4 | -- Create date: 2022-11-01 5 | -- Description: Basic implementation of a Streams/Tasks workflow in Snowflake. 
6 | -- Streams detect DML changes to one table and will update another table based 7 | -- on those changes 8 | /******************************************************************************/ 9 | 10 | /* Set session variables */ 11 | set role_name = 'sysadmin'; 12 | set wh = 'my_wh'; 13 | set db = 'my_db'; 14 | set schema_name = 'my_schema'; 15 | set dest_table = 'my_table'; 16 | set stream_name = 'my_stream'; 17 | set source_table = 'staging_db.staging_schema.staging_table'; 18 | set proc_name = 'my_procedure'; 19 | set task_name = 'push_my_table'; 20 | 21 | /* Initialize Environment */ 22 | use role identifier($role_name); 23 | use warehouse identifier($wh); 24 | 25 | create database if not exists identifier($db); 26 | create schema if not exists identifier($schema_name); 27 | 28 | use database identifier($db); 29 | use schema identifier($schema_name); 30 | 31 | create table if not exists identifier($dest_table) 32 | comment = 'JSON data from API, streaming from the staging database' 33 | clone identifier($source_table); 34 | 35 | create stream if not exists identifier($stream_name) on table identifier($source_table) 36 | comment = 'CDC stream from staging table to prod table'; 37 | 38 | /* quick diagnostic check */ 39 | show streams; 40 | select * from identifier($stream_name); 41 | 42 | create or replace procedure identifier($proc_name)() 43 | returns varchar 44 | language sql 45 | execute as owner 46 | as 47 | $$ 48 | begin 49 | merge into my_table DEST using ( 50 | select * from my_stream 51 | qualify row_number() over ( 52 | partition by json_data:ID order by insert_date) = 1 53 | ) SOURCE 54 | on DEST.json_data:ID = SOURCE.json_data:ID 55 | when matched and metadata$action = 'INSERT' then 56 | update set DEST.json_data = SOURCE.json_data, 57 | DEST.insert_date = current_timestamp() 58 | when not matched and metadata$action = 'INSERT' then 59 | insert (DEST.json_data, DEST.insert_date) 60 | values(SOURCE.json_data, current_timestamp()); 61 | return 'CDC records successfully inserted'; 62 | end; 63 | $$; 64 | 65 | create or replace task identifier($task_name) 66 | warehouse = LOAD_WH 67 | schedule = '1 minute' 68 | comment = 'Change data capture task that pulls over new data once a minute' 69 | when system$stream_has_data ('my_stream') 70 | as 71 | call my_procedure(); 72 | 73 | /* grant execute task priveleges to role sysadmin */ 74 | set role_name = 'accountadmin'; 75 | use role identifier($role_name); 76 | grant execute task on account to role identifier($role_name); 77 | 78 | /* tasks are created in a suspended state by default, you must 'resume' them to schedule them */ 79 | set role_name = 'sysadmin'; 80 | use role identifier($role_name); 81 | alter task identifier($task_name) resume; 82 | 83 | select * from identifier($my_table); 84 | 85 | show tasks; 86 | select * from table(information_schema.task_history()) order by SCHEDULED_TIME; 87 | -------------------------------------------------------------------------------- /SQL/Snowflake_Clean_Staging_Area.sql: -------------------------------------------------------------------------------- 1 | /*******************************************************************/ 2 | -- Procedure: sp_clean_stage 3 | -- Created By: Martin Palkovic 4 | -- Create date: 2022-08-16 5 | -- Organization: Cooke Inc. 6 | -- Summary: Delete files from a named Snowflake staging area 7 | -- Description: In data pipelines, we sometimes stick files in a named 8 | -- Snowflake internal staging area - occasionally, you'll want to purge the 9 | -- files from here. 
Append this stored procedure call as the last step in your pipeline 10 | -- to keep your staging area clean 11 | /*******************************************************************/ 12 | use warehouse REPORTING_WH; 13 | use database STAGING_DEV; 14 | use schema NS_LANDING; 15 | 16 | create or replace procedure sp_clean_stage( 17 | stage_name varchar, DAYS number, DRY_RUN boolean 18 | ) 19 | returns varchar 20 | language sql 21 | execute as caller 22 | as 23 | $$ 24 | declare 25 | ListFiles resultset; 26 | LastModified date; 27 | RemovedCount number := 0; 28 | TotalCount number := 0; 29 | begin 30 | ListFiles := (execute immediate 'ls @' || stage_name ); 31 | let C1 cursor for ListFiles; 32 | for files in C1 do 33 | TotalCount := TotalCount + 1; 34 | LastModified := to_date(left( files."last_modified", length(files."last_modified") - 4 ), 'DY, DD MON YYYY HH24:MI:SS' ); 35 | if (LastModified <= dateadd( 'day', -1 * days, current_timestamp())) then 36 | RemovedCount := RemovedCount + 1; 37 | if (not dry_run) then 38 | execute immediate 'rm @' || files."name"; 39 | end if; 40 | end if; 41 | end for; 42 | return RemovedCount || ' of ' || TotalCount || ' files ' || iff(dry_run,'will be','were') || ' deleted.'; 43 | end; 44 | $$; 45 | 46 | -- Run Stored Procedure 47 | -- use database my_db; 48 | -- call sp_clean_stage('my_stage', 14, false); 49 | -------------------------------------------------------------------------------- /SQL/Snowflake_Cloning.sql: -------------------------------------------------------------------------------- 1 | /* How to clone data in Snowflake 2 | By: Martin Palkovic 3 | Date: 2022-06-10 4 | 5 | Description: Zero copy cloning is one of the awesome features of Snowflake. 6 | I like to use this feature to quickly create a development environment for 7 | testing */ 8 | 9 | use role sysadmin; 10 | use warehouse reporting_wh; 11 | use database production; 12 | use schema dbo; 13 | 14 | /* clone database */ 15 | create database my_cloned_db clone my_db; 16 | 17 | /* clone schema */ 18 | create schema my_cloned_schema clone analytics_inventory; 19 | 20 | /* clone table */ 21 | create table my_cloned_table clone main_inventory_table; 22 | 23 | /* cloning with time travel */ 24 | create or replace table my_cloned_table clone main_inventory_table 25 | at (timestamp => '2022-06-10 9:30') 26 | -------------------------------------------------------------------------------- /SQL/Snowflake_Data_Pipeline_From_Internal_Stage.sql: -------------------------------------------------------------------------------- 1 | /**********************************************************************************************************/ 2 | -- Proc: Basic data pipeline from Snowflake internal stage 3 | -- CreateBy: Martin Palkovic 4 | -- Create date: 2022-10-31 5 | -- Description: Basic workflow for building the latter portions of a data pipeline within Snowflake. 
6 | -- Note that this code assumes you have loaded a csv file into a Snowflake internal stage via a 7 | -- 3rd party or open source integration tool 8 | /***********************************************************************************************************/ 9 | 10 | /* initialize environment */ 11 | use role sysadmin; 12 | use warehouse reporting_wh; 13 | use database my_dev_database; 14 | use schema my_schema; 15 | 16 | /* Provides information for your third party/open source integration tool */ 17 | desc table dimcustomer; 18 | 19 | /* create stage, if needed */ 20 | show stages; 21 | -- create or replace my_stage 22 | list @my_stage; 23 | 24 | /* create file format */ 25 | create or replace file format my_file_format 26 | type = 'CSV' 27 | field_delimiter = ',' 28 | replace_invalid_characters = true 29 | null_if = (''); 30 | 31 | /* create stored procedure */ 32 | create or replace procedure dim_customer_pipeline() 33 | returns varchar 34 | language sql 35 | execute as caller 36 | as 37 | $$ 38 | begin 39 | truncate table MY_SCHEMA.DIMCUSTOMER; 40 | 41 | copy into 42 | MY_SCHEMA.DIMCUSTOMER 43 | from 44 | ( select t1.$1 45 | ,t1.$2 46 | ,t1.$3 47 | ,nullif(t1.$4, '') 48 | from @MY_SCHEMA.MY_STAGE/Dim_Customer.csv.gz (file_format => 'my_file_format') t1 49 | ) 50 | file_format=my_file_format ON_ERROR='SKIP_FILE'; 51 | 52 | remove @MY_SCHEMA.MY_STAGE pattern='.*Customer.*'; 53 | 54 | return 'Successfully loaded data into MY_DEV_DATABASE.MY_SCHEMA.DIMCUSTOMER'; 55 | end; 56 | $$; 57 | 58 | /* create task */ 59 | create or replace task dim_customer 60 | warehouse = load_wh 61 | schedule = 'using cron 30 9 * * * UTC' 62 | comment 63 | = 'Truncates MY_DEV_DATABASE.MY_SCHEMA.DIMCUSTOMER, loads all rows of the dimcustomer table from Azure SQL and deletes the csv from the staging area' 64 | as 65 | call dim_customer_pipeline(); 66 | 67 | /* grant execute task priveleges to role sysadmin */ 68 | use role accountadmin; 69 | grant execute task on account to role sysadmin; 70 | 71 | /* tasks are created in a suspended state by default, you must 'resume' them to schedule them */ 72 | use role sysadmin; 73 | alter task dim_customer resume; 74 | 75 | /* confirm that the tasks are working */ 76 | show tasks; 77 | select * from table(information_schema.task_history()) order by scheduled_time; 78 | -------------------------------------------------------------------------------- /SQL/Snowflake_Find_Duplicates.sql: -------------------------------------------------------------------------------- 1 | select * from my_table 2 | qualify count(*) over (partition by primary_key) > 1; 3 | -------------------------------------------------------------------------------- /SQL/Snowflake_Find_Missing_Dates.sql: -------------------------------------------------------------------------------- 1 | /* Query: find missing dates in a range of dates 2 | By: Martin Palkovic 3 | Date: 2022-08-19 4 | System: Snowflake 5 | Description: Say, for example, you have a report, and there is data missing for certain dates 6 | on that report. 
You can use this query to identify dates where you may have missing data 7 | */ 8 | 9 | use role sysadmin; 10 | use warehouse my_warehouse; 11 | use database my_db; 12 | use schema my_schema; 13 | 14 | with find_date_gaps (rownum, my_date_field) as ( 15 | select 16 | my_date_field, 17 | row_number() over (order by my_date_field asc) as rownum 18 | from your_table 19 | where my_date_field > 'yyyy-mm-dd' 20 | group by my_date_field 21 | ) 22 | 23 | select 24 | dateadd(dd, 1, fdg1.my_date_field) as startofgap, 25 | dateadd(dd, -1, fdg2.my_date_field) as endofgap 26 | from find_date_gaps as fdg1 27 | inner join find_date_gaps as fdg2 28 | on fdg1.rownum = (fdg2.rownum - 1) 29 | where datediff(dd, fdg1.my_date_field, dateadd(dd, -1, fdg2.my_date_field)) != 0; 30 | -------------------------------------------------------------------------------- /SQL/Snowflake_Flatten_JSON_Example.sql: -------------------------------------------------------------------------------- 1 | /**********************************************************************************************************/ 2 | -- Query: Flatten JSON to analytics view in Snowflake 3 | -- CreateBy: Martin Palkovic 4 | -- Create date: 2021-05-03 5 | -- Description: SQL code for creating a materialized view in Snowflake from a JSON in your staging area 6 | -- Modified by: 7 | -- Modify date: 8 | -- Mod Reason: 9 | /***********************************************************************************************************/ 10 | 11 | create or replace materialized view my_db.schema.my_view 12 | as 13 | select 14 | jsn.value:Id::string as id, 15 | jsn.value:TotalAmount::number(10, 2) as total_amount, 16 | jsn.value:Cash::boolean as cash, 17 | jsn.value:TransactionDate::date as transaction_date 18 | from staging_area.schema.my_table, 19 | lateral flatten(input => json_data) as jsn 20 | 21 | qualify row_number() 22 | over ( 23 | partition by jsn.value:Id 24 | order by jsn.value:Id 25 | ) 26 | = 1; 27 | 28 | /* 29 | Input: 30 | Row JSON_DATA 31 | 1 [{"Id": 1,"TotalAmount": 42.75, "Cash": true,"TransactionDate": "2022-03-25T18:44:46.54"}] 32 | 2 [{"Id":2, "TotalAmount": 57.99, "Cash": false, "TransactionDate": "2022-03-28T12:24:33.12"}] 33 | 3 [{"Id": 1,"TotalAmount": 42.75, "Cash": true,"TransactionDate": "2022-03-25T18:44:46.54"}] 34 | 4 [{"Id": 3, "TotalAmount": 100.25, "Cash": false, "TransactionDate": "2022-04-01T06:10:15.30"}] 35 | 36 | Output: 37 | ID Total_Amount Cash Transaction_Date 38 | 1 42.75 True 2022-03-25 39 | 2 57.99 False 2022-03-28 40 | 3 100.25 False 2022-04-01 41 | */ 42 | -------------------------------------------------------------------------------- /SQL/Snowflake_ForLoop_GrantPermissions.sql: -------------------------------------------------------------------------------- 1 | /* ######################### */ 2 | /* Script: Revoke/Grant permissions for reader accounts in Snowflake */ 3 | /* Author: Martin Palkovic */ 4 | /* Date: 2023-02-09 */ 5 | /* Description: This script loops through query results from the information_schema and grants privileges only to tables */ 6 | /* that have > 0 rows. This script was inspired by a database containing ~2,500 tables, 400 of which contained >= 1 row of data. */ 7 | /* This script revokes all privileges and then grants select on tables with > 0 rows. Modify your cursor queries as needed to provide a */ 8 | /* list of tables, schemas etc. to loop over. 
*/ 9 | 10 | -- Set session variables 11 | set db = 'my_db'; 12 | set rl = 'accountadmin'; 13 | set wh = 'my_wh'; 14 | set role_var = '"My_Role"'; --the double quotes are required as this is a case sensitive string value! 15 | set share_name = 'ab12345.my_secure_share'; 16 | 17 | -- Schemas to exclude. Set as desired, add as many as you need 18 | set exc1 = 'information_schema'; 19 | set exc2 = 'my_schema1'; 20 | 21 | use database identifier($db); 22 | use role identifier($rl); 23 | use warehouse identifier($wh); 24 | 25 | /* SHARE LEVEL - EXECUTED IN MAIN ACCOUNT */ 26 | -- Revoke privileges 27 | declare 28 | iter_schema cursor for (select * from information_schema.schemata where schema_name not in ($exc1, $exc2)); 29 | begin 30 | for s in iter_schema do 31 | execute immediate 'revoke select on all tables in schema ' || s.schema_name || ' from share identifier($share_name)'; 32 | end for; 33 | return 'Permissions successfully revoked from secure share!'; 34 | end; 35 | 36 | -- Add to share all tables that have > 0 rows 37 | declare 38 | iter_tables cursor for (select * from information_schema.tables 39 | where row_count > 0 and table_schema not in ($exc1, $exc2)); 40 | begin 41 | for t in iter_tables do 42 | execute immediate 'grant select on table ' || t.table_schema || '.' || t.table_name || ' to share identifier($share_name)'; 43 | end for; 44 | return 'Permissions successfully granted to secure share!'; 45 | end; 46 | 47 | /* SHARE LEVEL - EXECUTED IN READER ACCOUNT BY ADMIN */ 48 | -- Revoke privileges 49 | declare 50 | iter_schema cursor for (select * from information_schema.schemata where schema_name not in ($exc1, $exc2)); 51 | begin 52 | for s in iter_schema do 53 | execute immediate 'revoke select on all tables in schema ' || s.schema_name || ' from role identifier($role_var)'; 54 | end for; 55 | return 'Permissions successfully revoked!'; 56 | end; 57 | 58 | -- Grant only permissions on tables that have > 0 rows 59 | declare 60 | iter_tables cursor for (select * from information_schema.tables 61 | where row_count > 0 and table_schema not in ($exc1, $exc2)); 62 | begin 63 | for t in iter_tables do 64 | execute immediate 'grant select on table ' || t.table_schema || '.' || t.table_name || ' to role identifier($role_var)'; 65 | end for; 66 | return 'Permissions successfully granted!'; 67 | end; 68 | -------------------------------------------------------------------------------- /SQL/Snowflake_Merge_Into_Example.sql: -------------------------------------------------------------------------------- 1 | /* Title: Example MERGE INTO statement for incremental loading into Snowflake 2 | By: Martin Palkovic 3 | Date: 2022-10-20 4 | Description: With large datasets, you'll often want to implement an incremental load to 5 | improve performance in your data pipeline. The code below will prevent duplicates in your load, 6 | while only adding new records and updating existing records if changes exist. Note that this code excludes 7 | the database name from the full qualified table name - that is deliberate so that this code can be run against 8 | a development database first. The database name is set in the environment extensions of your pipeline tool. 9 | 10 | -- This is a minimum reproducible example of code I've used in production. 
11 | */ 12 | 13 | merge into 14 | my_schema.my_table as destination 15 | 16 | using ( 17 | select * 18 | from my_schema.my_staging_table 19 | qualify row_number() over ( 20 | partition by my_unique_sk 21 | order by created_date desc 22 | ) = 1 23 | ) as source 24 | on (source.my_unique_sk = destination.my_unique_sk) 25 | 26 | when matched then 27 | update 28 | set 29 | destination.my_unique_sk = source.my_unique_sk, 30 | destination.order_id = source.order_id, 31 | destination.ship_date = source.ship_date 32 | 33 | when not matched 34 | then insert 35 | ( 36 | my_unique_sk, 37 | order_id, 38 | ship_date 39 | ) 40 | values 41 | ( 42 | source.my_unique_sk, 43 | source.order_id, 44 | source.ship_date 45 | ); 46 | -------------------------------------------------------------------------------- /SQL/Snowflake_Python_Stored_Procedure_Example.sql: -------------------------------------------------------------------------------- 1 | /************************************************************************/ 2 | -- Script: Simple Python stored procedure in Snowflake 3 | -- Date: 2022-12-28 4 | -- Description: One thing I frequently do is compare one field to another, 5 | -- to determine if something exists in one dataset but not another. Does one table 6 | -- contain sales orders, pallet numbers, or report ID's that the other table 7 | -- does not? 8 | 9 | -- This stored procedure allows you to quickly determine that from within 10 | -- the Snowflake environment 11 | /************************************************************************/ 12 | 13 | use role sysadmin; 14 | use warehouse reporting_wh; 15 | use database dev; 16 | use schema my_schema; 17 | 18 | create or replace table mytable (amount number comment 'fake amounts for testing', fruits string comment 'fake types of fruit for testing'); 19 | create or replace table mytable2 like mytable; 20 | 21 | insert into mytable values (1, 'apple'), (2, 'orange'), (5, 'grape'), (7, 'cantelope'), (9, 'pineapple'), (17, 'banana'), (21, 'tangerine'); 22 | insert into mytable2 values (1, 'apple'), (3, 'orange'), (5, 'grape'), (7, 'strawberry'), (10, 'pineapple'), (17, 'banana'), (22, 'raspberry'); 23 | 24 | -- select * from mytable; 25 | -- select * from mytable2; 26 | 27 | create or replace procedure print_differences(TABLE1 string, TABLE2 string, FIELD1 string, FIELD2 string) 28 | returns array 29 | language python 30 | runtime_version = '3.8' 31 | packages = ('snowflake-snowpark-python', 'pandas') 32 | handler = 'print_differences' 33 | as 34 | $$ 35 | import pandas as pd 36 | 37 | def print_differences(session, table1: str,table2: str,field1: str,field2: str): 38 | 39 | #read the tables into a snowpark dataframe 40 | table1 = session.table(table1) 41 | table2 = session.table(table2) 42 | 43 | #convert to pandas 44 | df1 = table1.to_pandas() 45 | df2 = table2.to_pandas() 46 | 47 | # convert the the fields of interest from each table to a list 48 | list1 = df1[field1].to_list() 49 | list2 = df2[field2].to_list() 50 | 51 | return [item for item in list1 if item not in list2] 52 | $$; 53 | 54 | call print_differences('MYTABLE2', 'MYTABLE', 'FRUITS', 'FRUITS'); 55 | 56 | -- output: 57 | -- ["cantelope","tangerine"] 58 | -------------------------------------------------------------------------------- /SQL/Snowflake_Shorten_Huge_Union_Queries.sql: -------------------------------------------------------------------------------- 1 | /**********************************************************************/ 2 | -- Title: How to shorten a huge union query 3 | -- 
By: Martin Palkovic 4 | -- Date: 2022-11-25 5 | -- Description: Have you encountered a production small_sql query with a large number of unions, 6 | -- and very little changes between the queries except perhaps the database and/or schema name? 7 | -- In this example, you can loop over the COMPANY_NAME field in MY_TABLE to create 8 | -- one select statement per 'COMPANY_NAME', union them together, and return the results 9 | -- in one go. The first implementation of this at work reduced a 300 line query to ~ 40 lines! 10 | /*********************************************************************/ 11 | 12 | use role sysadmin; 13 | use warehouse my_wh; 14 | use database dev; 15 | 16 | -- Declare variables, loop over results of the 'organization' cursor variable 17 | declare 18 | small_sql varchar; 19 | big_sql varchar; 20 | organization cursor for (select COMPANY_NAME from MY_SCHEMA.MY_TABLE); 21 | my_results resultset; 22 | begin 23 | big_sql := ''; 24 | -- In Snowflake, $$ is a multi-line string delimiter 25 | for company in organization do 26 | small_sql := $$select 'COMPANY_NAME' as Company 27 | , GL.ACTNUM as Account_Number 28 | , ACT.DESCRIPTION as Account_Name 29 | from COMPANY_NAME.General_Ledger_Table GL 30 | 31 | inner join COMPANY_NAME.Account_Name_Table ACT 32 | on ACT.ID = GL.ID 33 | $$; 34 | small_sql := replace(small_sql, 'COMPANY_NAME', company.COMPANY_NAME); 35 | 36 | if(big_sql != '') then 37 | big_sql := big_sql || ' union all '; 38 | end if; 39 | 40 | big_sql := big_sql || small_sql; 41 | end for; 42 | 43 | my_results := (execute immediate :big_sql); 44 | return table(my_results); 45 | end; 46 | -------------------------------------------------------------------------------- /SQL/Snowflake_Time_Travel.sql: -------------------------------------------------------------------------------- 1 | /* Title: Snowflake Time Travel 2 | By: Martin Palkovic 3 | Date: 2022-06-07 4 | Description: Snowflake has great time travel functionality, were you can easily restore 5 | a table to its state at a previous point in time. I have used this functionality with 6 | great success when a production table with 2 million records was deleted on accident! 7 | */ 8 | 9 | show tables history; 10 | 11 | /* Note that you may need to rename the table */ 12 | alter table my_table rename to my_table_whoops; 13 | 14 | /* specify the time */ 15 | select 16 | acct_number, 17 | date 18 | from my_table at (timestamp => '2022-06-01 6:00'); 19 | 20 | /* specify an offset, ex. 1 hour ago*/ 21 | select 22 | acct_number, 23 | date 24 | from my_table at (offset => -60*60); --offset is in seconds here 25 | -------------------------------------------------------------------------------- /Shell/Create_gitignore_and_add_lines.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd /Users/johndoe/documents 3 | touch .gitignore 4 | echo '.env' >> .gitignore -------------------------------------------------------------------------------- /Shell/Microsoft.PowerShell_profile.ps1: -------------------------------------------------------------------------------- 1 | # This is an example of my Microsoft PowerShell profile. 
It sets up the Oh-My-Posh terminal theme, 2 | # and contains the following user defined functions: 3 | # PassGen: Generates random strong passwords 4 | # Create-OpenInVSCode: Creates and opens a file in VS Code using one simple command 5 | 6 | Set-Item -Path Env:TERMINAL_THEME -Value "https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/night-owl.omp.json" 7 | 8 | Import-Module Terminal-Icons 9 | 10 | Set-PSReadlineKeyHandler -Key Tab -Function MenuComplete 11 | 12 | oh-my-posh init pwsh --config $env:TERMINAL_THEME | Invoke-Expression 13 | 14 | # Password Generator 15 | function PassGen { 16 | param ( 17 | [int]$Length = 20 18 | ) 19 | 20 | $ValidCharacters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_-+=' 21 | $Password = '' 22 | 23 | for ($i = 0; $i -lt $Length; $i++) { 24 | $RandomIndex = Get-Random -Minimum 0 -Maximum $ValidCharacters.Length 25 | $Password += $ValidCharacters[$RandomIndex] 26 | } 27 | 28 | return $Password 29 | } 30 | 31 | # Alias for PassGen 32 | Set-Alias -Name pg -Value PassGen 33 | 34 | # --- 35 | 36 | # Create and open file in VS Code 37 | function Create-OpenInVSCode { 38 | param ( 39 | [Parameter(Mandatory = $true)] 40 | [String]$newfile 41 | ) 42 | 43 | code (new-item $newfile) 44 | } 45 | 46 | # Aliases for Create-OpenInVSCode 47 | Set-Alias -Name new-file -Value Create-OpenInVSCode 48 | Set-Alias -Name nf -Value Create-OpenInVSCode 49 | -------------------------------------------------------------------------------- /Shell/Pass_secret_at_runtime_to_py_script.ps1: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pwsh 2 | & {Set-Item Env:my_password "yoUr_str0Ng_paSswoRd_heRe"} | py myscript.py 3 | 4 | # Your Python script must contain the following: 5 | # import os 6 | # my_password = os.getenv('my_password') -------------------------------------------------------------------------------- /Shell/Search_specific_branch_name.ps1: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pwsh 2 | git branch -a | Select-String "string_youre_looking_for" -------------------------------------------------------------------------------- /Shell/Search_specific_branch_name.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | git branch -a | grep -i your_string_here -------------------------------------------------------------------------------- /Shell/create_gitignore_and_add_lines.ps1: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pwsh 2 | Set-Location ./Users/johndoe/documents/ 3 | New-Item .gitignore 4 | Add-Content .gitignore '.env' -------------------------------------------------------------------------------- /Shell/git_mv_multiple_files.ps1: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pwsh 2 | # An example shell script to 'git mv' multiple files at once 3 | 4 | # All files from one folder to new folder 5 | mkdir my_new_folder 6 | Set-Location ./folder_your_files_are_in 7 | foreach ($file in Get-ChildItem *.sql) { git mv $file.name .\my_new_folder } 8 | 9 | # Move all folders inside one folder to another folder 10 | mkdir my_new_folder 11 | Set-Location ./folder_your_files_are_in 12 | Get-ChildItem .\my_old_folder\ | % { git mv $_.FullName .\my_new_folder\ } -------------------------------------------------------------------------------- 
/Shell/run_all_python_files_in_dir.ps1: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pwsh 2 | foreach ($file in Get-ChildItem -Path C:\your\directory\here\*.py) { 3 | python $file.FullName 4 | } -------------------------------------------------------------------------------- /Shell/run_groovy_script_in_Docker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Run Groovy script in Docker 3 | 4 | docker run --rm -v "$(pwd):/home/groovy/scripts" -w /home/groovy/scripts groovy:latest groovy your_script.groovy -------------------------------------------------------------------------------- /SnowSQL_CICD/build.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: jobName 3 | default: 'SnowflakeBuild' 4 | - name: jobDisplay 5 | default: 'Build artifacts for Snowflake deployment' 6 | - name: artifactName 7 | default: 'SnowflakeTest' 8 | - name: vmImage 9 | default: 'ubuntu-latest' 10 | - name: environmentName 11 | default: 'DEV' 12 | 13 | jobs: 14 | - job: ${{ parameters.jobName }} 15 | displayName: ${{ parameters.jobDisplay }} 16 | timeoutInMinutes: 10 17 | pool: 18 | vmImage: ${{ parameters.vmImage }} 19 | workspace: 20 | clean: outputs 21 | steps: 22 | # Publish artifacts 23 | - publish: $(System.DefaultWorkingDirectory) 24 | artifact: ${{ parameters.artifactName }} 25 | name: Artifacts 26 | displayName: Publish pipeline artifacts 27 | -------------------------------------------------------------------------------- /SnowSQL_CICD/deploy.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: jobName 3 | default: 'SnowflakeDeploy' 4 | - name: jobDisplay 5 | default: 'Deploy Snowflake Objects' 6 | - name: databaseName 7 | default: '' 8 | - name: vmImage 9 | default: 'ubuntu-latest' 10 | - name: environmentName 11 | default: 'DEV' 12 | 13 | jobs: 14 | - deployment: ${{ parameters.jobName }} 15 | displayName: ${{ parameters.jobDisplay }} 16 | timeoutInMinutes: 10 17 | pool: 18 | vmImage: ${{ parameters.vmImage }} 19 | environment: ${{ parameters.environmentName }} 20 | workspace: 21 | clean: outputs 22 | 23 | strategy: 24 | runOnce: 25 | deploy: 26 | steps: 27 | # Checkout repo 28 | - checkout: self 29 | fetchDepth: 10 30 | clean: true 31 | 32 | # Download and Install SnowSQL CLI 33 | - script: | 34 | curl -O https://sfc-repo.snowflakecomputing.com/snowsql/bootstrap/1.2/linux_x86_64/snowsql-1.2.9-linux_x86_64.bash 35 | SNOWSQL_DEST=~/snowflake SNOWSQL_LOGIN_SHELL=~/.profile bash snowsql-1.2.9-linux_x86_64.bash 36 | name: SnowSQLSetup 37 | displayName: Download and Install SnowSQL 38 | 39 | # Test SnowSQL Installation 40 | - script: ~/snowflake/snowsql -v 41 | name: TestSnowSQL 42 | displayName: Test SnowSQL Installation 43 | 44 | - script: | 45 | echo "All changes in this commit:" 46 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) 47 | name: detectingChanges 48 | displayName: 'Detecting changes' 49 | 50 | # Confirm Snowflake is properly connected 51 | - script: | 52 | # Test SnowSQL connection to our Snowflake instance 53 | ~/snowflake/snowsql -q "select current_account(), current_user(), current_role(), current_warehouse()" 54 | 55 | # Confirm that the pipeline is finding the changed SQL files 56 | files=$(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.sql$') 57 | 58 | echo "Changed files:" 59 | echo "$files" 60 | env: 
61 | SNOWSQL_ACCOUNT: $(SNOWSQL_ACCOUNT) 62 | SNOWSQL_USER: $(SNOWSQL_USER) 63 | SNOWSQL_PWD: $(SNOWSQL_PWD) 64 | SNOWSQL_ROLE: $(SNOWSQL_ROLE) 65 | name: TestSnowSQLConnection 66 | displayName: Test Snowflake Connection 67 | 68 | # Deploy code to Snowflake 69 | - script: | 70 | files=$(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.sql$') 71 | for file in $files; do 72 | echo "Deploying $file" 73 | ~/snowflake/snowsql -d ${{ parameters.databaseName }} -f $file 74 | done 75 | env: 76 | SNOWSQL_ACCOUNT: $(SNOWSQL_ACCOUNT) 77 | SNOWSQL_USER: $(SNOWSQL_USER) 78 | SNOWSQL_PWD: $(SNOWSQL_PWD) 79 | SNOWSQL_ROLE: $(SNOWSQL_ROLE) 80 | name: Deploy 81 | displayName: Deploy code to Snowflake 82 | -------------------------------------------------------------------------------- /SnowSQL_CICD/snowsql.yml: -------------------------------------------------------------------------------- 1 | # This pipeline uses the SnowSQL CLI to deploy code to Snowflake once it has been merged to main after PR approval. 2 | # Note that this is the 'parent' pipeline, which calls the build.yml and deploy.yml files 3 | # Note that this uses Azure DevOps flavoured YAML but could easily be modified to work with GitHub or GitLab 4 | 5 | name: Snowflake CD Pipeline 6 | 7 | variables: 8 | - group: SnowSQL 9 | - name: artifactName 10 | value: 'snowflakeTest' 11 | - name: vmImage 12 | value: 'ubuntu-latest' 13 | 14 | trigger: 15 | branches: 16 | include: 17 | - main 18 | 19 | stages: 20 | - stage: Build 21 | jobs: 22 | - template: build.yml 23 | parameters: 24 | jobName: 'BuildSnowflakeObjects' 25 | artifactName: $(artifactName) 26 | vmImage: $(vmImage) 27 | 28 | - stage: DEV 29 | variables: 30 | - name: database 31 | value: DEV 32 | - name: schema 33 | value: misc 34 | jobs: 35 | - template: deploy.yml 36 | parameters: 37 | jobName: DEV 38 | databaseName: $(database) 39 | vmImage: $(vmImage) 40 | environmentName: DEV 41 | 42 | - stage: QA 43 | variables: 44 | - name: database 45 | value: QA 46 | - name: schema 47 | value: misc 48 | jobs: 49 | - template: deploy.yml 50 | parameters: 51 | jobName: QA 52 | databaseName: $(database) 53 | vmImage: $(vmImage) 54 | environmentName: QA 55 | 56 | - stage: PROD 57 | variables: 58 | - name: database 59 | value: PROD 60 | jobs: 61 | - template: deploy.yml 62 | parameters: 63 | jobName: PROD 64 | databaseName: $(database) 65 | vmImage: $(vmImage) 66 | environmentName: PROD 67 | -------------------------------------------------------------------------------- /SnowSQL_CICD/sqlfluff_pr_check.yml: -------------------------------------------------------------------------------- 1 | # This pipeline uses SQLFluff to lint Snowflake SQL code during a pull request 2 | # Note that this uses Azure DevOps flavoured YAML but could easily be modified to work with GitHub or GitLab 3 | 4 | name: Pull Request check using SQLFluff 5 | 6 | parameters: 7 | - name: jobName 8 | default: 'SnowflakeTest' 9 | - name: jobDisplay 10 | default: 'Lint repo with SQLFluff' 11 | 12 | pr: 13 | branches: 14 | include: 15 | - main 16 | 17 | pool: 18 | vmImage: 'ubuntu-latest' 19 | 20 | jobs: 21 | - job: ${{ parameters.jobName }} 22 | timeoutInMinutes: 10 23 | displayName: ${{ parameters.jobDisplay }} 24 | 25 | workspace: 26 | clean: outputs 27 | 28 | steps: 29 | # Checkout repo 30 | - checkout: self 31 | fetchDepth: 10 32 | clean: true 33 | 34 | # Download and Install SQLFluff 35 | - script: | 36 | pip install --upgrade pip 37 | pip install sqlfluff 38 | displayName: Download and Install SQLFluff 39 | 40 | 
# Lint SQL 41 | - script: | 42 | git ls-files | grep '\.sql$' | xargs sqlfluff lint --dialect snowflake 43 | displayName: Analyzing the code with SQLFluff 44 | -------------------------------------------------------------------------------- /dbt/dbt_python_model_example.py: -------------------------------------------------------------------------------- 1 | def calculate_checksum_digit(sscc: str) -> str: 2 | """Calculates and concatenates a checksum digit 3 | to a 17-character string using modulus 10""" 4 | 5 | sscc = sscc.strip() 6 | if not sscc: 7 | return 'BAD INPUT' 8 | 9 | try: 10 | digits = [int(d) for d in str(sscc) if d.isdigit()] 11 | if not digits: 12 | return 'BAD INPUT' 13 | 14 | weighted_digits = [(d * 3 if i % 2 == 0 else d) for i, d in enumerate(digits)] 15 | total_weighted_digits = sum(weighted_digits) 16 | check_digit = (10 - (total_weighted_digits % 10)) % 10 17 | return (str(sscc) + str(check_digit)) 18 | 19 | except (ValueError, TypeError): 20 | return 'BAD INPUT' 21 | 22 | def model(dbt, session): 23 | dbt.config(materialized='table', 24 | packages=['pandas']) 25 | 26 | df = dbt.ref('my_upstream_model') 27 | df = df.to_pandas() 28 | df = df.apply(lambda x: x.str.strip() if x.dtype == 'object' else x) 29 | 30 | df['CHECKSUM'] = df['PRE_CHECKSUM'].apply(calculate_checksum_digit) 31 | 32 | return df 33 | -------------------------------------------------------------------------------- /dbt/filter_dbt_catalog_query_snowflake.sql: -------------------------------------------------------------------------------- 1 | {% macro snowflake__get_catalog(information_schema, schemas) -%} 2 | 3 | {%- set relations_in_project = [] -%} 4 | 5 | {%- for node in graph.nodes.values() -%} 6 | {%- if node.resource_type == 'model' -%} 7 | {%- do relations_in_project.append(node.alias) -%} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | {%- for source in graph.sources.values() -%} 11 | {%- do relations_in_project.append(source.name) -%} 12 | {%- endfor -%} 13 | 14 | {%- set relations_in_project = set(relations_in_project) | list -%} 15 | 16 | {%- if (schemas | length) == 0 -%} 17 | {%- set query = "select 1 as id limit 0" -%} 18 | {%- else -%} 19 | 20 | {% set query %} 21 | 22 | with tables as ( 23 | 24 | select 25 | 26 | table_catalog as "table_database", 27 | table_schema as "table_schema", 28 | table_name as "table_name", 29 | table_type as "table_type", 30 | comment as "table_comment", 31 | table_owner as "table_owner", 32 | 'Clustering Key' as "stats:clustering_key:label", 33 | clustering_key as "stats:clustering_key:value", 34 | 'The key used to cluster this table' as "stats:clustering_key:description", 35 | (clustering_key is not null) as "stats:clustering_key:include", 36 | 'Row Count' as "stats:row_count:label", 37 | row_count as "stats:row_count:value", 38 | 'An approximate count of rows in this table' as "stats:row_count:description", 39 | (row_count is not null) as "stats:row_count:include", 40 | 'Approximate Size' as "stats:bytes:label", 41 | bytes as "stats:bytes:value", 42 | 'Approximate size of the table as reported by Snowflake' as "stats:bytes:description", 43 | (bytes is not null) as "stats:bytes:include", 44 | 'Last Modified' as "stats:last_modified:label", 45 | to_varchar(convert_timezone('UTC', last_altered), 'yyyy-mm-dd HH24:MI'||'UTC') as "stats:last_modified:value", 46 | 'The timestamp for last update/change' as "stats:last_modified:description", 47 | (last_altered is not null and table_type='BASE TABLE') as "stats:last_modified:include" 48 | 49 | from {{ information_schema }}.tables 50 
| 51 | where row_count > 0 52 | 53 | and ( 54 | {%- for schema in schemas -%} 55 | upper("table_schema") = upper('{{ schema }}') {%- if not loop.last %} or {% endif -%} 56 | {%- endfor -%} 57 | ) 58 | 59 | {%- if relations_in_project | length > 0 %} 60 | 61 | and coalesce(regexp_substr(table_name, '^(.+)_{1}[0-9]{8}$'), table_name) in ( 62 | {%- for rel in relations_in_project -%} upper('{{ rel }}') {%- if not loop.last %}, {% endif -%}{%- endfor -%} 63 | ) 64 | {% endif -%} 65 | 66 | ), 67 | 68 | columns as ( 69 | 70 | select 71 | 72 | table_catalog as "table_database", 73 | table_schema as "table_schema", 74 | table_name as "table_name", 75 | column_name as "column_name", 76 | ordinal_position as "column_index", 77 | data_type as "column_type", 78 | comment as "column_comment" 79 | 80 | from {{ information_schema }}.columns 81 | 82 | where ( 83 | {%- for schema in schemas -%} 84 | upper("table_schema") = upper('{{ schema }}') {%- if not loop.last %} or {% endif -%} 85 | {%- endfor -%} 86 | ) 87 | 88 | {%- if relations_in_project | length > 0 %} 89 | 90 | and coalesce(regexp_substr(table_name, '^(.+)_{1}[0-9]{8}$'), table_name) in ( 91 | {%- for rel in relations_in_project -%} upper('{{ rel }}') {%- if not loop.last %}, {% endif -%}{%- endfor -%} 92 | ) 93 | {% endif -%} 94 | 95 | ) 96 | 97 | select * from tables 98 | 99 | inner join columns using ("table_database", "table_schema", "table_name") 100 | 101 | order by "column_index" 102 | {%- endset -%} 103 | 104 | {%- endif -%} 105 | 106 | {%- do log(query) -%} 107 | {%- set results = run_query(query) -%} 108 | {%- do log(schemas ~ ' - rows returned: ' ~ results | length, True) -%} 109 | 110 | {{ return(results) }} 111 | 112 | {%- endmacro %} 113 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | snowflake-snowpark-python 2 | pandas 3 | polars 4 | plotly 5 | matplotlib 6 | seaborn 7 | SQLAlchemy 8 | ipykernel 9 | scikit-learn --------------------------------------------------------------------------------