├── .devcontainer ├── Dockerfile ├── Microsoft.PowerShell_profile.ps1 ├── config.fish ├── devcontainer.json └── requirements.txt ├── .gitattributes ├── .github └── workflows │ ├── ruff.yml │ ├── sqlfluff.yml │ └── yamllint-ci.yml ├── .gitignore ├── .sqlfluff ├── .sqlfluffignore ├── .yamllint ├── CI_Examples ├── python-ci.yml ├── python-pr.yml ├── sqlfluff-ci.yml ├── sqlfluff-pr.yml ├── yamllint-ci.yml └── yamllint-pr.yml ├── Docker ├── Dockerfile ├── Populate_SQL_Server_Docker_Container.py ├── docker-compose.yml └── requirements.txt ├── Fivetran └── disable_tables_with_zero_rows_fivetran_api.py ├── Python ├── Snowflake_Insert_Statements.py ├── Snowpark_Backload_API_Data.py ├── Snowpark_Create_Stored_Procedure.py ├── Snowpark_Example_Backload_SQL_Server_Data.py ├── Stack.ipynb ├── compare_two_lists_for_differences.py ├── connecting_to_snowflake_using_python.py ├── connecting_to_sql_server_using_python.py ├── determine_sql_field_length.py ├── load_json_to_snowflake.py ├── parse_xml_compare_differences.py ├── pull_records_for_all_sql_tables.py ├── read_sql_server_write_snowflake.py ├── sql_insert_statement_from_csv.py └── sql_style_join_csv.py ├── README.md ├── SQL ├── Load_CSV_to_Snowflake │ ├── PUT.sql │ ├── Snowflake_Worksheet_Load_CSV.sql │ └── snowsql.sh ├── Snowflake_Account_Setup.sql ├── Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql ├── Snowflake_Basic_CDC_Pipeline_Using_Streams_Tasks.sql ├── Snowflake_Clean_Staging_Area.sql ├── Snowflake_Cloning.sql ├── Snowflake_Data_Pipeline_From_Internal_Stage.sql ├── Snowflake_Find_Duplicates.sql ├── Snowflake_Find_Missing_Dates.sql ├── Snowflake_Flatten_JSON_Example.sql ├── Snowflake_ForLoop_GrantPermissions.sql ├── Snowflake_Merge_Into_Example.sql ├── Snowflake_Python_Stored_Procedure_Example.sql ├── Snowflake_Shorten_Huge_Union_Queries.sql └── Snowflake_Time_Travel.sql ├── Shell ├── Create_gitignore_and_add_lines.sh ├── Microsoft.PowerShell_profile.ps1 ├── Pass_secret_at_runtime_to_py_script.ps1 ├── Search_specific_branch_name.ps1 ├── Search_specific_branch_name.sh ├── create_gitignore_and_add_lines.ps1 ├── git_mv_multiple_files.ps1 ├── run_all_python_files_in_dir.ps1 └── run_groovy_script_in_Docker.sh ├── SnowSQL_CICD ├── build.yml ├── deploy.yml ├── snowsql.yml └── sqlfluff_pr_check.yml ├── dbt ├── dbt_python_model_example.py └── filter_dbt_catalog_query_snowflake.sql └── requirements.txt /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/go/.devcontainer/base.Dockerfile 2 | 3 | # [Choice] Go version (use -bullseye variants on local arm64/Apple Silicon): 1, 1.16, 1.17, 1-bullseye, 1.16-bullseye, 1.17-bullseye, 1-buster, 1.16-buster, 1.17-buster 4 | ARG VARIANT=1-bullseye 5 | FROM mcr.microsoft.com/vscode/devcontainers/go:0-${VARIANT} 6 | 7 | # [Choice] Node.js version: none, lts/*, 16, 14, 12, 10 8 | ARG NODE_VERSION="none" 9 | RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . 
/usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi 10 | 11 | # Install powershell 12 | ARG PS_VERSION="7.2.1" 13 | # powershell-7.3.0-linux-x64.tar.gz 14 | # powershell-7.3.0-linux-arm64.tar.gz 15 | RUN ARCH="$(dpkg --print-architecture)"; \ 16 | if [ "${ARCH}" = "amd64" ]; then \ 17 | PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-x64.tar.gz"; \ 18 | elif [ "${ARCH}" = "arm64" ]; then \ 19 | PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-arm64.tar.gz"; \ 20 | elif [ "${ARCH}" = "armhf" ]; then \ 21 | PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-arm32.tar.gz"; \ 22 | fi; \ 23 | wget https://github.com/PowerShell/PowerShell/releases/download/$PS_BIN -O pwsh.tar.gz; \ 24 | mkdir /usr/local/pwsh && \ 25 | tar Cxvfz /usr/local/pwsh pwsh.tar.gz && \ 26 | rm pwsh.tar.gz 27 | 28 | ENV PATH=$PATH:/usr/local/pwsh 29 | 30 | RUN echo 'deb http://download.opensuse.org/repositories/shells:/fish:/release:/3/Debian_11/ /' | tee /etc/apt/sources.list.d/shells:fish:release:3.list; \ 31 | curl -fsSL https://download.opensuse.org/repositories/shells:fish:release:3/Debian_11/Release.key | gpg --dearmor | tee /etc/apt/trusted.gpg.d/shells_fish_release_3.gpg > /dev/null; \ 32 | apt-get update && export DEBIAN_FRONTEND=noninteractive \ 33 | && apt-get install -y --no-install-recommends \ 34 | fish \ 35 | tmux \ 36 | fzf \ 37 | && apt-get clean 38 | 39 | ARG USERNAME=vscode 40 | 41 | # Download the oh-my-posh binary 42 | RUN mkdir /home/${USERNAME}/bin; \ 43 | wget https://github.com/JanDeDobbeleer/oh-my-posh/releases/latest/download/posh-linux-$(dpkg --print-architecture) -O /home/${USERNAME}/bin/oh-my-posh; \ 44 | chmod +x /home/${USERNAME}/bin/oh-my-posh; \ 45 | chown ${USERNAME}: /home/${USERNAME}/bin; 46 | 47 | # NOTE: devcontainers are Linux-only at this time but when 48 | # Windows or Darwin is supported someone will need to improve 49 | # the code logic above. 
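# TIP (optional, not part of the original setup): for reproducible builds the
# oh-my-posh download above could point at a tagged release instead of "latest", e.g.
# https://github.com/JanDeDobbeleer/oh-my-posh/releases/download/vX.Y.Z/posh-linux-$(dpkg --print-architecture)
# where vX.Y.Z is a placeholder for whichever release you pin.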
50 | 51 | # Setup a neat little PowerShell experience 52 | RUN pwsh -Command Install-Module posh-git -Scope AllUsers -Force; \ 53 | pwsh -Command Install-Module z -Scope AllUsers -Force; \ 54 | pwsh -Command Install-Module PSFzf -Scope AllUsers -Force; \ 55 | pwsh -Command Install-Module Terminal-Icons -Scope AllUsers -Force; 56 | 57 | # add the oh-my-posh path to the PATH variable 58 | ENV PATH "$PATH:/home/${USERNAME}/bin" 59 | 60 | # Add vscode default dir to the PATH variable 61 | ENV PATH "$PATH:/home/vscode/.local/bin" 62 | 63 | # Can be used to override the devcontainer prompt default theme: 64 | ENV POSH_THEME="https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/night-owl.omp.json" 65 | 66 | # Deploy oh-my-posh prompt to Powershell: 67 | COPY Microsoft.PowerShell_profile.ps1 /home/${USERNAME}/.config/powershell/Microsoft.PowerShell_profile.ps1 68 | 69 | # Deploy oh-my-posh prompt to Fish: 70 | COPY config.fish /home/${USERNAME}/.config/fish/config.fish 71 | 72 | # Everything runs as root during build time, so we want 73 | # to make sure the vscode user can edit these paths too: 74 | RUN chmod 777 -R /home/${USERNAME}/.config 75 | 76 | # Override vscode's own Bash prompt with oh-my-posh: 77 | RUN sed -i 's/^__bash_prompt$/#&/' /home/${USERNAME}/.bashrc && \ 78 | echo "eval \"\$(oh-my-posh init bash --config $POSH_THEME)\"" >> /home/${USERNAME}/.bashrc 79 | 80 | # Override vscode's own ZSH prompt with oh-my-posh: 81 | RUN echo "eval \"\$(oh-my-posh init zsh --config $POSH_THEME)\"" >> /home/${USERNAME}/.zshrc 82 | 83 | # Set container timezone: 84 | ARG TZ="UTC" 85 | RUN ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime 86 | 87 | # Required for Python - Confluent Kafka on M1 Silicon 88 | RUN apt update && apt -y install software-properties-common gcc 89 | RUN git clone https://github.com/edenhill/librdkafka 90 | RUN cd librdkafka && ./configure && make && make install && ldconfig 91 | 92 | # [Optional] Uncomment the next line to use go get to install anything else you need 93 | # RUN go get -x github.com/JanDeDobbeleer/battery 94 | 95 | # [Optional] Uncomment this line to install global node packages. 96 | # RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g " 2>&1 -------------------------------------------------------------------------------- /.devcontainer/Microsoft.PowerShell_profile.ps1: -------------------------------------------------------------------------------- 1 | Import-Module posh-git 2 | Import-Module PSFzf -ArgumentList 'Ctrl+t', 'Ctrl+r' 3 | Import-Module z 4 | Import-Module Terminal-Icons 5 | 6 | Set-PSReadlineKeyHandler -Key Tab -Function MenuComplete 7 | 8 | $env:POSH_GIT_ENABLED=$true 9 | oh-my-posh init pwsh --config $env:POSH_THEME | Invoke-Expression 10 | 11 | # NOTE: You can override the above env var from the devcontainer.json "args" under the "build" key. 
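# For example (illustrative snippet only - keep the values in sync with your own devcontainer.json):
#   "build": {
#     "dockerfile": "Dockerfile",
#     "args": {
#       "POSH_THEME": "https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/night-owl.omp.json"
#     }
#   }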
12 | function PassGen { 13 | param ( 14 | [int]$Length = 20 15 | ) 16 | 17 | $ValidCharacters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_-+=' 18 | $Password = '' 19 | 20 | for ($i = 0; $i -lt $Length; $i++) { 21 | $RandomIndex = Get-Random -Minimum 0 -Maximum $ValidCharacters.Length 22 | $Password += $ValidCharacters[$RandomIndex] 23 | } 24 | 25 | return $Password 26 | } 27 | 28 | Set-Alias -Name pg -Value PassGen 29 | # Aliases 30 | Set-Alias -Name ac -Value Add-Content -------------------------------------------------------------------------------- /.devcontainer/config.fish: -------------------------------------------------------------------------------- 1 | # Activate oh-my-posh prompt: 2 | oh-my-posh init fish --config $POSH_THEME | source 3 | 4 | # NOTE: You can override the above env vars from the devcontainer.json "args" under the "build" key. -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/go 3 | { 4 | "name": "oh-my-posh", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "args": { 8 | // Update the VARIANT arg to pick a version of Go: 1, 1.16, 1.17 9 | // Append -bullseye or -buster to pin to an OS version. 10 | // Use -bullseye variants on local arm64/Apple Silicon. 11 | "VARIANT": "1.19-bullseye", 12 | // Options: 13 | 14 | "POSH_THEME": "https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/clean-detailed.omp.json", 15 | 16 | // Override me with your own timezone: 17 | "TZ": "America/Moncton", 18 | // Use one of the "TZ database name" entries from: 19 | // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones 20 | 21 | "NODE_VERSION": "lts/*", 22 | //Powershell version 23 | "PS_VERSION": "7.2.1" 24 | } 25 | }, 26 | "runArgs": ["--cap-add=SYS_PTRACE", 27 | "--security-opt", 28 | "seccomp=unconfined" 29 | ], 30 | 31 | "features": { 32 | "ghcr.io/devcontainers/features/azure-cli:1": { 33 | "version": "latest" 34 | }, 35 | "ghcr.io/devcontainers/features/python:1": { 36 | "version": "3.8" 37 | }, 38 | "ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {}, 39 | "ghcr.io/devcontainers-contrib/features/terraform-asdf:2": {}, 40 | "ghcr.io/devcontainers-contrib/features/yamllint:2": {}, 41 | "ghcr.io/devcontainers/features/docker-in-docker:2": {}, 42 | "ghcr.io/devcontainers/features/docker-outside-of-docker:1": {}, 43 | "ghcr.io/devcontainers/features/github-cli:1": {}, 44 | "ghcr.io/devcontainers-contrib/features/spark-sdkman:2": { 45 | "jdkVersion": "11" 46 | }, 47 | "ghcr.io/dhoeric/features/google-cloud-cli:1": { 48 | "version": "latest" 49 | } 50 | }, 51 | 52 | // Set *default* container specific settings.json values on container create. 
53 | "customizations": { 54 | "vscode": { 55 | "settings": { 56 | "go.toolsManagement.checkForUpdates": "local", 57 | "go.useLanguageServer": true, 58 | "go.gopath": "/go", 59 | "go.goroot": "/usr/local/go", 60 | "terminal.integrated.profiles.linux": { 61 | "bash": { 62 | "path": "bash" 63 | }, 64 | "zsh": { 65 | "path": "zsh" 66 | }, 67 | "fish": { 68 | "path": "fish" 69 | }, 70 | "tmux": { 71 | "path": "tmux", 72 | "icon": "terminal-tmux" 73 | }, 74 | "pwsh": { 75 | "path": "pwsh", 76 | "icon": "terminal-powershell" 77 | } 78 | }, 79 | "terminal.integrated.defaultProfile.linux": "pwsh", 80 | "terminal.integrated.defaultProfile.windows": "pwsh", 81 | "terminal.integrated.defaultProfile.osx": "pwsh", 82 | "tasks.statusbar.default.hide": true, 83 | "terminal.integrated.tabs.defaultIcon": "terminal-powershell", 84 | "terminal.integrated.tabs.defaultColor": "terminal.ansiBlue", 85 | "workbench.colorTheme": "GitHub Dark Dimmed", 86 | "workbench.iconTheme": "material-icon-theme" 87 | }, 88 | 89 | // Add the IDs of extensions you want installed when the container is created. 90 | "extensions": [ 91 | "ms-mssql.mssql", 92 | "snowflake.snowflake-vsc", 93 | "golang.go", 94 | "ms-vscode.powershell", 95 | "ms-python.python", 96 | "ms-python.vscode-pylance", 97 | "redhat.vscode-yaml", 98 | "ms-vscode-remote.remote-containers", 99 | "ms-toolsai.jupyter", 100 | "eamodio.gitlens", 101 | "yzhang.markdown-all-in-one", 102 | "davidanson.vscode-markdownlint", 103 | "editorconfig.editorconfig", 104 | "esbenp.prettier-vscode", 105 | "github.vscode-pull-request-github", 106 | "akamud.vscode-theme-onedark", 107 | "PKief.material-icon-theme", 108 | "GitHub.github-vscode-theme", 109 | "actboy168.tasks", 110 | "bastienboutonnet.vscode-dbt", 111 | "innoverio.vscode-dbt-power-user", 112 | "redhat.vscode-xml", 113 | "adpyke.vscode-sql-formatter", 114 | "inferrinizzard.prettier-sql-vscode", 115 | "github.vscode-github-actions", 116 | "ms-python.black-formatter" 117 | ] 118 | } 119 | }, 120 | 121 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 122 | // "forwardPorts": [3000], 123 | 124 | // Use 'postCreateCommand' to run commands after the container is created. 125 | "postCreateCommand": "pip3 install --user -r .devcontainer/requirements.txt --use-pep517", 126 | 127 | // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 
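// (i.e. commenting out the "remoteUser" line below runs the container as root rather than the non-root "vscode" user)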
128 | "remoteUser": "vscode" 129 | } 130 | -------------------------------------------------------------------------------- /.devcontainer/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.5.2 2 | prefect==2.7.7 3 | prefect-sqlalchemy==0.2.2 4 | prefect-gcp[cloud_storage]==0.2.4 5 | protobuf 6 | pyarrow==10.0.1 7 | pandas-gbq==0.18.1 8 | psycopg2-binary==2.9.5 9 | sqlalchemy==1.4.46 10 | ipykernel 11 | polars 12 | dbt-core 13 | dbt-bigquery 14 | dbt-postgres 15 | dbt-snowflake 16 | pyspark 17 | # confluent-kafka==1.9.2 18 | snowflake-snowpark-python 19 | scikit-learn 20 | ruff 21 | sqlfluff -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.sql linguist-detectable=true 2 | *.yml linguist-detectable=true 3 | *.yml linguist-language=YAML 4 | *.ipynb linguist-detectable=false -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff Testing 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v2 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.x 17 | 18 | - name: Install dependencies 19 | run: pip install ruff 20 | 21 | - name: Test Ruff installation 22 | run: ruff --version 23 | 24 | - name: Run ruff 25 | run: ruff check ./Python/ 26 | -------------------------------------------------------------------------------- /.github/workflows/sqlfluff.yml: -------------------------------------------------------------------------------- 1 | name: SQLFluff Testing 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v2 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.x 17 | 18 | - name: Install dependencies 19 | run: pip install sqlfluff 20 | 21 | - name: Run SQLFluff 22 | run: git ls-files | grep \.sql | sqlfluff lint --dialect snowflake 23 | -------------------------------------------------------------------------------- /.github/workflows/yamllint-ci.yml: -------------------------------------------------------------------------------- 1 | name: yamllint testing 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v2 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.x 17 | 18 | - name: Install yamllint 19 | run: pip install yamllint 20 | 21 | - name: Run yamllint 22 | run: git ls-files | grep \.yml | yamllint . 
23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode/ 3 | /.ruff_cache 4 | /.ipynb_checkpoints 5 | Snowflake_Azure_Blob_Auto_ingest_Snowpipe.sql 6 | -------------------------------------------------------------------------------- /.sqlfluff: -------------------------------------------------------------------------------- 1 | [sqlfluff] 2 | 3 | # Supported dialects https://docs.sqlfluff.com/en/stable/dialects.html 4 | # Or run 'sqlfluff dialects' 5 | dialect = snowflake 6 | 7 | # One of [raw|jinja|python|placeholder] 8 | templater = jinja 9 | 10 | # Comma separated list of rules to exclude, or None 11 | # See https://docs.sqlfluff.com/en/stable/configuration.html#enabling-and-disabling-rules 12 | # AM04 (ambiguous.column_count) and ST06 (structure.column_order) are 13 | # two of the more controversial rules included to illustrate usage. 14 | exclude_rules = ambiguous.column_count, structure.column_order 15 | warnings = LT05 16 | 17 | # The standard max_line_length is 80 in line with the convention of 18 | # other tools and several style guides. Many projects however prefer 19 | # something a little longer. 20 | # Set to zero or negative to disable checks. 21 | max_line_length = 120 22 | 23 | # CPU processes to use while linting. 24 | # The default is "single threaded" to allow easy debugging, but this 25 | # is often undesirable at scale. 26 | # If positive, just implies number of processes. 27 | # If negative or zero, implies number_of_cpus - specified_number. 28 | # e.g. -1 means use all processors but one. 0 means all cpus. 29 | processes = -1 30 | 31 | # If using the dbt templater, we recommend setting the project dir. 32 | ; [sqlfluff:templater:dbt] 33 | ; project_dir = ./ 34 | 35 | [sqlfluff:indentation] 36 | # While implicit indents are not enabled by default. Many of the 37 | # SQLFluff maintainers do use them in their projects. 38 | allow_implicit_indents = true 39 | 40 | # The default configuration for aliasing rules is "consistent" 41 | # which will auto-detect the setting from the rest of the file. This 42 | # is less desirable in a new project and you may find this (slightly 43 | # more strict) setting more useful. 44 | [sqlfluff:rules:aliasing.table] 45 | aliasing = explicit 46 | [sqlfluff:rules:aliasing.column] 47 | aliasing = explicit 48 | [sqlfluff:rules:aliasing.length] 49 | min_alias_length = 3 50 | 51 | # The default configuration for capitalisation rules is "consistent" 52 | # which will auto-detect the setting from the rest of the file. This 53 | # is less desirable in a new project and you may find this (slightly 54 | # more strict) setting more useful. 55 | # Typically we find users rely on syntax highlighting rather than 56 | # capitalisation to distinguish between keywords and identifiers. 57 | # Clearly, if your organisation has already settled on uppercase 58 | # formatting for any of these syntax elements then set them to "upper". 
59 | # See https://stackoverflow.com/questions/608196/why-should-i-capitalize-my-sql-keywords-is-there-a-good-reason 60 | [sqlfluff:rules:capitalisation.keywords] 61 | capitalisation_policy = lower 62 | [sqlfluff:rules:capitalisation.identifiers] 63 | capitalisation_policy = lower 64 | [sqlfluff:rules:capitalisation.functions] 65 | extended_capitalisation_policy = lower 66 | [sqlfluff:rules:capitalisation.literals] 67 | capitalisation_policy = lower 68 | [sqlfluff:rules:capitalisation.types] 69 | extended_capitalisation_policy = lower 70 | -------------------------------------------------------------------------------- /.sqlfluffignore: -------------------------------------------------------------------------------- 1 | # SQLFluff doesn't work well with Snowflake loops, functions 2 | # or Python stored procedures. Ignoring those files here 3 | Snowflake_ForLoop_GrantPermissions.sql 4 | Snowflake_Python_Stored_Procedure_Example.sql 5 | Snowflake_Shorten_Huge_Union_Queries.sql 6 | Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql 7 | PUT.sql 8 | Snowflake_Time_Travel.sql 9 | -------------------------------------------------------------------------------- /.yamllint: -------------------------------------------------------------------------------- 1 | yaml-files: 2 | - '*.yml' 3 | - '*.yaml' 4 | - '.yamllint' 5 | 6 | rules: 7 | braces: enable 8 | brackets: enable 9 | colons: enable 10 | commas: enable 11 | comments: 12 | level: warning 13 | comments-indentation: 14 | level: warning 15 | document-end: disable 16 | document-start: disable 17 | empty-lines: enable 18 | empty-values: disable 19 | hyphens: enable 20 | indentation: enable 21 | key-duplicates: enable 22 | key-ordering: disable 23 | new-line-at-end-of-file: enable 24 | new-lines: disable 25 | octal-values: disable 26 | quoted-strings: disable 27 | trailing-spaces: enable 28 | truthy: 29 | level: warning 30 | # 120 chars should be enough, but don't fail if a line is longer 31 | line-length: 32 | max: 120 33 | level: warning 34 | -------------------------------------------------------------------------------- /CI_Examples/python-ci.yml: -------------------------------------------------------------------------------- 1 | name: Python Continuous Integration 2 | 3 | parameters: 4 | - name: jobName 5 | default: 'PythonCI' 6 | - name: jobDisplay 7 | default: 'Lint .py files with Ruff' 8 | 9 | trigger: 10 | branches: 11 | include: 12 | - '*' 13 | exclude: 14 | - main 15 | 16 | pool: 17 | vmImage: 'ubuntu-latest' 18 | 19 | jobs: 20 | - job: ${{ parameters.jobName }} 21 | timeoutInMinutes: 10 22 | displayName: ${{ parameters.jobDisplay }} 23 | 24 | workspace: 25 | clean: outputs 26 | 27 | steps: 28 | # Checkout repo 29 | - checkout: self 30 | fetchDepth: 0 31 | clean: true 32 | 33 | # List Pipeline directory and Build Source Version 34 | - script: | 35 | ls -R $(System.DefaultWorkingDirectory) 36 | displayName: List directory contents 37 | 38 | - script: | 39 | echo "Build.SourceVersion: $(Build.SourceVersion)" 40 | displayName: Print Build.SourceVersion 41 | 42 | # Install Ruff 43 | - script: | 44 | pip install ruff 45 | displayName: Install Ruff 46 | 47 | # Test Ruff Installation and list all .py files in repo 48 | - script: | 49 | ruff --version 50 | echo "All changes in this commit:" 51 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.py$' || 52 | echo "No Python files changed." 
53 | displayName: Test Ruff Install, List all .py files 54 | 55 | # Lint Python 56 | - script: | 57 | changed=( $(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.py$') ) 58 | if [[ "${#changed[@]}" -gt 0 ]]; then 59 | failed=false 60 | for filename in "${changed[@]}"; do 61 | if [[ -f "$filename" ]]; then 62 | echo "linting $filename" 63 | ruff check "$filename" || failed=true 64 | else 65 | echo "File not found: $filename" 66 | fi 67 | done 68 | if [[ $failed == true ]]; then 69 | exit 1 70 | fi 71 | fi 72 | displayName: Lint .py files 73 | -------------------------------------------------------------------------------- /CI_Examples/python-pr.yml: -------------------------------------------------------------------------------- 1 | name: Python Pull Request Check 2 | 3 | parameters: 4 | - name: jobName 5 | default: 'PythonCI' 6 | - name: jobDisplay 7 | default: 'Lint repo with Ruff + run all unit tests' 8 | 9 | trigger: 10 | branches: 11 | include: 12 | - main 13 | 14 | pool: 15 | vmImage: 'ubuntu-latest' 16 | 17 | jobs: 18 | - job: ${{ parameters.jobName }} 19 | timeoutInMinutes: 30 20 | displayName: ${{ parameters.jobDisplay }} 21 | 22 | workspace: 23 | clean: outputs 24 | 25 | steps: 26 | # Checkout repo 27 | - checkout: self 28 | fetchDepth: 1 29 | clean: true 30 | 31 | # Install Ruff 32 | - script: | 33 | pip install pytest pytest-azurepipelines pytest-cov ruff 34 | displayName: Install Pytest, Pytest Code Coverage and Ruff 35 | 36 | # Test Ruff Installation and list all files in repo 37 | - script: | 38 | echo "Ruff Version:" && ruff --version 39 | echo "Pytest Version:" && pytest --version 40 | echo "Pytest Coverage Version:" && pytest-cov --version 41 | echo "Pytest Azure Pipelines Version:" && pytest-azurepipelines --version 42 | git ls-files | grep '\.py$' 43 | displayName: Test Installs, List all files for CI 44 | 45 | # Lint SQL 46 | - script: | 47 | git ls-files | grep '\.py$' | ruff check . 48 | displayName: Analyzing the code with Ruff 49 | continueOnError: true 50 | 51 | - script: | 52 | pytest tests/ --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-report=xml 53 | displayName: Run all Python unit tests 54 | condition: always() 55 | continueOnError: false 56 | 57 | - task: PublishCodeCoverageResults@1 58 | inputs: 59 | codeCoverageTool: Cobertura 60 | summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml' 61 | -------------------------------------------------------------------------------- /CI_Examples/sqlfluff-ci.yml: -------------------------------------------------------------------------------- 1 | # Azure CI pipeline that lints new/modified SQL files after every push to a git repository. 
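# Roughly equivalent local check before pushing (assumes your changes are already committed):
#   git diff-tree --no-commit-id --name-only -r HEAD | grep '\.sql$' | xargs -r sqlfluff lint --dialect snowflake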
2 | # See the SQLFluff GitHub for more info: [SQLFluff](https://github.com/sqlfluff/sqlfluff) 3 | name: SQLFluff Continuous Integration 4 | 5 | parameters: 6 | - name: jobName 7 | default: 'SQLFluffCI' 8 | - name: jobDisplay 9 | default: 'Lint .sql files with SQLFluff' 10 | 11 | trigger: 12 | branches: 13 | include: 14 | - '*' 15 | exclude: 16 | - main 17 | 18 | pool: 19 | vmImage: 'ubuntu-latest' 20 | 21 | jobs: 22 | - job: ${{ parameters.jobName }} 23 | timeoutInMinutes: 10 24 | displayName: ${{ parameters.jobDisplay }} 25 | 26 | workspace: 27 | clean: outputs 28 | 29 | steps: 30 | # Checkout repo 31 | - checkout: self 32 | fetchDepth: 0 33 | clean: true 34 | 35 | # List Pipeline directory and Build Source Version 36 | - script: | 37 | ls -R $(System.DefaultWorkingDirectory) 38 | displayName: List directory contents 39 | 40 | - script: | 41 | echo "Build.SourceVersion: $(Build.SourceVersion)" 42 | displayName: Print Build.SourceVersion 43 | 44 | # Install SQLFluff 45 | - script: | 46 | pip install sqlfluff 47 | displayName: Install SQLFluff 48 | 49 | # Test SQLFluff Installation and list all .sql files in repo 50 | - script: | 51 | sqlfluff --version 52 | echo "All changes in this commit:" 53 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.sql$' || 54 | echo "No SQL files changed." 55 | displayName: Test SQLFluff Install, List all .sql files 56 | 57 | # Lint SQL 58 | - script: | 59 | changed=( $(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.sql$') ) 60 | if [[ "${#changed[@]}" -gt 0 ]]; then 61 | failed=false 62 | for filename in "${changed[@]}"; do 63 | if [[ -f "$filename" ]]; then 64 | echo "linting $filename" 65 | sqlfluff lint "$filename" --dialect snowflake || failed=true 66 | else 67 | echo "File not found: $filename" 68 | fi 69 | done 70 | if [[ $failed == true ]]; then 71 | exit 1 72 | fi 73 | fi 74 | displayName: Lint .sql files 75 | -------------------------------------------------------------------------------- /CI_Examples/sqlfluff-pr.yml: -------------------------------------------------------------------------------- 1 | # Azure CI pipeline that lints all SQL files during a PR. 
2 | # See the SQLFluff GitHub for more info: [SQLFluff](https://github.com/sqlfluff/sqlfluff) 3 | name: SQLFluff PR Check 4 | 5 | parameters: 6 | - name: jobName 7 | default: 'SQLFluffPR' 8 | - name: jobDisplay 9 | default: 'Lint repo with SQLFluff' 10 | 11 | trigger: 12 | branches: 13 | include: 14 | - main 15 | 16 | pool: 17 | vmImage: 'ubuntu-latest' 18 | 19 | jobs: 20 | - job: ${{ parameters.jobName }} 21 | timeoutInMinutes: 30 22 | displayName: ${{ parameters.jobDisplay }} 23 | 24 | workspace: 25 | clean: outputs 26 | 27 | steps: 28 | # Checkout repo 29 | - checkout: self 30 | fetchDepth: 1 31 | clean: true 32 | 33 | # Install SQLFluff 34 | - script: | 35 | pip install sqlfluff 36 | displayName: Download and Install SQLFluff 37 | 38 | # Test SQLFluff Installation and list all files in repo 39 | - script: | 40 | sqlfluff --version 41 | git ls-files | grep \.sql 42 | displayName: Test SQLFluff Install, List all files for CI 43 | 44 | # Lint SQL 45 | - script: | 46 | git ls-files | grep \.sql | sqlfluff lint --dialect snowflake 47 | displayName: Analyzing the code with SQLFluff 48 | -------------------------------------------------------------------------------- /CI_Examples/yamllint-ci.yml: -------------------------------------------------------------------------------- 1 | # Azure CI pipeline that lints YAML files in the dbt repository. 2 | # See the yamllint GitHub for more info: [yamllint](https://github.com/adrienverge/yamllint) 3 | name: yamllint Continuous Integration 4 | 5 | parameters: 6 | - name: jobName 7 | default: 'YAMLLintCI' 8 | - name: jobDisplay 9 | default: 'Lint .yml files with YAMLLint' 10 | 11 | trigger: 12 | branches: 13 | include: 14 | - '*' 15 | exclude: 16 | - main 17 | 18 | pool: 19 | vmImage: 'ubuntu-latest' 20 | 21 | jobs: 22 | - job: ${{ parameters.jobName}} 23 | timeoutInMinutes: 10 24 | displayName: ${{ parameters.jobDisplay }} 25 | 26 | workspace: 27 | clean: outputs 28 | 29 | steps: 30 | # Checkout repo 31 | - checkout: self 32 | fetchDepth: 0 33 | clean: true 34 | 35 | # List Pipeline directory and Build Source Version 36 | - script: | 37 | ls -R $(System.DefaultWorkingDirectory) 38 | displayName: List directory contents 39 | 40 | - script: | 41 | echo "Build.SourceVersion: $(Build.SourceVersion)" 42 | displayName: Print Build.SourceVersion 43 | 44 | # Install yamllint 45 | - script: | 46 | pip install yamllint 47 | displayName: Install yamllint 48 | 49 | # Test yamllint Installation and list all .yml files in repo 50 | - script: | 51 | yamllint --version 52 | echo "All changes in this commit:" 53 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.yml$' || 54 | echo "No YAML files changed." 55 | displayName: Test yamllint Install, List all .yml files 56 | 57 | # Lint YAML 58 | - script: | 59 | changed=( $(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.yml$') ) 60 | if [[ "${#changed[@]}" -gt 0 ]]; then 61 | failed=false 62 | for filename in "${changed[@]}"; do 63 | if [[ -f "$filename" ]]; then 64 | echo "linting $filename" 65 | yamllint "$filename" || failed=true 66 | else 67 | echo "File not found: $filename" 68 | fi 69 | done 70 | if [[ $failed == true ]]; then 71 | exit 1 72 | fi 73 | fi 74 | displayName: Lint .yml files 75 | -------------------------------------------------------------------------------- /CI_Examples/yamllint-pr.yml: -------------------------------------------------------------------------------- 1 | # Azure CI pipeline that lints all YAML files during a PR. 
2 | # See the yamllint GitHub for more info: [yamllint](https://github.com/adrienverge/yamllint) 3 | name: yamllint PR Check 4 | 5 | parameters: 6 | - name: jobName 7 | default: 'yamllintPR' 8 | - name: jobDisplay 9 | default: 'Lint .yml files with yamllint' 10 | 11 | trigger: 12 | branches: 13 | include: 14 | - main 15 | 16 | pool: 17 | vmImage: 'ubuntu-latest' 18 | 19 | jobs: 20 | - job: ${{ parameters.jobName}} 21 | timeoutInMinutes: 10 22 | displayName: ${{ parameters.jobDisplay }} 23 | 24 | workspace: 25 | clean: outputs 26 | 27 | steps: 28 | # Checkout repo 29 | - checkout: self 30 | fetchDepth: 1 31 | clean: true 32 | 33 | # Install yamllint 34 | - script: | 35 | pip install yamllint 36 | displayName: Download yamllint 37 | 38 | # Test yamllint installation and list all .yml files in the repo 39 | - script: | 40 | yamllint --version 41 | git ls-files | grep \.yml 42 | displayName: Test yamllint Install, list all .yml files 43 | 44 | # Lint YAML 45 | - script: | 46 | git ls-files | grep \.yml | yamllint . 47 | displayName: Lint .yml files 48 | -------------------------------------------------------------------------------- /Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # I wrote this Dockerfile to run the Python script inside of my container, but it doesn't work because of our Windows auth at work 2 | FROM python:3.8-slim 3 | 4 | ENV DEBIAN_FRONTEND="noninteractive"\ 5 | ACCEPT_EULA="y" 6 | 7 | # install system dependencies 8 | # Microsoft SQL Server Prerequisites 9 | RUN apt-get update -y \ 10 | && apt-get install -y gcc curl gnupg build-essential\ 11 | unixodbc unixodbc-dev tdsodbc freetds-common freetds-bin freetds-dev\ 12 | && curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - \ 13 | && curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list \ 14 | && apt-get update \ 15 | && apt-get install -y --no-install-recommends locales apt-transport-https\ 16 | && apt-get -y --no-install-recommends install msodbcsql18 unixodbc-dev 17 | 18 | WORKDIR /usr/src/app 19 | 20 | COPY requirements.txt ./ 21 | 22 | RUN pip install --no-cache-dir -r requirements.txt 23 | 24 | COPY . . 25 | 26 | CMD [ "python", "./SQL_Server_ForLoop.py" ] 27 | -------------------------------------------------------------------------------- /Docker/Populate_SQL_Server_Docker_Container.py: -------------------------------------------------------------------------------- 1 | """Title: Populate SQL Server Docker Container with production data 2 | By: Martin Palkovic 3 | Date: 2022-07-25 4 | Description: Recently I had a need for a small, lightweight SQL Server development 5 | environment where I could play around with data and not impact anything in production. 6 | This python script was my solution - it iteratively creates and populates tables 7 | in a test database that resides within a docker container. 8 | 9 | Due to our Windows auth at work, I couldn't get this to run in a docker-compose 10 | file (i.e within the container). 
The solution is to run docker-compose to initialize 11 | SQL Server in the container, and then run this script locally 12 | 13 | Exec in shell: 14 | cd your/file/location 15 | docker-compose up 16 | python3 Populate_SQL_Server_Docker_Container.py 17 | """ 18 | 19 | #import modules 20 | import pandas as pd 21 | from sqlalchemy.engine import URL 22 | from sqlalchemy import create_engine 23 | 24 | #server credentials - prod 25 | prod_server = 'prod_server' 26 | prod_db = 'prod_db' 27 | 28 | #server credentials - docker 29 | docker_server = 'localhost' 30 | docker_db = 'test_db' 31 | username = 'sa' 32 | password = 'Your-Strong!Password@Here%' 33 | #------------------------- 34 | driver = 'SQL Server' 35 | schema = 'dbo' 36 | 37 | def sqlalchemy_cnxn(driver, server, db): 38 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}" 39 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection}) 40 | engine = create_engine(url) 41 | return engine 42 | 43 | # SQLAlchemy for Prod 44 | prod_engine = sqlalchemy_cnxn(driver, prod_server, prod_db) 45 | 46 | # SQLAlchemy for Docker 47 | docker_engine = sqlalchemy_cnxn(driver, docker_server, docker_db) 48 | 49 | docker_engine.execute(''' 50 | if not exists (select 1 from sys.databases where name = N'test_db') 51 | create database test_db; 52 | ''' 53 | ) 54 | 55 | """create a list of each table in the database, 56 | and remove table names from the list that contain numbers 57 | (i.e duplicates/backups with dates on the end) 58 | If you only want certain tables, you can manipulate this list however you like. 59 | Only table names on this list will be queried from your prod database in the 60 | for loop below""" 61 | prod_tables = [table for table in prod_engine.table_names()] 62 | prod_tables = [i for i in prod_tables if not any(char.isdigit() for char in i)] 63 | 64 | # This block is needed to connect to the db now that we have created it 65 | docker_engine = sqlalchemy_cnxn(driver, docker_server, docker_db) 66 | 67 | """iterate over each table to populate the Docker container 68 | Note that this takes ~1 min per 50 tables""" 69 | for table in prod_tables: 70 | try: 71 | #read 72 | query = f'select top 1000 * from {prod_db}.{schema}.{table}' 73 | results = prod_engine.execute(query) 74 | df_sql = pd.read_sql(query, prod_engine) 75 | 76 | #write 77 | df_sql.to_sql(f'{table}', schema= f'{schema}', 78 | con = docker_engine, chunksize=1, 79 | index=False, if_exists='replace') 80 | except Exception: 81 | print(f'failed to insert {table} to docker container') -------------------------------------------------------------------------------- /Docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.9' 2 | 3 | services: 4 | SQL-Server: 5 | image: mcr.microsoft.com/mssql/server:2022-latest 6 | container_name: SQL_Server_Dev_Environment 7 | restart: unless-stopped 8 | ports: 9 | - "1433:1433" 10 | environment: 11 | - ACCEPT_EULA=Y 12 | - SA_PASSWORD=-Your-Strong!Password@Here% 13 | 14 | # I cant actually get this to work due to our Windows auth/active directory situation at Cooke... 
15 | # i.e from within the container, my script doesn't know how to authenticate to our production SQL server 16 | # python: 17 | # container_name: SQL_Server_Python_Script 18 | # build: ./ 19 | # command: python3 ./SQL_Server_ForLoop.py 20 | -------------------------------------------------------------------------------- /Docker/requirements.txt: -------------------------------------------------------------------------------- 1 | pyodbc 2 | sqlalchemy 3 | pandas 4 | numpy -------------------------------------------------------------------------------- /Fivetran/disable_tables_with_zero_rows_fivetran_api.py: -------------------------------------------------------------------------------- 1 | """ Import Modules """ 2 | import os 3 | import json 4 | import requests 5 | import pandas as pd 6 | 7 | from sqlalchemy.engine import URL 8 | from sqlalchemy import create_engine 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | # Retrieve Fivetran secrets 14 | fivetran_key = os.getenv("FIVETRAN_KEY") 15 | fivetran_secret = os.getenv("FIVETRAN_SECRET") 16 | 17 | # -------------------------------------------- 18 | """ Retrieve list of Fivetran connector IDs""" 19 | 20 | # Define API variables 21 | group_id = "my_fivetran_group_id" 22 | url = "https://api.fivetran.com/v1/groups/" + group_id + "/connectors" 23 | headers = {"Accept": "application/json"} 24 | 25 | # API GET request 26 | response = requests.get(url, headers=headers, auth=(fivetran_key, fivetran_secret)) 27 | data = response.json() 28 | 29 | # Save Fivetran connector list to file 30 | with open("fivetran_connector_list.json", "w") as file: 31 | json.dump(data, file, indent=4) 32 | 33 | # Create a dictionary containing the database name(key) and connector ID(value) 34 | connector_id_dict = { 35 | item["schema"].upper() 36 | if item["schema"] != "db_name_you_want_capitalized" 37 | else item["schema"].capitalize(): item["id"] 38 | for item in data["data"]["items"] 39 | } 40 | 41 | print( 42 | f"""Dictionary of connector ID's for Fivetran databases: 43 | {connector_id_dict} \n""" 44 | ) 45 | 46 | # ------------------------------------------------------------------ 47 | """ Establish SQL Server Connection""" 48 | 49 | # Define variables 50 | driver = "SQL Server" 51 | server = "my_server" 52 | 53 | # Define connection function 54 | def sqlalchemy_cnxn(driver, server, db): 55 | """ Function for connecting to SQL Server via SQLAlchemy """ 56 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}" 57 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection}) 58 | engine = create_engine(url) 59 | return engine 60 | 61 | # ------------------------------------------------------------ 62 | """ Loop over list of databases/connector IDs to retrive tables 63 | with 0 rows from SQL server, and call a PATCH request with the Fivetran API 64 | to disable tables with 0 rows for that connector""" 65 | 66 | for database in connector_id_dict.keys(): 67 | engine = sqlalchemy_cnxn(driver, server, database) 68 | 69 | print(f"successfully connected to {server}.{database}!\n") 70 | print() # new line 71 | 72 | # Query the sys schema for the database to get tables with 0 rows of data 73 | query = f""" 74 | SELECT 75 | t.NAME AS TableName, 76 | p.rows AS RowCounts 77 | FROM {database}.sys.tables AS t 78 | 79 | INNER JOIN {database}.sys.partitions AS p 80 | ON t.object_id = p.OBJECT_ID 81 | 82 | WHERE 83 | t.NAME NOT LIKE 'dt%' 84 | AND t.is_ms_shipped = 0 85 | AND p.rows = 0 86 | 87 | GROUP BY 88 | t.Name, p.Rows 89 | 90 | ORDER BY 
91 | t.Name 92 | """ 93 | 94 | # load results of query to Pandas dataframe 95 | df = pd.read_sql(query, engine) 96 | 97 | print(f"tables with 0 rows of data in {database} database: {len(df)}\n") 98 | 99 | tables_to_unsync = df["TableName"].tolist() 100 | 101 | # Create a JSON payload of tables to disable 102 | tables_payload = {table_name: {"enabled": False} for table_name in tables_to_unsync} 103 | payload = {"enabled": True, "tables": tables_payload} 104 | 105 | # For testing, if needed 106 | # with open(f"{database}_payload.json", "w") as file: 107 | # json.dump(payload, file, indent = 4) 108 | 109 | # # ###################################### 110 | """ Fivetran API Call to disable tables""" 111 | 112 | connector_id = connector_id_dict[database] 113 | print(f"Connector ID for {database}: {connector_id}\n") 114 | 115 | schema_name = "dbo" 116 | url = ( 117 | "https://api.fivetran.com/v1/connectors/" 118 | + connector_id 119 | + "/schemas/" 120 | + schema_name 121 | ) 122 | 123 | headers = {"Content-Type": "application/json", "Accept": "application/json"} 124 | 125 | """Fivetran API call - comment this block if you are testing the script""" 126 | response = requests.patch(url, 127 | json = payload, 128 | headers = headers, 129 | auth = (fivetran_key, fivetran_secret)) 130 | 131 | data = response.json() 132 | print(f"Successfully called the Fivetran API for the {connector_id} connector!\n") 133 | 134 | # For testing, if needed 135 | # with open('fivetran_api_response.json', 'w') as file: 136 | # file.write(str(data)) 137 | # print(f"Successfully saved logs to file!\n") 138 | 139 | # break #LEAVE THIS IN IF YOU ARE TESTING 140 | -------------------------------------------------------------------------------- /Python/Snowflake_Insert_Statements.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | df = pd.read_csv(r"your/file/here.csv") 5 | df = df.replace({np.nan: "NULL"}) 6 | 7 | print("successfully read csv!\n") 8 | 9 | 10 | def sql_insert_statement_from_dataframe(source, target): 11 | print("insert into " + target + "(" + str(", ".join(source.columns)) + ") values ") 12 | for i, x in source.iterrows(): 13 | values = x.values 14 | formatted_values = [] 15 | for val in values: 16 | if val == "NULL": 17 | formatted_values.append(val) 18 | else: 19 | formatted_values.append("'" + str(val) + "'") 20 | if i == len(source) - 1: 21 | print("(" + str(", ".join(formatted_values)) + ");") 22 | else: 23 | print("(" + str(", ".join(formatted_values)) + "),") 24 | 25 | 26 | sql_insert_statement_from_dataframe(df, "my_db.my_schema.my_table") 27 | -------------------------------------------------------------------------------- /Python/Snowpark_Backload_API_Data.py: -------------------------------------------------------------------------------- 1 | # **********************************************************************# 2 | # Title: Backload API data using Snowpark Python 3 | # By: Martin Palkovic 4 | # Date: 2022-11-18 5 | # Description: Here is another Snowpark example, where you can loop through 6 | # an API call and insert the JSON response for each days worth of data 7 | # into a VARIANT table in Snowflake 8 | # *********************************************************************# 9 | 10 | # Import modules 11 | import os 12 | import json 13 | import requests 14 | 15 | from datetime import date, timedelta 16 | from snowflake.snowpark import Session 17 | 18 | from dotenv import load_dotenv 19 | 20 | load_dotenv() 
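# The .env file is expected to provide SNOWFLAKE_ACCT, SNOWFLAKE_USER, SNOWFLAKE_PASSWORD,
# SNOWFLAKE_ROLE and MY_API_KEY (see the os.getenv calls below). The VARIANT landing table
# referred to as MY_TABLE would look roughly like this (illustrative DDL, not created by this script):
#   CREATE TABLE IF NOT EXISTS MY_TABLE (JSON_DATA VARIANT, INSERT_DATE TIMESTAMP_LTZ);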
21 | 22 | # Establish Snowflake Connection using Snowpark 23 | account = os.getenv("SNOWFLAKE_ACCT") 24 | user = os.getenv("SNOWFLAKE_USER") 25 | password = os.getenv("SNOWFLAKE_PASSWORD") 26 | role = os.getenv("SNOWFLAKE_ROLE") 27 | role = "SYSADMIN" 28 | warehouse = "MY_WH" 29 | database = "DEV" 30 | schema = "MY_SCHEMA" 31 | target_table = "MY_TABLE" 32 | 33 | api_key = os.getenv("MY_API_KEY") 34 | 35 | 36 | def snowpark_cnxn(account, user, password, role, warehouse, database, schema): 37 | connection_parameters = { 38 | "account": account, 39 | "user": user, 40 | "password": password, 41 | "role": role, 42 | "warehouse": warehouse, 43 | "database": database, 44 | "schema": schema, 45 | } 46 | session = Session.builder.configs(connection_parameters).create() 47 | return session 48 | 49 | 50 | session = snowpark_cnxn(account, user, password, role, warehouse, database, schema) 51 | 52 | print( 53 | session.sql( 54 | "SELECT CURRENT_WAREHOUSE(), CURRENT_DATABASE(), CURRENT_SCHEMA()" 55 | ).collect() 56 | ) 57 | 58 | # API variables 59 | headers = {"APIKey": f"{api_key}"} 60 | 61 | 62 | # Define a function so we can loop over a date range 63 | def daterange(start_date, end_date): 64 | for n in range(int((end_date - start_date).days)): 65 | yield start_date + timedelta(n) 66 | 67 | 68 | start_date = date(2019, 1, 1) 69 | end_date = date(2022, 11, 18) 70 | 71 | # Loop through 4 years worth of API data, insert into Snowflake VARIANT table 72 | for dates in daterange(start_date, end_date): 73 | url = f"https://api.mywebsite.com/api/data?&startDate={date}&endDate={date}" 74 | response = requests.request("GET", url, headers=headers) 75 | 76 | formatted_json = json.loads(response.text) 77 | formatted_json = json.dumps(formatted_json, indent=4) 78 | 79 | # insert to Snowflake 80 | session.sql( 81 | f"""INSERT INTO {target_table} (JSON_DATA, INSERT_DATE) 82 | SELECT PARSE_JSON('{formatted_json}'), 83 | CURRENT_TIMESTAMP();""" 84 | ).collect() 85 | -------------------------------------------------------------------------------- /Python/Snowpark_Create_Stored_Procedure.py: -------------------------------------------------------------------------------- 1 | # This only runs on a Python 3.8 environment 2 | 3 | # import modules 4 | import os 5 | import snowflake 6 | import pandas as pd 7 | 8 | from snowflake.snowpark import Session 9 | from snowflake.snowpark.types import StringType 10 | 11 | from dotenv import load_dotenv 12 | 13 | load_dotenv() 14 | 15 | # Establish Snowflake Connection 16 | account = os.getenv("SNOWFLAKE_ACCT") 17 | user = os.getenv("SNOWFLAKE_USER") 18 | password = os.getenv("SNOWFLAKE_PASSWORD") 19 | role = os.getenv("SNOWFLAKE_ROLE") 20 | warehouse = "REPORTING_WH" 21 | database = "STAGING_DEV" 22 | schema = "MISC" 23 | 24 | 25 | def snowpark_cnxn(account, user, password, role, warehouse, database, schema): 26 | connection_parameters = { 27 | "account": account, 28 | "user": user, 29 | "password": password, 30 | "role": role, 31 | "warehouse": warehouse, 32 | "database": database, 33 | "schema": schema, 34 | } 35 | session = Session.builder.configs(connection_parameters).create() 36 | return session 37 | 38 | 39 | print("Connecting to Snowpark...\n") 40 | session = snowpark_cnxn(account, user, password, role, warehouse, database, schema) 41 | 42 | print( 43 | session.sql( 44 | "select current_warehouse(), current_database(), current_schema()" 45 | ).collect(), 46 | "\n", 47 | ) 48 | print("Connected!\n") 49 | 50 | session.sql( 51 | """create or replace table 52 | mytable(amount 
number comment 'fake amounts for testing', 53 | fruits string comment 'fake types of fruit for testing')""" 54 | ).show() 55 | 56 | session.sql("""create or replace table mytable2 like mytable""").show() 57 | 58 | session.sql( 59 | """insert into mytable values (1, 'apple'), 60 | (2, 'orange'), 61 | (5, 'grape'), 62 | (7, 'cantelope'), 63 | (9, 'pineapple'), 64 | (17, 'banana'), 65 | (21, 'tangerine')""" 66 | ).show() 67 | 68 | session.sql( 69 | """insert into mytable2 values (1, 'apple'), 70 | (3, 'orange'), 71 | (5, 'grape'), 72 | (7, 'strawberry'), 73 | (10, 'pineapple'), 74 | (17, 'banana'), 75 | (22, 'raspberry')""" 76 | ).show() 77 | 78 | 79 | def print_differences( 80 | session: snowflake.snowpark.Session, 81 | table1: str, 82 | table2: str, 83 | field1: str, 84 | field2: str, 85 | ): 86 | # read the tables into a snowpark dataframe 87 | table1 = session.table(table1) 88 | table2 = session.table(table2) 89 | 90 | # convert to pandas 91 | df1 = table1.to_pandas() 92 | df2 = table2.to_pandas() 93 | 94 | # convert the the fields of interest from each table to a list 95 | list1 = df1[field1].to_list() 96 | list2 = df2[field2].to_list() 97 | 98 | return ", ".join(item for item in list1 if item not in list2) 99 | 100 | 101 | session.add_packages("snowflake-snowpark-python") 102 | 103 | print("Registering Stored Procedure with Snowflake...\n") 104 | 105 | session.sproc.register( 106 | func=print_differences, 107 | return_type=StringType(), 108 | input_types=[StringType(), StringType(), StringType(), StringType()], 109 | is_permanent=True, 110 | name="PRINT_DIFFERENCES", 111 | replace=True, 112 | stage_location="@UDF_STAGE", 113 | ) 114 | 115 | print("Stored Procedure registered with Snowflake!\n") 116 | 117 | # You can return the results on one line using the sql() method: 118 | """session.sql('''call print_differences('MYTABLE', 119 | 'MYTABLE2', 120 | 'FRUITS', 121 | 'FRUITS')''').show()""" 122 | 123 | # Call stored procedure, print results as dataframe 124 | x = session.call("print_differences", "MYTABLE", "MYTABLE2", "FRUITS", "FRUITS") 125 | print(x, "\n") 126 | 127 | df = pd.DataFrame({"Differences": x.split(",")}) 128 | print(df) 129 | -------------------------------------------------------------------------------- /Python/Snowpark_Example_Backload_SQL_Server_Data.py: -------------------------------------------------------------------------------- 1 | # **********************************************************************# 2 | # Title: Basic Snowpark Example for backloading data to Snowflake 3 | # By: Martin Palkovic 4 | # Date: 2022-11-18 5 | # Description: Recently I needed to backload some exchange rate data into Snowflake from 6 | # SQL Server, and was excited because I got to test out Snowpark! It is a really nice 7 | # way to interact with Snowflake using Python. 
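# The flow below: read the source table from SQL Server via SQLAlchemy, reshape it in
# pandas, stage it as a temporary Snowpark table, then either INSERT OVERWRITE or
# MERGE it into the target Snowflake table.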
8 | # *********************************************************************# 9 | 10 | # Import modules 11 | import os 12 | from sqlalchemy.engine import URL 13 | from sqlalchemy import create_engine 14 | 15 | import pandas as pd 16 | 17 | from snowflake.snowpark import Session 18 | 19 | from dotenv import load_dotenv 20 | 21 | load_dotenv() 22 | 23 | # Establish SQL Server Connection 24 | driver = "SQL Server" 25 | server = "my_server" 26 | database = "my_db" 27 | schema = "dbo" 28 | table = "Daily_Exchange_Rates" 29 | 30 | 31 | # Define connection function 32 | def sqlalchemy_cnxn(driver, server, db): 33 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}" 34 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection}) 35 | engine = create_engine(url) 36 | return engine 37 | 38 | 39 | engine = sqlalchemy_cnxn(driver, server, database) 40 | 41 | # If you're not performing any data transformation at the 42 | # SQL Server level, this is a great way to parameterize column names 43 | columns = f"""SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS 44 | WHERE TABLE_NAME LIKE N'{table}'""" 45 | 46 | df_cols = pd.read_sql(columns, engine) 47 | columns = ", ".join(df_cols["COLUMN_NAME"].to_list()) 48 | 49 | query = f"""SELECT {columns} FROM {database}.{schema}.{table}""" 50 | 51 | # load query to dataframe 52 | df_fx = pd.read_sql(query, engine) 53 | print("Total records from SQL Server:", len(df_fx)) 54 | 55 | # -------------------------------------------- 56 | 57 | # Establish Snowpark Connection 58 | account = os.getenv("SNOWFLAKE_ACCT") 59 | user = os.getenv("SNOWFLAKE_USER") 60 | password = os.getenv("SNOWFLAKE_PASSWORD") 61 | role = os.getenv("SNOWFLAKE_ROLE") 62 | warehouse = "REPORTING_WH" 63 | database = "DEV" 64 | schema = "MY_SCHEMA" 65 | target_table = "CURRENCY_EXCHANGE_RATES" 66 | temp_table = "FX_RATE_TEMP" 67 | 68 | 69 | def snowpark_cnxn(account, user, password, role, warehouse, database, schema): 70 | connection_parameters = { 71 | "account": account, 72 | "user": user, 73 | "password": password, 74 | "role": role, 75 | "warehouse": warehouse, 76 | "database": database, 77 | "schema": schema, 78 | } 79 | session = Session.builder.configs(connection_parameters).create() 80 | return session 81 | 82 | 83 | session = snowpark_cnxn(account, user, password, role, warehouse, database, schema) 84 | 85 | print( 86 | session.sql( 87 | "select current_warehouse(), current_database(), current_schema()" 88 | ).collect() 89 | ) 90 | 91 | # --------------------------------------------------------------------- 92 | 93 | # Transform the data (if needed) to match the format that is required for Snowflake 94 | # In my case, the data in the source data did not match what I needed 95 | # for Snowflake. 
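# For illustration (sample values only, not real data): the source exposes a combined
# currency pair such as EXGTBLID_TRANSFORMED = 'USD-CAD' alongside EXCHDATE and XCHGRATE,
# while the target expects separate FROM_CURRENCY, TO_CURRENCY, EFFECTIVE_START,
# EFFECTIVE_STOP, RATE and STAGE_DATE columns; hence the reshaping below.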
96 | 97 | df_sf = pd.DataFrame() 98 | 99 | df_sf[["FROM_CURRENCY", "TO_CURRENCY"]] = df_fx["EXGTBLID_TRANSFORMED"].str.split( 100 | "-", 1, expand=True 101 | ) 102 | df_sf = df_sf[ 103 | df_sf["TO_CURRENCY"].str.contains("|".join(["AVG", "BUY", "SELL", "ALL"])) is False 104 | ] # drops rows that contain junk data 105 | 106 | df_sf["EFFECTIVE_START"] = df_fx["EXCHDATE"].dt.strftime("%Y-%m-%d %H:%m:%s.%S") 107 | df_sf["EFFECTIVE_STOP"] = ( 108 | df_fx["EXCHDATE"] + pd.DateOffset(days=7, hours=23, minutes=59) 109 | ).dt.strftime("%Y-%m-%d %H:%m:%s.%S") 110 | 111 | df_sf["RATE"] = df_fx["XCHGRATE"] 112 | 113 | # Get current datetime 114 | df_sf["STAGE_DATE"] = pd.Timestamp.now() 115 | df_sf["STAGE_DATE"] = df_sf["STAGE_DATE"].dt.strftime("%Y-%m-%d %H:%m:%s.%S") 116 | 117 | # strip all whitespace from every field 118 | df_sf = df_sf.apply(lambda x: x.str.strip() if x.dtype == "object" else x) 119 | print("Total records after transformations:", len(df_sf)) 120 | 121 | columns = ", ".join(df_sf.columns) 122 | 123 | # Create Snowpark DataFrame 124 | df = session.create_dataframe(df_sf) 125 | 126 | df.write.mode("overwrite").save_as_table( 127 | f"{temp_table}", column_order="name", table_type="temporary" 128 | ) 129 | 130 | session.sql(f"SELECT COUNT(*) FROM {temp_table}").collect() 131 | 132 | # OPTION 1: Overwrite + insert new data 133 | session.sql( 134 | f"""INSERT OVERWRITE INTO {target_table} ({columns}) 135 | SELECT {columns} FROM {temp_table}""" 136 | ).collect() 137 | 138 | # ------------------------------------------------------------- 139 | 140 | # OPTION 2: Incremental load 141 | session.sql( 142 | f"""MERGE INTO {target_table} Dest 143 | USING ( 144 | SELECT {columns} FROM {temp_table} 145 | QUALIFY ROW_NUMBER() OVER ( 146 | PARTITION BY MY_KEY 147 | ORDER BY DATE ASC) = 1 148 | ) Source 149 | ON Dest.MY_KEY = Source.MY_KEY 150 | AND Dest.FROM_CURRENCY = Source.FROM_CURRENCY 151 | AND Dest.TO_CURRENCY = Source.TO_CURRENCY 152 | WHEN MATCHED THEN UPDATE 153 | SET Dest.FROM_CURRENCY = Source.FROM_CURRENCY 154 | , Dest.TO_CURRENCY = Source.TO_CURRENCY 155 | , Dest.DATE = Source.DATE 156 | , Dest.RATE = Source.RATE 157 | , Dest.STAGE_DATE = Source.STAGE_DATE 158 | 159 | WHEN NOT MATCHED THEN INSERT( 160 | FROM_CURRENCY 161 | , TO_CURRENCY 162 | , DATE 163 | , RATE 164 | , STAGE_DATE 165 | ) 166 | VALUES( 167 | Source.FROM_CURRENCY 168 | , Source.TO_CURRENCY 169 | , Source.EFFECTIVE_START 170 | , Source.RATE 171 | , Source.STAGE_DATE 172 | ) 173 | """ 174 | ).collect() 175 | -------------------------------------------------------------------------------- /Python/Stack.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MartyC-137/Data-Engineering/d5c850def89f2cc2f28b88b713313486ab20a9e7/Python/Stack.ipynb -------------------------------------------------------------------------------- /Python/compare_two_lists_for_differences.py: -------------------------------------------------------------------------------- 1 | """Compare two lists for differences 2 | By: Martin Palkovic 3 | Date: 2022-02-09""" 4 | # ------------------------------ 5 | # a common work task is to compare two database ID fields 6 | # against each other to determine which records exist 7 | # in one table but not another. This operation can take 10+ 8 | # minutes to run in SQl and is syntactically heavy, but is 9 | # fast and easy in Python. 
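# An alternative sketch using sets (assumes the IDs are hashable; not part of the original snippet):
# in_list1_only = set(list1) - set(list2)      # items in list1 but not in list2
# in_either_not_both = set(list1) ^ set(list2) # items unique to either table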
10 | 11 | 12 | # Copy and paste your fields below 13 | # to identify records that are unique to one of the tables 14 | 15 | list1 = ["red", "blue", "yellow", 7, 25] # copy and paste your values into here 16 | list2 = ["yellow", 7, "blue", 1, 5.4] 17 | 18 | # returns items that are in list1 but not in list2 19 | list_difference = [item for item in list1 if item not in list2] 20 | print(list_difference) 21 | -------------------------------------------------------------------------------- /Python/connecting_to_snowflake_using_python.py: -------------------------------------------------------------------------------- 1 | """ Import Modules """ 2 | import os 3 | from dotenv import load_dotenv 4 | from snowflake import connector 5 | # import pandas as pd 6 | 7 | load_dotenv() 8 | 9 | # establish connection to Snowflake using .env file 10 | connection = connector.connect( 11 | user=os.getenv("SNOWFLAKE_USER"), 12 | password=os.getenv("SNOWFLAKE_PASSWORD"), 13 | account=os.getenv("SNOWFLAKE_ACCT"), 14 | role=os.getenv("SNOWFLAKE_ROLE"), 15 | warehouse="REPORTING_WH", 16 | ) 17 | 18 | # sample SQL query, paste whatever you'd like in here 19 | sql_query = "select * from database.schema.table limit 10;" 20 | 21 | # execute the query 22 | cursor = connection.cursor() 23 | cursor.execute(sql_query) 24 | 25 | # load the data in to Pandas 26 | df = cursor.fetch_pandas_all() 27 | df.head() 28 | -------------------------------------------------------------------------------- /Python/connecting_to_sql_server_using_python.py: -------------------------------------------------------------------------------- 1 | # import modules 2 | import pyodbc 3 | import pandas as pd 4 | 5 | # set all rows and columns visible 6 | # pd.set_option('display.max_columns', None) 7 | # pd.set_option('display.max_rows', None) 8 | 9 | 10 | # server credentials 11 | server = "server" 12 | database = "database" 13 | 14 | # sql connection - uses AD to authenticate 15 | cnxn = pyodbc.connect( 16 | Trusted_Connection="Yes", Driver="{SQL Server}", Server=server, Database=database 17 | ) 18 | cursor = cnxn.cursor() 19 | 20 | # stick your query inside the triple quotes 21 | query = """select top 10 * from database.dbo.table""" 22 | 23 | # load query to dataframe 24 | df_sql = pd.read_sql(query, cnxn) 25 | df_sql.head() 26 | -------------------------------------------------------------------------------- /Python/determine_sql_field_length.py: -------------------------------------------------------------------------------- 1 | """Determing the maximum Length of a field for database table design 2 | By: Martin Palkovic 3 | Date: 2022-02-04 4 | 5 | When building ETL/Integration jobs to Snowflake (or building any SQL table), 6 | you need to designate how many characters are allowed in a field. 
I like to use 7 | Python to quantitatively answer this question rather than manually counting or 8 | guessing how many characters to allow in a varchar field """ 9 | 10 | #import modules 11 | import pyodbc 12 | import pandas as pd 13 | 14 | #set all rows and columns visible 15 | #pd.set_option('display.max_columns', None) 16 | #pd.set_option('display.max_rows', None) 17 | 18 | #server credentials 19 | server = 'server' 20 | database = 'database' 21 | 22 | #sql connection 23 | cnxn = pyodbc.connect( 24 | Trusted_Connection= 'Yes', 25 | Driver= '{SQL Server}', 26 | Server= server, 27 | Database= database 28 | ) 29 | cursor = cnxn.cursor() 30 | 31 | """stick your query inside the triple quotes""" 32 | 33 | query = """SELECT * FROM """ 34 | 35 | #load query to dataframe 36 | df_sql = pd.read_sql(query, cnxn) 37 | df_sql.head() 38 | 39 | """Example""" 40 | #Field of Interest 41 | foi = 'Item_Key' 42 | print('{} maximum record length ='.format(foi), 43 | max(df_sql[foi].astype(str).map(len)), 'characters') 44 | # Output: Item_Key maximum record length = 19 characters 45 | 46 | #Or run a for loop to get values for every column: 47 | for c in df_sql.columns: 48 | print('{} maximum record length ='.format(c), 49 | max(df_sql[c].astype(str).map(len)), 'characters', 50 | 'data type = {}'.format(df_sql[c].dtype)) 51 | 52 | #object == varchar 53 | """ 54 | Company maximum record length = 18 characters , data type = object 55 | Company_Key maximum record length = 4 characters , data type = object 56 | Site_Key maximum record length = 4 characters , data type = object 57 | Item_Key maximum record length = 19 characters , data type = object 58 | Item_Description maximum record length = 100 characters , data type = object 59 | Species maximum record length = 15 characters , data type = object 60 | Standard_Cost maximum record length = 8 characters , data type = float64 61 | Current_Cost maximum record length = 8 characters , data type = float64 62 | Category maximum record length = 16 characters , data type = object 63 | Sub_Category maximum record length = 22 characters , data type = object 64 | Size maximum record length = 8 characters , data type = object 65 | Grade maximum record length = 7 characters , data type = object 66 | Country_Of_Origin maximum record length = 15 characters , data type = object 67 | Pallet maximum record length = 10 characters , data type = object 68 | Bin maximum record length = 15 characters , data type = object 69 | Order_Allocation maximum record length = 15 characters , data type = object 70 | Production_Date maximum record length = 10 characters , data type = datetime64[ns] 71 | Production_Age maximum record length = 4 characters , data type = int64 72 | Lot_Date maximum record length = 10 characters , data type = datetime64[ns] 73 | Lot_Age maximum record length = 7 characters , data type = float64 74 | Weight maximum record length = 18 characters , data type = float64 75 | Cases maximum record length = 9 characters , data type = float64 76 | """ -------------------------------------------------------------------------------- /Python/load_json_to_snowflake.py: -------------------------------------------------------------------------------- 1 | """Example script to load multiple JSONs to a named Snowflake staging area, 2 | then copy the JSONs into a Snowflake table 3 | By: Martin Palkovic 4 | Date: 2022-07-28 5 | Description: Sometimes in a dev environment, 6 | I need to manipulate a JSON file to see the effect those changes 7 | will have on my data pipeline. 
Here's a quick script I wrote 8 | to batch load json files into Snowflake, after I've altered some of the fields 9 | """ 10 | 11 | import os 12 | from snowflake import connector 13 | 14 | from dotenv import load_dotenv 15 | 16 | load_dotenv() 17 | 18 | # folder containing your json files 19 | root = r"C:\Directory\containing\JSON\files" 20 | 21 | # Connect to your Snowflake account 22 | cnxn = connector.connect( 23 | user=os.getenv("SNOWFLAKE_USER"), 24 | password=os.getenv("SNOWFLAKE_PASSWORD"), 25 | account=os.getenv("SNOWFLAKE_ACCT"), 26 | role=os.getenv("SNOWFLAKE_ROLE"), 27 | warehouse="REPORTING_WH", 28 | ) 29 | 30 | cursor = cnxn.cursor() 31 | cursor.execute("create or replace stage MY_STAGE;") 32 | cursor.execute("use role SYSADMIN;") 33 | 34 | for file in os.listdir(root): 35 | full_path = os.path.join(root, file) 36 | cursor.execute(f"put file://{full_path} @MY_STAGE;") 37 | 38 | copy_statement = file + ".gz" 39 | cursor.execute( 40 | f"""copy into EXAMPLE_TABLE (JSON_DATA, INSERT DATE) 41 | from (select t.$1, 42 | current_timestamp() 43 | from @MY_STAGE/{copy_statement} t) 44 | file_format = (type = JSON);""" 45 | ) 46 | cursor.close() 47 | cnxn.close() 48 | -------------------------------------------------------------------------------- /Python/parse_xml_compare_differences.py: -------------------------------------------------------------------------------- 1 | """ 2 | Name: Parse XML, extract a field, compare that field to a field from a csv for diffs 3 | By: Martin Palkovic 4 | Date: 2022-08-18 5 | Description: 6 | """ 7 | 8 | # Import Modules 9 | import pandas as pd 10 | 11 | # Paste your xml here 12 | xml = """ 13 | 14 | 15 | 16 | 17 | 1 18 | warehouse1 19 | 1 20 | 127 21 | 9.16 22 | 08/16/2022 15:38:55 23 | 24 | 25 | 26 | 27 | 2 28 | warehouse2 29 | 2 30 | 450 31 | 13.3 32 | 08/17/2022 15:39:26 33 | 34 | 35 | 36 | 37 | """ 38 | 39 | # Parse XML 40 | df = pd.read_xml(xml, xpath=".//Property") 41 | 42 | # Extract only the columns we need from the XML 43 | df_pallet = df.loc[df["name"] == "Pallet"] 44 | 45 | # Read CSV 46 | df_csv = pd.read_csv(r"your_csv_here.csv") 47 | 48 | # Convert values to Python list, cast to integer 49 | pallet = df_pallet["Property"].tolist() 50 | pallet = [int(i) for i in pallet] 51 | csv = df_csv["Pallet"].tolist() 52 | 53 | # Compare differences 54 | print([i for i in pallet if i not in csv]) 55 | -------------------------------------------------------------------------------- /Python/pull_records_for_all_sql_tables.py: -------------------------------------------------------------------------------- 1 | """Title: Data Pull for all views in SQL database 2 | By: Martin Palkovic 3 | Date: 2022-11-08 4 | Description: Script to loop through every view in my_db and pull 100 records. 
5 | The Business Analyst for a project at work asked for the structure of 6 | each my_db table, this was the fastest way to do it 7 | """ 8 | 9 | # import modules 10 | from sqlalchemy.engine import URL 11 | from sqlalchemy import create_engine 12 | 13 | import pandas as pd 14 | 15 | # SQL Server Connection - uses Active Directory to authenticate 16 | driver = "SQL Server" 17 | server = "my_server" 18 | database = "my_db" 19 | schema = "dbo" 20 | 21 | 22 | # Define connection function 23 | def sqlalchemy_cnxn(driver, server, db): 24 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}" 25 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection}) 26 | engine = create_engine(url) 27 | return engine 28 | 29 | 30 | engine = sqlalchemy_cnxn(driver, server, database) 31 | 32 | list_of_views = "SELECT name FROM sys.views" 33 | 34 | my_server_views = pd.read_sql(list_of_views, engine) 35 | list_of_sql_views = sorted(my_server_views["name"].to_list()) 36 | list_of_sql_views = [ 37 | x for x in list_of_sql_views if x != "DailySensorReadings" 38 | ] 39 | # I had one table with 50M + rows that was causing performance issues, I removed it here 40 | 41 | for view in list_of_sql_views: 42 | try: 43 | query = f"SELECT TOP 100 * FROM {database}.{schema}.{view}" 44 | results = engine.execute(query) 45 | df = pd.read_sql(query, engine) 46 | if len(df) > 0: 47 | df.to_csv(f"{view}.csv") 48 | else: 49 | pass 50 | except Exception: 51 | print(f"failed to generate data for view {view}") 52 | -------------------------------------------------------------------------------- /Python/read_sql_server_write_snowflake.py: -------------------------------------------------------------------------------- 1 | """Script to read data from SQL Server and write it to Snowflake 2 | By: Martin Palkovic 3 | Date: 2022-09-14 4 | Description: For a work task, I needed to add some historical exchange rate data 5 | to Snowflake for analytical reporting. This data existed on SQL server, so I wrote this 6 | Python script to read the data from SQL Server, transform it, and load it into 7 | Snowflake. I've modified this as a minimum reproducable example for the purposes of my 8 | project portfolio. 
9 | """ 10 | 11 | # Step 1: Read data from SQL Server 12 | 13 | # import modules 14 | import os 15 | import pyodbc 16 | import pandas as pd 17 | 18 | from snowflake import connector 19 | from dotenv import load_dotenv 20 | load_dotenv() 21 | 22 | # set all rows and columns visible 23 | # pd.set_option('display.max_columns', None) 24 | # pd.set_option('display.max_rows', None) 25 | 26 | # server credentials 27 | server = "my_server" 28 | database = "my_database" 29 | 30 | # sql connection 31 | cnxn = pyodbc.connect( 32 | Trusted_Connection="Yes", Driver="{SQL Server}", Server=server, Database=database 33 | ) 34 | cursor = cnxn.cursor() 35 | 36 | # stick your query inside the triple quotes 37 | query = """select * from DATABASE.SCHEMA.EXCHANGERATES 38 | where EXCHDATE > '2021-09-03' and EXCHDATE < '2021-09-09' 39 | order by EXCHDATE asc""" 40 | 41 | # load query to dataframe 42 | df_fx = pd.read_sql(query, cnxn) 43 | print(df_fx.dtypes) 44 | 45 | # -------------------------------------------------------- 46 | 47 | # Step 2: Create a dataframe that matches the Snowflake table we are inserting to 48 | df_sf = pd.DataFrame() 49 | 50 | # Create the from and to currency columns 51 | df_sf[["FROM_CURRENCY", "TO_CURRENCY"]] = df_fx["EXCHANGE_ID"].str.split( 52 | "-", n=1, expand=True 53 | ) 54 | df_sf = df_sf[ 55 | ~df_sf["TO_CURRENCY"].str.contains("AVG") 56 | ] # drops rows that show avg - there are some GBP AVG 57 | 58 | # Create the start and stop date columns 59 | df_sf["EFFECTIVE_START"] = df_fx["EXCHDATE"].dt.strftime("%Y-%m-%d %H:%M:%S.%f") 60 | df_sf["EFFECTIVE_STOP"] = ( 61 | df_fx["EXCHDATE"] + pd.DateOffset(days=7, hours=23, minutes=59) 62 | ).dt.strftime("%Y-%m-%d %H:%M:%S.%f") 63 | 64 | # Exchange Rate 65 | df_sf["RATE"] = df_fx["XCHGRATE"] 66 | 67 | # Get current datetime 68 | df_sf["STAGE_DATE"] = pd.Timestamp.now() 69 | 70 | # strip all whitespace from every field 71 | df_sf = df_sf.apply(lambda x: x.str.strip() if x.dtype == "object" else x) 72 | 73 | # diagnostic check...number of rows, data types etc.
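# --- Optional aside (not part of the original script) ----------------------
# A minimal sketch of a reusable sanity check before writing to Snowflake:
# it raises if the transformed frame is empty or missing expected columns.
# The function name and the default column list are illustrative assumptions
# only, and the helper is not called in the steps below.
def validate_frame(frame, required_columns=("FROM_CURRENCY", "TO_CURRENCY", "RATE")):
    """Raise ValueError if the transformed frame is empty or missing columns."""
    missing = [col for col in required_columns if col not in frame.columns]
    if frame.empty or missing:
        raise ValueError(
            f"Frame not ready to load; empty={frame.empty}, missing={missing}"
        )
# ----------------------------------------------------------------------------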
74 | print("Number of rows:", len(df_sf)) 75 | print(df_sf.dtypes) 76 | # print(df_sf.head()) 77 | df_sf.to_csv("FXRates.csv", header=False, index=False) 78 | 79 | # ------------------------------------------------------------------ 80 | # Step 3: Write data to Snowflake 81 | # Establish connection to Cooke Snowflake 82 | cnxn = connector.connect( 83 | user=os.getenv("SNOWFLAKE_USER"), 84 | password=os.getenv("SNOWFLAKE_PASSWORD"), 85 | account=os.getenv("SNOWFLAKE_ACCT"), 86 | role=os.getenv("SNOWFLAKE_ROLE"), 87 | warehouse="REPORTING_WH", 88 | ) 89 | # assign csv to variable 90 | csv = r"\FXRates.csv.csv" 91 | staged_file = os.path.basename(csv) + ".gz" 92 | 93 | # execute write operations 94 | cursor = cnxn.cursor() 95 | cursor.execute("use database STAGING_DEV;") 96 | cursor.execute("use schema MY_SCHEMA;") 97 | cursor.execute("create or replace stage FX_RATES;") 98 | cursor.execute(f"put file://{csv} @FX_RATES;") 99 | cursor.execute( 100 | f"""copy into CURRENCY_EXCHANGE_RATES(FROM_CURRENCY, 101 | TO_CURRENCY, 102 | EFFECTIVE_START, 103 | EFFECTIVE_STOP, 104 | RATE, 105 | STAGE_DATE) 106 | from @FX_RATES/{staged_file} 107 | file_format = (type = CSV)""" 108 | ) 109 | cursor.execute('rm @MY_SCHEMA.FX_RATES pattern = ".*FX_RATES.*";') 110 | 111 | cursor.close() 112 | cnxn.close() 113 | -------------------------------------------------------------------------------- /Python/sql_insert_statement_from_csv.py: -------------------------------------------------------------------------------- 1 | """Generate a SQL insert statement from a csv file 2 | By: Martin Palkovic 3 | Date: 2022-03-14""" 4 | 5 | import pandas as pd 6 | 7 | # Filepath for the csv 8 | df = pd.read_csv("my_file.csv") 9 | 10 | # In my case I only wanted after row 1022 11 | df = df.iloc[1022:] 12 | 13 | # There are some weird unicode characters in the excel sheet I received, 14 | # I removed them with this for loop: 15 | for column in df.columns: 16 | df[column] = df[column].str.split().str.join(" ") 17 | 18 | 19 | # Define Function 20 | def sql_insert_statement_from_dataframe(source, target): 21 | """This function generates a SQL insert statement""" 22 | for index, row in source.iterrows(): 23 | # full insert statement: 24 | print( 25 | "insert into " 26 | + target 27 | + "(" 28 | + str(", ".join(source.columns)) 29 | + ") values " 30 | + str(tuple(row.values)) 31 | + ";" 32 | ) 33 | 34 | 35 | # Execute Function 36 | sql_insert_statement_from_dataframe(df, "database.schema.table") 37 | """ 38 | #Full insert statement: 39 | insert into database.schema.table(code, 40 | expense_type, 41 | acct, 42 | company) 43 | values ('02113', 44 | 'Accounts Receivable, 45 | Other', 46 | '35400', 47 | 'An_Awesome_Company'); 48 | 49 | insert into database.schema.table(code, 50 | expense_type, 51 | acct, 52 | company) 53 | values ('02114', 54 | 'Accounts Payable', 55 | '36500', 56 | 'A_Different_Company'); 57 | insert into database.schema.table(code, 58 | expense_type, 59 | acct, 60 | company) values ('02115', 61 | 'Donations', 62 | '12220', 63 | 'Another_Company'); 64 | 65 | #just the values: 66 | ('02113', 'Accounts Receivable, Other', '35400', 'An_Awesome_Company'), 67 | ('02114', 'Accounts Payable', '36500', 'A_Different_Company'), 68 | ('02115', 'Donations', '12220', 'Another_Company'), 69 | """ 70 | -------------------------------------------------------------------------------- /Python/sql_style_join_csv.py: -------------------------------------------------------------------------------- 1 | """Performing a SQL style join on two csv 
files 2 | By: Martin Palkovic 3 | Date: 2022-02-11 4 | 5 | Description: The inventory team is producing Excel sheets on a weekly basis 6 | and would like to move comments from one sheet to another. Inventory goes out, 7 | new inventory comes in, and they want the comments transfered on items that are 8 | still in stock. I wasn't sure how to do this in SQL without making new tables 9 | in the database and decided to use Python. 10 | 11 | Note that this program is specific to a workflow I do for the Inventory team, 12 | and you cant really make a one size fits all program for this task since you 13 | need to specify which fields you want to join. But hopefully it will give you 14 | an idea of how to do this if you encounter a similar task 15 | """ 16 | 17 | import os 18 | import pandas as pd 19 | 20 | old_csv = input("Enter filepath for the old csv: ") 21 | while not os.path.isfile(old_csv): 22 | print("Error: that is not a valid file, try again...") 23 | old_csv = input("Enter filepath for the old csv: ") 24 | 25 | new_csv = input("Enter filepath for the new csv: ") 26 | while not os.path.isfile(new_csv): 27 | print("Error: that is not a valid file, try again...") 28 | new_csv = input("Enter filepath for the new csv: ") 29 | 30 | try: 31 | df_old = pd.read_csv(old_csv, low_memory=False) 32 | df_new = pd.read_csv(new_csv, low_memory=False) 33 | 34 | # makes all column names lower case, ensuring they meet the join criteria 35 | # i.e if the user capitalizes one of the column names one week but not the next, 36 | # it doesn't matter with this block of code 37 | df_old.columns = map(str.lower, df_old.columns) 38 | df_new.columns = map(str.lower, df_new.columns) 39 | 40 | # removes any whitespace from the column names 41 | df_old = df_old.rename(columns=lambda x: x.strip()) 42 | df_new = df_new.rename(columns=lambda x: x.strip()) 43 | 44 | df_old = df_old.loc[:, df_old.columns.isin(["columns_you_want_to_keep"])] 45 | df_old = df_old.reset_index(drop=True) 46 | 47 | df_new = df_new.loc[:, ~df_new.columns.isin(["columns_you_want_to_keep"])] 48 | df_new = df_new.reset_index(drop=True) 49 | 50 | df = pd.merge( 51 | df_new, 52 | df_old.drop_duplicates(subset=["pallet"]), 53 | how="left", 54 | on=["pallet"], 55 | suffixes=("", "_drop"), 56 | ) 57 | 58 | df = df.drop([c for c in df.columns if "drop" in c], axis=1) 59 | df.columns = map(str.capitalize, df.columns) 60 | 61 | file_name = input("Enter your file name (dont add the .csv extension): ") 62 | df.to_csv("{}.csv".format(file_name)) 63 | 64 | except BaseException as exception: 65 | print(f"An exception occurred: {exception}") 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Portfolio 2 | 3 | 6 | 7 | [![Ruff](https://github.com/MartyC-137/Data-Engineering/actions/workflows/ruff.yml/badge.svg)](https://github.com/MartyC-137/Data-Engineering/actions/workflows/ruff.yml) 8 | [![SQLFluff](https://github.com/MartyC-137/Data-Engineering/actions/workflows/sqlfluff.yml/badge.svg)](https://github.com/MartyC-137/Data-Engineering/actions/workflows/sqlfluff.yml) 9 | 10 | --- 11 | 12 | ### Introduction 13 | 14 | This repository contains numerous work examples of code I use in my day to day work as a data engineer, all of which has been modified as minimum reproducible examples. My favourite tools are Snowflake, Python, and dbt, and I also have an interest in DevOps as it pertains to data engineering. 15 | 16 |
17 | Python  18 | Snowflake  19 | dbt  20 |
21 | 22 | [![Linkedin Badge](https://img.shields.io/badge/-Martin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/mpalkovic/) 23 | [![Resume Badge](https://img.shields.io/badge/-Resume-blue?style=flat&logo=Resume&logoColor=white)](https://my.visualcv.com/martin-palkovic/) 24 | 25 | ### Table of Contents 26 | * [Python Examples](https://github.com/MartyC-137/Data-Engineering/tree/main/Python) 27 | - [Snowpark example - backload data from SQL Server](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Snowpark_Example_Backload_Data.py) 28 | - [Snowpark example - backload data from API](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Snowpark_Backload_API_Data.py) 29 | - [Automated SQL insert statements from a CSV file](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Generate_SQL_Insert_Statements_From_CSV.py) 30 | - [Extract data from SQL Server, transform, and load to Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Read_SQLServer_Write_Snowflake.py) 31 | - [Batch load JSON files to Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/LoadJSONToSnowflake.py) 32 | - [SQL Server data Pull - 100 Records from every view in a database](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Pull_records_for_all_SQL_tables_in_db.py) 33 | * [SQL Examples](https://github.com/MartyC-137/Data-Engineering/tree/main/SQL) 34 | - [Only grant permissions on tables with > 0 rows of data - Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_ForLoop_GrantPermissions.sql) 35 | - [Auto Ingest Snowpipe from Azure Blob to Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql) 36 | - [Shorten large union queries using Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Shorten_Huge_Union_Queries.sql) 37 | - [Basic Snowflake CDC Pipeline using Streams and Tasks](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Basic_CDC_Pipeline_Using_Streams_Tasks.sql) 38 | - [Find missing dates in a date field - Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Find_Missing_Dates.sql) 39 | - [Snowflake data pipeline from internal stage](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Data_Pipeline_From_Internal_Stage.sql) 40 | * [Snowflake CI/CD using Azure Pipelines - SQLFluff testing, build and deploy using SnowSQL](https://github.com/MartyC-137/Data-Engineering/tree/main/SnowSQL_CICD) 41 | * [SQLFluff and yamllint pipelines for a dbt project](https://github.com/MartyC-137/Data-Engineering/tree/main/CI_Examples) 42 | 43 | --- 44 | 45 | ### Usage 46 | 47 | ```bash 48 | # Clone the repository 49 | $ git clone https://github.com/MartyC-137/Data-Engineering.git 50 | 51 | # Connect to the repository 52 | $ cd Data-Engineering 53 | ``` 54 | -------------------------------------------------------------------------------- /SQL/Load_CSV_to_Snowflake/PUT.sql: -------------------------------------------------------------------------------- 1 | !set variable_substitution=true; 2 | put file://&{csv_path} @~&{stage} auto_compress=true; 3 | -------------------------------------------------------------------------------- /SQL/Load_CSV_to_Snowflake/Snowflake_Worksheet_Load_CSV.sql: -------------------------------------------------------------------------------- 1 | /*****************************************************/ 2 | -- 
Worksheet: Loading a local csv to a Snowflake table 3 | -- Date: 2022-12-08 4 | /*****************************************************/ 5 | 6 | /* Set session variables 7 | Enter the relevant database, schema, table and file format names here 8 | */ 9 | set role_name = 'sysadmin'; 10 | set wh = 'reporting_wh'; 11 | set db = 'my_new_db'; 12 | set sch = 'my_schema'; 13 | set table_name = 'my_table'; 14 | set fileformat = 'my_file_format'; 15 | set stage_name = 'my_stage'; 16 | 17 | /* initialize session */ 18 | -- role, warehouse 19 | use role identifier($role_name); 20 | use warehouse identifier($wh); 21 | 22 | -- database 23 | create database if not exists identifier($db); 24 | use database identifier($db); 25 | 26 | -- schema 27 | create schema if not exists identifier($sch); 28 | use schema identifier($sch); 29 | 30 | -- file format 31 | create file format if not exists identifier($fileformat) 32 | type = csv 33 | field_delimiter = ',' 34 | empty_field_as_null = true 35 | skip_header = 1 36 | comment = 'file format for loading csv files to Snowflake'; 37 | 38 | -- stage 39 | create stage if not exists identifier($stage_name) 40 | file_format = $fileformat; --this may need to be typed out 41 | show stages; 42 | 43 | -- table; 44 | create table if not exists identifier($table_name) ( 45 | field1 varchar, 46 | field2 number 47 | ); 48 | 49 | /* the PUT command must be executed in the SnowSQL CLI! 50 | See the following documentation on this topic: 51 | https://docs.snowflake.com/en/user-guide/snowsql-install-config.html 52 | https://docs.snowflake.com/en/user-guide/data-load-internal-tutorial.html 53 | 54 | download link: https://developers.snowflake.com/snowsql/ 55 | put file://c:\your\filepath\here\my_file.csv; 56 | */ 57 | 58 | /* confirm that the PUT command worked */ 59 | list @my_stage; 60 | 61 | copy into identifier($table_name) 62 | from @my_stage/my_file.csv.gz --variables dont work in conjunction with the @ argument 63 | file_format = (format_name = $fileformat) 64 | on_error = 'skip_file'; 65 | 66 | -- confirm the COPY INTO command worked 67 | select * from identifier($table_name); 68 | -------------------------------------------------------------------------------- /SQL/Load_CSV_to_Snowflake/snowsql.sh: -------------------------------------------------------------------------------- 1 | snowsql -c dev -s my_schema -f PUT.sql -D csv_path=your_csv_path\your_csv.csv -D stage=my_stage -------------------------------------------------------------------------------- /SQL/Snowflake_Account_Setup.sql: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | -- Script: Account Setup in Snowflake 3 | -- CreateBy: Martin Palkovic 4 | -- Create date: 2022-11-01 5 | -- Description: Script to set up a warehouse, 6 | -- role and user with basic privileges 7 | /******************************************************************************/ 8 | 9 | /* Set session variables*/ 10 | set role_name = 'my_role'; 11 | set user_name = 'my_user'; 12 | set wh_name = 'my_warehouse'; 13 | set db_name = 'my_db'; 14 | 15 | /* Create warehouse for service account */ 16 | use role sysadmin; 17 | create or replace warehouse identifier($wh_name) 18 | warehouse_size = xsmall 19 | auto_suspend = 60 20 | auto_resume = true 21 | min_cluster_count = 1 22 | max_cluster_count = 5 23 | scaling_policy = standard 24 | comment = 'Warehouse for service account to query the Snowflake API'; 25 | 26 | /* Create role 
*/ 27 | use role securityadmin; 28 | create or replace role identifier($role_name) 29 | comment = 'Default role for service account my_user'; 30 | 31 | /* Create user */ 32 | use role accountadmin; 33 | create or replace user identifier($user_name) 34 | login_name = $user_name 35 | display_name = $user_name 36 | password = '********************' 37 | must_change_password = false 38 | default_role = $role_name 39 | default_warehouse = $wh_name 40 | comment = 'Service account for application to query the Snowflake API'; 41 | 42 | /* grant account permissions */ 43 | grant role identifier($role_name) to user identifier($user_name); 44 | grant usage on warehouse identifier($wh_name) to role identifier($role_name); 45 | grant usage on database identifier($db_name) to role identifier($role_name); 46 | grant usage on all schemas in database identifier($db_name) to role identifier( 47 | $role_name 48 | ); 49 | grant select on all tables in database identifier($db_name) to role identifier( 50 | $role_name 51 | ); 52 | 53 | /* Future Grants */ 54 | grant select on future tables in database identifier( 55 | $db_name 56 | ) to role identifier($role_name); 57 | grant usage on future schemas in database identifier( 58 | $db_name 59 | ) to role identifier($role_name); 60 | 61 | /* Confirm access is correct */ 62 | show grants to role identifier($role_name); 63 | 64 | show grants of role identifier($role_name); 65 | show grants to user identifier($user_name); 66 | -------------------------------------------------------------------------------- /SQL/Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql: -------------------------------------------------------------------------------- 1 | /**********************************************************************/ 2 | -- Title: Azure Blob Snowpipe setup 3 | -- By: Martin Palkovic 4 | -- Date: 2022-11-09 5 | -- Description: Snowflake set up of an auto-ingest snowpipe from Azure Blob Storage to Snowflake table. 6 | -- Documentation: https://docs.snowflake.com/en/user-guide/data-load-snowpipe-auto-azure.html 7 | /*********************************************************************/ 8 | 9 | /* Set session variables */ 10 | set session_role = 'sysadmin'; 11 | set session_warehouse = 'reporting_wh'; 12 | set session_database = 'dev'; 13 | set session_table = 'my_table'; 14 | set project_name = 'MY_PROJECT'; 15 | set storage_loc = 'azure://your_blob_account_here.blob.core.windows.net/my_project'; 16 | set tenant_id = 'a123b4c5-1234-123a-a12b-1a23b45678c9'; -- example tenant id from Snowflake docs 17 | 18 | /* Initialize Environment */ 19 | use role identifier($session_role); 20 | use warehouse identifier($session_warehouse); 21 | use database identifier($session_database); 22 | 23 | create schema if not exists identifier($project_name); 24 | use schema identifier($project_name); 25 | 26 | /* Create storage integration for Snowflake to connect to Azure Blob. 27 | See the 'Configuring Secure Access to Cloud Storage' section in the url above*/ 28 | create storage integration if not exists identifier($project_name) 29 | type = external_stage 30 | storage_provider = 'AZURE' 31 | enabled = true 32 | azure_tenant_id = $tenant_id 33 | storage_allowed_locations = ($storage_loc) 34 | comment = 'Storage Integration for moving my_project data into Snowflake'; 35 | 36 | /* The output of this command is needed for setup in the Azure Portal */ 37 | desc storage integration identifier($project_name); 38 | 39 | /* Create notification integration to connect Snowflake to Azure Event Grid. 
40 | See Step 2 of 'Configuring Automation With Azure Event Grid'*/ 41 | create notification integration if not exists identifier($project_name) 42 | enabled = true 43 | type = queue 44 | notification_provider = azure_storage_queue 45 | azure_storage_queue_primary_uri = '' 46 | azure_tenant_id = $tenant_id 47 | comment = 'Notification Integration for moving my_project data into Snowflake'; 48 | 49 | /* The output of this command is needed for setup in the Azure Portal */ 50 | desc notification integration identifier($project_name); 51 | 52 | /* Create a Snowflake stage */ 53 | create stage if not exists identifier($project_name) 54 | url = $storage_loc 55 | storage_integration = $project_name 56 | comment = 'Staging area for my_project data, between Azure Blob and Snowflake'; 57 | 58 | -- show stages; 59 | 60 | /* Create a Snowpipe that will be notified via Azure Event Grid 61 | when a file is added to the Azure Blob instance specified above*/ 62 | create pipe if not exists identifier($project_name) 63 | auto_ingest = true 64 | integration = $project_name 65 | as 66 | copy into $session_table 67 | from @$project_name 68 | file_format = (type = 'csv') 69 | comment = 'Auto Ingest Snowpipe for moving data from Azure Blob to Snowflake. When a file is added to 70 | Azure Blob, this Snowpipe will automatically trigger'; 71 | -------------------------------------------------------------------------------- /SQL/Snowflake_Basic_CDC_Pipeline_Using_Streams_Tasks.sql: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | -- Script: Basic CDC Pipeline using Streams and Tasks in Snowflake 3 | -- CreateBy: Martin Palkovic 4 | -- Create date: 2022-11-01 5 | -- Description: Basic implementation of a Streams/Tasks workflow in Snowflake. 
6 | -- Streams detect DML changes to one table and will update another table based 7 | -- on those changes 8 | /******************************************************************************/ 9 | 10 | /* Set session variables */ 11 | set role_name = 'sysadmin'; 12 | set wh = 'my_wh'; 13 | set db = 'my_db'; 14 | set schema_name = 'my_schema'; 15 | set dest_table = 'my_table'; 16 | set stream_name = 'my_stream'; 17 | set source_table = 'staging_db.staging_schema.staging_table'; 18 | set proc_name = 'my_procedure'; 19 | set task_name = 'push_my_table'; 20 | 21 | /* Initialize Environment */ 22 | use role identifier($role_name); 23 | use warehouse identifier($wh); 24 | 25 | create database if not exists identifier($db); 26 | create schema if not exists identifier($schema_name); 27 | 28 | use database identifier($db); 29 | use schema identifier($schema_name); 30 | 31 | create table if not exists identifier($dest_table) 32 | comment = 'JSON data from API, streaming from the staging database' 33 | clone identifier($source_table); 34 | 35 | create stream if not exists identifier($stream_name) on table identifier($source_table) 36 | comment = 'CDC stream from staging table to prod table'; 37 | 38 | /* quick diagnostic check */ 39 | show streams; 40 | select * from identifier($stream_name); 41 | 42 | create or replace procedure identifier($proc_name)() 43 | returns varchar 44 | language sql 45 | execute as owner 46 | as 47 | $$ 48 | begin 49 | merge into my_table DEST using ( 50 | select * from my_stream 51 | qualify row_number() over ( 52 | partition by json_data:ID order by insert_date) = 1 53 | ) SOURCE 54 | on DEST.json_data:ID = SOURCE.json_data:ID 55 | when matched and metadata$action = 'INSERT' then 56 | update set DEST.json_data = SOURCE.json_data, 57 | DEST.insert_date = current_timestamp() 58 | when not matched and metadata$action = 'INSERT' then 59 | insert (DEST.json_data, DEST.insert_date) 60 | values(SOURCE.json_data, current_timestamp()); 61 | return 'CDC records successfully inserted'; 62 | end; 63 | $$; 64 | 65 | create or replace task identifier($task_name) 66 | warehouse = LOAD_WH 67 | schedule = '1 minute' 68 | comment = 'Change data capture task that pulls over new data once a minute' 69 | when system$stream_has_data ('my_stream') 70 | as 71 | call my_procedure(); 72 | 73 | /* grant execute task priveleges to role sysadmin */ 74 | set role_name = 'accountadmin'; 75 | use role identifier($role_name); 76 | grant execute task on account to role identifier($role_name); 77 | 78 | /* tasks are created in a suspended state by default, you must 'resume' them to schedule them */ 79 | set role_name = 'sysadmin'; 80 | use role identifier($role_name); 81 | alter task identifier($task_name) resume; 82 | 83 | select * from identifier($my_table); 84 | 85 | show tasks; 86 | select * from table(information_schema.task_history()) order by SCHEDULED_TIME; 87 | -------------------------------------------------------------------------------- /SQL/Snowflake_Clean_Staging_Area.sql: -------------------------------------------------------------------------------- 1 | /*******************************************************************/ 2 | -- Procedure: sp_clean_stage 3 | -- Created By: Martin Palkovic 4 | -- Create date: 2022-08-16 5 | -- Organization: Cooke Inc. 6 | -- Summary: Delete files from a named Snowflake staging area 7 | -- Description: In data pipelines, we sometimes stick files in a named 8 | -- Snowflake internal staging area - occasionally, you'll want to purge the 9 | -- files from here. 
Append this stored procedure call as the last step in your pipeline 10 | -- to keep your staging area clean 11 | /*******************************************************************/ 12 | use warehouse REPORTING_WH; 13 | use database STAGING_DEV; 14 | use schema NS_LANDING; 15 | 16 | create or replace procedure sp_clean_stage( 17 | stage_name varchar, DAYS number, DRY_RUN boolean 18 | ) 19 | returns varchar 20 | language sql 21 | execute as caller 22 | as 23 | $$ 24 | declare 25 | ListFiles resultset; 26 | LastModified date; 27 | RemovedCount number := 0; 28 | TotalCount number := 0; 29 | begin 30 | ListFiles := (execute immediate 'ls @' || stage_name ); 31 | let C1 cursor for ListFiles; 32 | for files in C1 do 33 | TotalCount := TotalCount + 1; 34 | LastModified := to_date(left( files."last_modified", length(files."last_modified") - 4 ), 'DY, DD MON YYYY HH24:MI:SS' ); 35 | if (LastModified <= dateadd( 'day', -1 * days, current_timestamp())) then 36 | RemovedCount := RemovedCount + 1; 37 | if (not dry_run) then 38 | execute immediate 'rm @' || files."name"; 39 | end if; 40 | end if; 41 | end for; 42 | return RemovedCount || ' of ' || TotalCount || ' files ' || iff(dry_run,'will be','were') || ' deleted.'; 43 | end; 44 | $$; 45 | 46 | -- Run Stored Procedure 47 | -- use database my_db; 48 | -- call sp_clean_stage('my_stage', 14, false); 49 | -------------------------------------------------------------------------------- /SQL/Snowflake_Cloning.sql: -------------------------------------------------------------------------------- 1 | /* How to clone data in Snowflake 2 | By: Martin Palkovic 3 | Date: 2022-06-10 4 | 5 | Description: Zero copy cloning is one of the awesome features of Snowflake. 6 | I like to use this feature to quickly create a development environment for 7 | testing */ 8 | 9 | use role sysadmin; 10 | use warehouse reporting_wh; 11 | use database production; 12 | use schema dbo; 13 | 14 | /* clone database */ 15 | create database my_cloned_db clone my_db; 16 | 17 | /* clone schema */ 18 | create schema my_cloned_schema clone analytics_inventory; 19 | 20 | /* clone table */ 21 | create table my_cloned_table clone main_inventory_table; 22 | 23 | /* cloning with time travel */ 24 | create or replace table my_cloned_table clone main_inventory_table 25 | at (timestamp => '2022-06-10 9:30') 26 | -------------------------------------------------------------------------------- /SQL/Snowflake_Data_Pipeline_From_Internal_Stage.sql: -------------------------------------------------------------------------------- 1 | /**********************************************************************************************************/ 2 | -- Proc: Basic data pipeline from Snowflake internal stage 3 | -- CreateBy: Martin Palkovic 4 | -- Create date: 2022-10-31 5 | -- Description: Basic workflow for building the latter portions of a data pipeline within Snowflake. 
6 | -- Note that this code assumes you have loaded a csv file into a Snowflake internal stage via a 7 | -- 3rd party or open source integration tool 8 | /***********************************************************************************************************/ 9 | 10 | /* initialize environment */ 11 | use role sysadmin; 12 | use warehouse reporting_wh; 13 | use database my_dev_database; 14 | use schema my_schema; 15 | 16 | /* Provides information for your third party/open source integration tool */ 17 | desc table dimcustomer; 18 | 19 | /* create stage, if needed */ 20 | show stages; 21 | -- create or replace my_stage 22 | list @my_stage; 23 | 24 | /* create file format */ 25 | create or replace file format my_file_format 26 | type = 'CSV' 27 | field_delimiter = ',' 28 | replace_invalid_characters = true 29 | null_if = (''); 30 | 31 | /* create stored procedure */ 32 | create or replace procedure dim_customer_pipeline() 33 | returns varchar 34 | language sql 35 | execute as caller 36 | as 37 | $$ 38 | begin 39 | truncate table MY_SCHEMA.DIMCUSTOMER; 40 | 41 | copy into 42 | MY_SCHEMA.DIMCUSTOMER 43 | from 44 | ( select t1.$1 45 | ,t1.$2 46 | ,t1.$3 47 | ,nullif(t1.$4, '') 48 | from @MY_SCHEMA.MY_STAGE/Dim_Customer.csv.gz (file_format => 'my_file_format') t1 49 | ) 50 | file_format=my_file_format ON_ERROR='SKIP_FILE'; 51 | 52 | remove @MY_SCHEMA.MY_STAGE pattern='.*Customer.*'; 53 | 54 | return 'Successfully loaded data into MY_DEV_DATABASE.MY_SCHEMA.DIMCUSTOMER'; 55 | end; 56 | $$; 57 | 58 | /* create task */ 59 | create or replace task dim_customer 60 | warehouse = load_wh 61 | schedule = 'using cron 30 9 * * * UTC' 62 | comment 63 | = 'Truncates MY_DEV_DATABASE.MY_SCHEMA.DIMCUSTOMER, loads all rows of the dimcustomer table from Azure SQL and deletes the csv from the staging area' 64 | as 65 | call dim_customer_pipeline(); 66 | 67 | /* grant execute task priveleges to role sysadmin */ 68 | use role accountadmin; 69 | grant execute task on account to role sysadmin; 70 | 71 | /* tasks are created in a suspended state by default, you must 'resume' them to schedule them */ 72 | use role sysadmin; 73 | alter task dim_customer resume; 74 | 75 | /* confirm that the tasks are working */ 76 | show tasks; 77 | select * from table(information_schema.task_history()) order by scheduled_time; 78 | -------------------------------------------------------------------------------- /SQL/Snowflake_Find_Duplicates.sql: -------------------------------------------------------------------------------- 1 | select * from my_table 2 | qualify count(*) over (partition by primary_key) > 1; 3 | -------------------------------------------------------------------------------- /SQL/Snowflake_Find_Missing_Dates.sql: -------------------------------------------------------------------------------- 1 | /* Query: find missing dates in a range of dates 2 | By: Martin Palkovic 3 | Date: 2022-08-19 4 | System: Snowflake 5 | Description: Say, for example, you have a report, and there is data missing for certain dates 6 | on that report. 
You can use this query to identify dates where you may have missing data 7 | */ 8 | 9 | use role sysadmin; 10 | use warehouse my_warehouse; 11 | use database my_db; 12 | use schema my_schema; 13 | 14 | with find_date_gaps (rownum, my_date_field) as ( 15 | select 16 | my_date_field, 17 | row_number() over (order by my_date_field asc) as rownum 18 | from your_table 19 | where my_date_field > 'yyyy-mm-dd' 20 | group by my_date_field 21 | ) 22 | 23 | select 24 | dateadd(dd, 1, fdg1.my_date_field) as startofgap, 25 | dateadd(dd, -1, fdg2.my_date_field) as endofgap 26 | from find_date_gaps as fdg1 27 | inner join find_date_gaps as fdg2 28 | on fdg1.rownum = (fdg2.rownum - 1) 29 | where datediff(dd, fdg1.my_date_field, dateadd(dd, -1, fdg2.my_date_field)) != 0; 30 | -------------------------------------------------------------------------------- /SQL/Snowflake_Flatten_JSON_Example.sql: -------------------------------------------------------------------------------- 1 | /**********************************************************************************************************/ 2 | -- Query: Flatten JSON to analytics view in Snowflake 3 | -- CreateBy: Martin Palkovic 4 | -- Create date: 2021-05-03 5 | -- Description: SQL code for creating a materialized view in Snowflake from a JSON in your staging area 6 | -- Modified by: 7 | -- Modify date: 8 | -- Mod Reason: 9 | /***********************************************************************************************************/ 10 | 11 | create or replace materialized view my_db.schema.my_view 12 | as 13 | select 14 | jsn.value:Id::string as id, 15 | jsn.value:TotalAmount::number(10, 2) as total_amount, 16 | jsn.value:Cash::boolean as cash, 17 | jsn.value:TransactionDate::date as transaction_date 18 | from staging_area.schema.my_table, 19 | lateral flatten(input => json_data) as jsn 20 | 21 | qualify row_number() 22 | over ( 23 | partition by jsn.value:Id 24 | order by jsn.value:Id 25 | ) 26 | = 1; 27 | 28 | /* 29 | Input: 30 | Row JSON_DATA 31 | 1 [{"Id": 1,"TotalAmount": 42.75, "Cash": true,"TransactionDate": "2022-03-25T18:44:46.54"}] 32 | 2 [{"Id":2, "TotalAmount": 57.99, "Cash": false, "TransactionDate": "2022-03-28T12:24:33.12"}] 33 | 3 [{"Id": 1,"TotalAmount": 42.75, "Cash": true,"TransactionDate": "2022-03-25T18:44:46.54"}] 34 | 4 [{"Id": 3, "TotalAmount": 100.25, "Cash": false, "TransactionDate": "2022-04-01T06:10:15.30"}] 35 | 36 | Output: 37 | ID Total_Amount Cash Transaction_Date 38 | 1 42.75 True 2022-03-25 39 | 2 57.99 False 2022-03-28 40 | 3 100.25 False 2022-04-01 41 | */ 42 | -------------------------------------------------------------------------------- /SQL/Snowflake_ForLoop_GrantPermissions.sql: -------------------------------------------------------------------------------- 1 | /* ######################### */ 2 | /* Script: Revoke/Grant permissions for reader accounts in Snowflake */ 3 | /* Author: Martin Palkovic */ 4 | /* Date: 2023-02-09 */ 5 | /* Description: This script loops through query results from the information_schema and grants privileges only to tables */ 6 | /* that have > 0 rows. This script was inspired by a database containing ~2,500 tables, 400 of which contained >= 1 row of data. */ 7 | /* This script revokes all privileges and then grants select on tables with > 0 rows. Modify your cursor queries as needed to provide a */ 8 | /* list of tables, schemas etc. to loop over. 
*/ 9 | 10 | -- Set session variables 11 | set db = 'my_db'; 12 | set rl = 'accountadmin'; 13 | set wh = 'my_wh'; 14 | set role_var = '"My_Role"'; --the double quotes are required as this is a case sensitive string value! 15 | set share_name = 'ab12345.my_secure_share'; 16 | 17 | -- Schemas to exclude. Set as desired, add as many as you need 18 | set exc1 = 'information_schema'; 19 | set exc2 = 'my_schema1'; 20 | 21 | use database identifier($db); 22 | use role identifier($rl); 23 | use warehouse identifier($wh); 24 | 25 | /* SHARE LEVEL - EXECUTED IN MAIN ACCOUNT */ 26 | -- Revoke privileges 27 | declare 28 | iter_schema cursor for (select * from information_schema.schemata where schema_name not in ($exc1, $exc2)); 29 | begin 30 | for s in iter_schema do 31 | execute immediate 'revoke select on all tables in schema ' || s.schema_name || ' from share identifier($share_name)'; 32 | end for; 33 | return 'Permissions successfully revoked from secure share!'; 34 | end; 35 | 36 | -- Add to share all tables that have > 0 rows 37 | declare 38 | iter_tables cursor for (select * from information_schema.tables 39 | where row_count > 0 and table_schema not in ($exc1, $exc2)); 40 | begin 41 | for t in iter_tables do 42 | execute immediate 'grant select on table ' || t.table_schema || '.' || t.table_name || ' to share identifier($share_name)'; 43 | end for; 44 | return 'Permissions successfully granted to secure share!'; 45 | end; 46 | 47 | /* SHARE LEVEL - EXECUTED IN READER ACCOUNT BY ADMIN */ 48 | -- Revoke privileges 49 | declare 50 | iter_schema cursor for (select * from information_schema.schemata where schema_name not in ($exc1, $exc2)); 51 | begin 52 | for s in iter_schema do 53 | execute immediate 'revoke select on all tables in schema ' || s.schema_name || ' from role identifier($role_var)'; 54 | end for; 55 | return 'Permissions successfully revoked!'; 56 | end; 57 | 58 | -- Grant only permissions on tables that have > 0 rows 59 | declare 60 | iter_tables cursor for (select * from information_schema.tables 61 | where row_count > 0 and table_schema not in ($exc1, $exc2)); 62 | begin 63 | for t in iter_tables do 64 | execute immediate 'grant select on table ' || t.table_schema || '.' || t.table_name || ' to role identifier($role_var)'; 65 | end for; 66 | return 'Permissions successfully granted!'; 67 | end; 68 | -------------------------------------------------------------------------------- /SQL/Snowflake_Merge_Into_Example.sql: -------------------------------------------------------------------------------- 1 | /* Title: Example MERGE INTO statement for incremental loading into Snowflake 2 | By: Martin Palkovic 3 | Date: 2022-10-20 4 | Description: With large datasets, you'll often want to implement an incremental load to 5 | improve performance in your data pipeline. The code below will prevent duplicates in your load, 6 | while only adding new records and updating existing records if changes exist. Note that this code excludes 7 | the database name from the full qualified table name - that is deliberate so that this code can be run against 8 | a development database first. The database name is set in the environment extensions of your pipeline tool. 9 | 10 | -- This is a minimum reproducible example of code I've used in production. 
11 | */ 12 | 13 | merge into 14 | my_schema.my_table as destination 15 | 16 | using ( 17 | select * 18 | from my_schema.my_staging_table 19 | qualify row_number() over ( 20 | partition by my_unique_sk 21 | order by created_date desc 22 | ) = 1 23 | ) as source 24 | on (source.my_unique_sk = destination.my_unique_sk) 25 | 26 | when matched then 27 | update 28 | set 29 | destination.my_unique_sk = source.my_unique_sk, 30 | destination.order_id = source.order_id, 31 | destination.ship_date = source.ship_date 32 | 33 | when not matched 34 | then insert 35 | ( 36 | my_unique_sk, 37 | order_id, 38 | ship_date 39 | ) 40 | values 41 | ( 42 | source.my_unique_sk, 43 | source.order_id, 44 | source.ship_date 45 | ); 46 | -------------------------------------------------------------------------------- /SQL/Snowflake_Python_Stored_Procedure_Example.sql: -------------------------------------------------------------------------------- 1 | /************************************************************************/ 2 | -- Script: Simple Python stored procedure in Snowflake 3 | -- Date: 2022-12-28 4 | -- Description: One thing I frequently do is compare one field to another, 5 | -- to determine if something exists in one dataset but not another. Does one table 6 | -- contain sales orders, pallet numbers, or report ID's that the other table 7 | -- does not? 8 | 9 | -- This stored procedure allows you to quickly determine that from within 10 | -- the Snowflake environment 11 | /************************************************************************/ 12 | 13 | use role sysadmin; 14 | use warehouse reporting_wh; 15 | use database dev; 16 | use schema my_schema; 17 | 18 | create or replace table mytable (amount number comment 'fake amounts for testing', fruits string comment 'fake types of fruit for testing'); 19 | create or replace table mytable2 like mytable; 20 | 21 | insert into mytable values (1, 'apple'), (2, 'orange'), (5, 'grape'), (7, 'cantelope'), (9, 'pineapple'), (17, 'banana'), (21, 'tangerine'); 22 | insert into mytable2 values (1, 'apple'), (3, 'orange'), (5, 'grape'), (7, 'strawberry'), (10, 'pineapple'), (17, 'banana'), (22, 'raspberry'); 23 | 24 | -- select * from mytable; 25 | -- select * from mytable2; 26 | 27 | create or replace procedure print_differences(TABLE1 string, TABLE2 string, FIELD1 string, FIELD2 string) 28 | returns array 29 | language python 30 | runtime_version = '3.8' 31 | packages = ('snowflake-snowpark-python', 'pandas') 32 | handler = 'print_differences' 33 | as 34 | $$ 35 | import pandas as pd 36 | 37 | def print_differences(session, table1: str,table2: str,field1: str,field2: str): 38 | 39 | #read the tables into a snowpark dataframe 40 | table1 = session.table(table1) 41 | table2 = session.table(table2) 42 | 43 | #convert to pandas 44 | df1 = table1.to_pandas() 45 | df2 = table2.to_pandas() 46 | 47 | # convert the the fields of interest from each table to a list 48 | list1 = df1[field1].to_list() 49 | list2 = df2[field2].to_list() 50 | 51 | return [item for item in list1 if item not in list2] 52 | $$; 53 | 54 | call print_differences('MYTABLE2', 'MYTABLE', 'FRUITS', 'FRUITS'); 55 | 56 | -- output: 57 | -- ["cantelope","tangerine"] 58 | -------------------------------------------------------------------------------- /SQL/Snowflake_Shorten_Huge_Union_Queries.sql: -------------------------------------------------------------------------------- 1 | /**********************************************************************/ 2 | -- Title: How to shorten a huge union query 3 | -- 
By: Martin Palkovic 4 | -- Date: 2022-11-25 5 | -- Description: Have you encountered a production small_sql query with a large number of unions, 6 | -- and very little changes between the queries except perhaps the database and/or schema name? 7 | -- In this example, you can loop over the COMPANY_NAME field in MY_TABLE to create 8 | -- one select statement per 'COMPANY_NAME', union them together, and return the results 9 | -- in one go. The first implementation of this at work reduced a 300 line query to ~ 40 lines! 10 | /*********************************************************************/ 11 | 12 | use role sysadmin; 13 | use warehouse my_wh; 14 | use database dev; 15 | 16 | -- Declare variables, loop over results of the 'organization' cursor variable 17 | declare 18 | small_sql varchar; 19 | big_sql varchar; 20 | organization cursor for (select COMPANY_NAME from MY_SCHEMA.MY_TABLE); 21 | my_results resultset; 22 | begin 23 | big_sql := ''; 24 | -- In Snowflake, $$ is a multi-line string delimiter 25 | for company in organization do 26 | small_sql := $$select 'COMPANY_NAME' as Company 27 | , GL.ACTNUM as Account_Number 28 | , ACT.DESCRIPTION as Account_Name 29 | from COMPANY_NAME.General_Ledger_Table GL 30 | 31 | inner join COMPANY_NAME.Account_Name_Table ACT 32 | on ACT.ID = GL.ID 33 | $$; 34 | small_sql := replace(small_sql, 'COMPANY_NAME', company.COMPANY_NAME); 35 | 36 | if(big_sql != '') then 37 | big_sql := big_sql || ' union all '; 38 | end if; 39 | 40 | big_sql := big_sql || small_sql; 41 | end for; 42 | 43 | my_results := (execute immediate :big_sql); 44 | return table(my_results); 45 | end; 46 | -------------------------------------------------------------------------------- /SQL/Snowflake_Time_Travel.sql: -------------------------------------------------------------------------------- 1 | /* Title: Snowflake Time Travel 2 | By: Martin Palkovic 3 | Date: 2022-06-07 4 | Description: Snowflake has great time travel functionality, were you can easily restore 5 | a table to its state at a previous point in time. I have used this functionality with 6 | great success when a production table with 2 million records was deleted on accident! 7 | */ 8 | 9 | show tables history; 10 | 11 | /* Note that you may need to rename the table */ 12 | alter table my_table rename to my_table_whoops; 13 | 14 | /* specify the time */ 15 | select 16 | acct_number, 17 | date 18 | from my_table at (timestamp => '2022-06-01 6:00'); 19 | 20 | /* specify an offset, ex. 1 hour ago*/ 21 | select 22 | acct_number, 23 | date 24 | from my_table at (offset => -60*60); --offset is in seconds here 25 | -------------------------------------------------------------------------------- /Shell/Create_gitignore_and_add_lines.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd /Users/johndoe/documents 3 | touch .gitignore 4 | echo '.env' >> .gitignore -------------------------------------------------------------------------------- /Shell/Microsoft.PowerShell_profile.ps1: -------------------------------------------------------------------------------- 1 | # This is an example of my Microsoft PowerShell profile. 
It sets up the Oh-My-Posh terminal theme, 2 | # and contains the following user defined functions: 3 | # PassGen: Generates random strong passwords 4 | # Create-OpenInVSCode: Creates and opens a file in VS Code using one simple command 5 | 6 | Set-Item -Path Env:TERMINAL_THEME -Value "https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/night-owl.omp.json" 7 | 8 | Import-Module Terminal-Icons 9 | 10 | Set-PSReadlineKeyHandler -Key Tab -Function MenuComplete 11 | 12 | oh-my-posh init pwsh --config $env:TERMINAL_THEME | Invoke-Expression 13 | 14 | # Password Generator 15 | function PassGen { 16 | param ( 17 | [int]$Length = 20 18 | ) 19 | 20 | $ValidCharacters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_-+=' 21 | $Password = '' 22 | 23 | for ($i = 0; $i -lt $Length; $i++) { 24 | $RandomIndex = Get-Random -Minimum 0 -Maximum $ValidCharacters.Length 25 | $Password += $ValidCharacters[$RandomIndex] 26 | } 27 | 28 | return $Password 29 | } 30 | 31 | # Alias for PassGen 32 | Set-Alias -Name pg -Value PassGen 33 | 34 | # --- 35 | 36 | # Create and open file in VS Code 37 | function Create-OpenInVSCode { 38 | param ( 39 | [Parameter(Mandatory = $true)] 40 | [String]$newfile 41 | ) 42 | 43 | code (new-item $newfile) 44 | } 45 | 46 | # Aliases for Create-OpenInVSCode 47 | Set-Alias -Name new-file -Value Create-OpenInVSCode 48 | Set-Alias -Name nf -Value Create-OpenInVSCode 49 | -------------------------------------------------------------------------------- /Shell/Pass_secret_at_runtime_to_py_script.ps1: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pwsh 2 | & {Set-Item Env:my_password "yoUr_str0Ng_paSswoRd_heRe"} | py myscript.py 3 | 4 | # Your Python script must contain the following: 5 | # import os 6 | # my_password = os.getenv('my_password') -------------------------------------------------------------------------------- /Shell/Search_specific_branch_name.ps1: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pwsh 2 | git branch -a | Select-String "string_youre_looking_for" -------------------------------------------------------------------------------- /Shell/Search_specific_branch_name.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | git branch -a | grep -i your_string_here -------------------------------------------------------------------------------- /Shell/create_gitignore_and_add_lines.ps1: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pwsh 2 | Set-Location ./Users/johndoe/documents/ 3 | New-Item .gitignore 4 | Add-Content .gitignore '.env' -------------------------------------------------------------------------------- /Shell/git_mv_multiple_files.ps1: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pwsh 2 | # An example shell script to 'git mv' multiple files at once 3 | 4 | # All files from one folder to new folder 5 | mkdir my_new_folder 6 | Set-Location ./folder_your_files_are_in 7 | foreach ($file in Get-ChildItem *.sql) { git mv $file.name .\my_new_folder } 8 | 9 | # Move all folders inside one folder to another folder 10 | mkdir my_new_folder 11 | Set-Location ./folder_your_files_are_in 12 | Get-ChildItem .\my_old_folder\ | % { git mv $_.FullName .\my_new_folder\ } -------------------------------------------------------------------------------- 
/Shell/run_all_python_files_in_dir.ps1: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env pwsh 2 | foreach ($file in Get-ChildItem -Path C:\your\directory\here\*.py) { 3 | python $file.FullName 4 | } -------------------------------------------------------------------------------- /Shell/run_groovy_script_in_Docker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Run Groovy script in Docker 3 | 4 | docker run --rm -v "$(pwd):/home/groovy/scripts" -w /home/groovy/scripts groovy:latest groovy your_script.groovy -------------------------------------------------------------------------------- /SnowSQL_CICD/build.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: jobName 3 | default: 'SnowflakeBuild' 4 | - name: jobDisplay 5 | default: 'Build artifacts for Snowflake deployment' 6 | - name: artifactName 7 | default: 'SnowflakeTest' 8 | - name: vmImage 9 | default: 'ubuntu-latest' 10 | - name: environmentName 11 | default: 'DEV' 12 | 13 | jobs: 14 | - job: ${{ parameters.jobName }} 15 | displayName: ${{ parameters.jobDisplay }} 16 | timeoutInMinutes: 10 17 | pool: 18 | vmImage: ${{ parameters.vmImage }} 19 | workspace: 20 | clean: outputs 21 | steps: 22 | # Publish artifacts 23 | - publish: $(System.DefaultWorkingDirectory) 24 | artifact: ${{ parameters.artifactName }} 25 | name: Artifacts 26 | displayName: Publish pipeline artifacts 27 | -------------------------------------------------------------------------------- /SnowSQL_CICD/deploy.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: jobName 3 | default: 'SnowflakeDeploy' 4 | - name: jobDisplay 5 | default: 'Deploy Snowflake Objects' 6 | - name: databaseName 7 | default: '' 8 | - name: vmImage 9 | default: 'ubuntu-latest' 10 | - name: environmentName 11 | default: 'DEV' 12 | 13 | jobs: 14 | - deployment: ${{ parameters.jobName }} 15 | displayName: ${{ parameters.jobDisplay }} 16 | timeoutInMinutes: 10 17 | pool: 18 | vmImage: ${{ parameters.vmImage }} 19 | environment: ${{ parameters.environmentName }} 20 | workspace: 21 | clean: outputs 22 | 23 | strategy: 24 | runOnce: 25 | deploy: 26 | steps: 27 | # Checkout repo 28 | - checkout: self 29 | fetchDepth: 10 30 | clean: true 31 | 32 | # Download and Install SnowSQL CLI 33 | - script: | 34 | curl -O https://sfc-repo.snowflakecomputing.com/snowsql/bootstrap/1.2/linux_x86_64/snowsql-1.2.9-linux_x86_64.bash 35 | SNOWSQL_DEST=~/snowflake SNOWSQL_LOGIN_SHELL=~/.profile bash snowsql-1.2.9-linux_x86_64.bash 36 | name: SnowSQLSetup 37 | displayName: Download and Install SnowSQL 38 | 39 | # Test SnowSQL Installation 40 | - script: ~/snowflake/snowsql -v 41 | name: TestSnowSQL 42 | displayName: Test SnowSQL Installation 43 | 44 | - script: | 45 | echo "All changes in this commit:" 46 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) 47 | name: detectingChanges 48 | displayName: 'Detecting changes' 49 | 50 | # Confirm Snowflake is properly connected 51 | - script: | 52 | # Test SnowSQL connection to our Snowflake instance 53 | ~/snowflake/snowsql -q "select current_account(), current_user(), current_role(), current_warehouse()" 54 | 55 | # Confirm that the pipeline is finding the changed SQL files 56 | files=$(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.sql$') 57 | 58 | echo "Changed files:" 59 | echo "$files" 60 | env: 
61 | SNOWSQL_ACCOUNT: $(SNOWSQL_ACCOUNT) 62 | SNOWSQL_USER: $(SNOWSQL_USER) 63 | SNOWSQL_PWD: $(SNOWSQL_PWD) 64 | SNOWSQL_ROLE: $(SNOWSQL_ROLE) 65 | name: TestSnowSQLConnection 66 | displayName: Test Snowflake Connection 67 | 68 | # Deploy code to Snowflake 69 | - script: | 70 | files=$(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.sql$') 71 | for file in $files; do 72 | echo "Deploying $file" 73 | ~/snowflake/snowsql -d ${{ parameters.databaseName }} -f $file 74 | done 75 | env: 76 | SNOWSQL_ACCOUNT: $(SNOWSQL_ACCOUNT) 77 | SNOWSQL_USER: $(SNOWSQL_USER) 78 | SNOWSQL_PWD: $(SNOWSQL_PWD) 79 | SNOWSQL_ROLE: $(SNOWSQL_ROLE) 80 | name: Deploy 81 | displayName: Deploy code to Snowflake 82 | -------------------------------------------------------------------------------- /SnowSQL_CICD/snowsql.yml: -------------------------------------------------------------------------------- 1 | # This pipeline uses the SnowSQL CLI to deploy code to Snowflake once it has been merged to main after PR approval. 2 | # Note that this is the 'parent' pipeline, which calls the build.yml and deploy.yml files 3 | # Note that this uses Azure DevOps flavoured YAML but could easily be modified to work with GitHub or GitLab 4 | 5 | name: Snowflake CD Pipeline 6 | 7 | variables: 8 | - group: SnowSQL 9 | - name: artifactName 10 | value: 'snowflakeTest' 11 | - name: vmImage 12 | value: 'ubuntu-latest' 13 | 14 | trigger: 15 | branches: 16 | include: 17 | - main 18 | 19 | stages: 20 | - stage: Build 21 | jobs: 22 | - template: build.yml 23 | parameters: 24 | jobName: 'BuildSnowflakeObjects' 25 | artifactName: $(artifactName) 26 | vmImage: $(vmImage) 27 | 28 | - stage: DEV 29 | variables: 30 | - name: database 31 | value: DEV 32 | - name: schema 33 | value: misc 34 | jobs: 35 | - template: deploy.yml 36 | parameters: 37 | jobName: DEV 38 | databaseName: $(database) 39 | vmImage: $(vmImage) 40 | environmentName: DEV 41 | 42 | - stage: QA 43 | variables: 44 | - name: database 45 | value: QA 46 | - name: schema 47 | value: misc 48 | jobs: 49 | - template: deploy.yml 50 | parameters: 51 | jobName: QA 52 | databaseName: $(database) 53 | vmImage: $(vmImage) 54 | environmentName: QA 55 | 56 | - stage: PROD 57 | variables: 58 | - name: database 59 | value: PROD 60 | jobs: 61 | - template: deploy.yml 62 | parameters: 63 | jobName: PROD 64 | databaseName: $(database) 65 | vmImage: $(vmImage) 66 | environmentName: PROD 67 | -------------------------------------------------------------------------------- /SnowSQL_CICD/sqlfluff_pr_check.yml: -------------------------------------------------------------------------------- 1 | # This pipeline uses SQLFluff to lint Snowflake SQL code during a pull request 2 | # Note that this uses Azure DevOps flavoured YAML but could easily be modified to work with GitHub or GitLab 3 | 4 | name: Pull Request check using SQLFluff 5 | 6 | parameters: 7 | - name: jobName 8 | default: 'SnowflakeTest' 9 | - name: jobDisplay 10 | default: 'Lint repo with SQLFluff' 11 | 12 | pr: 13 | branches: 14 | include: 15 | - main 16 | 17 | pool: 18 | vmImage: 'ubuntu-latest' 19 | 20 | jobs: 21 | - job: ${{ parameters.jobName }} 22 | timeoutInMinutes: 10 23 | displayName: ${{ parameters.jobDisplay }} 24 | 25 | workspace: 26 | clean: outputs 27 | 28 | steps: 29 | # Checkout repo 30 | - checkout: self 31 | fetchDepth: 10 32 | clean: true 33 | 34 | # Download and Install SQLFluff 35 | - script: | 36 | pip install --upgrade pip 37 | pip install sqlfluff 38 | displayName: Download and Install SQLFluff 39 | 40 | 
# Lint SQL 41 | - script: | 42 | git ls-files | grep '\.sql$' | xargs sqlfluff lint --dialect snowflake 43 | displayName: Analyzing the code with SQLFluff 44 | -------------------------------------------------------------------------------- /dbt/dbt_python_model_example.py: -------------------------------------------------------------------------------- 1 | def calculate_checksum_digit(sscc: str) -> str: 2 | """Calculates and concatenates a checksum digit 3 | to a 17-character string using modulus 10""" 4 | 5 | sscc = sscc.strip() 6 | if not sscc: 7 | return 'BAD INPUT' 8 | 9 | try: 10 | digits = [int(d) for d in str(sscc) if d.isdigit()] 11 | if not digits: 12 | return 'BAD INPUT' 13 | 14 | weighted_digits = [(d * 3 if i % 2 == 0 else d) for i, d in enumerate(digits)] 15 | total_weighted_digits = sum(weighted_digits) 16 | check_digit = (10 - (total_weighted_digits % 10)) % 10 17 | return (str(sscc) + str(check_digit)) 18 | 19 | except (ValueError, TypeError): 20 | return 'BAD INPUT' 21 | 22 | def model(dbt, session): 23 | dbt.config(materialized='table', 24 | packages=['pandas']) 25 | 26 | df = dbt.ref('my_upstream_model') 27 | df = df.to_pandas() 28 | df = df.apply(lambda x: x.str.strip() if x.dtype == 'object' else x) 29 | 30 | df['CHECKSUM'] = df['PRE_CHECKSUM'].apply(calculate_checksum_digit) 31 | 32 | return df 33 | -------------------------------------------------------------------------------- /dbt/filter_dbt_catalog_query_snowflake.sql: -------------------------------------------------------------------------------- 1 | {% macro snowflake__get_catalog(information_schema, schemas) -%} 2 | 3 | {%- set relations_in_project = [] -%} 4 | 5 | {%- for node in graph.nodes.values() -%} 6 | {%- if node.resource_type == 'model' -%} 7 | {%- do relations_in_project.append(node.alias) -%} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | {%- for source in graph.sources.values() -%} 11 | {%- do relations_in_project.append(source.name) -%} 12 | {%- endfor -%} 13 | 14 | {%- set relations_in_project = set(relations_in_project) | list -%} 15 | 16 | {%- if (schemas | length) == 0 -%} 17 | {%- set query = "select 1 as id limit 0" -%} 18 | {%- else -%} 19 | 20 | {% set query %} 21 | 22 | with tables as ( 23 | 24 | select 25 | 26 | table_catalog as "table_database", 27 | table_schema as "table_schema", 28 | table_name as "table_name", 29 | table_type as "table_type", 30 | comment as "table_comment", 31 | table_owner as "table_owner", 32 | 'Clustering Key' as "stats:clustering_key:label", 33 | clustering_key as "stats:clustering_key:value", 34 | 'The key used to cluster this table' as "stats:clustering_key:description", 35 | (clustering_key is not null) as "stats:clustering_key:include", 36 | 'Row Count' as "stats:row_count:label", 37 | row_count as "stats:row_count:value", 38 | 'An approximate count of rows in this table' as "stats:row_count:description", 39 | (row_count is not null) as "stats:row_count:include", 40 | 'Approximate Size' as "stats:bytes:label", 41 | bytes as "stats:bytes:value", 42 | 'Approximate size of the table as reported by Snowflake' as "stats:bytes:description", 43 | (bytes is not null) as "stats:bytes:include", 44 | 'Last Modified' as "stats:last_modified:label", 45 | to_varchar(convert_timezone('UTC', last_altered), 'yyyy-mm-dd HH24:MI'||'UTC') as "stats:last_modified:value", 46 | 'The timestamp for last update/change' as "stats:last_modified:description", 47 | (last_altered is not null and table_type='BASE TABLE') as "stats:last_modified:include" 48 | 49 | from {{ information_schema }}.tables 50 
| 51 | where row_count > 0 52 | 53 | and ( 54 | {%- for schema in schemas -%} 55 | upper("table_schema") = upper('{{ schema }}') {%- if not loop.last %} or {% endif -%} 56 | {%- endfor -%} 57 | ) 58 | 59 | {%- if relations_in_project | length > 0 %} 60 | 61 | and coalesce(regexp_substr(table_name, '^(.+)_{1}[0-9]{8}$'), table_name) in ( 62 | {%- for rel in relations_in_project -%} upper('{{ rel }}') {%- if not loop.last %}, {% endif -%}{%- endfor -%} 63 | ) 64 | {% endif -%} 65 | 66 | ), 67 | 68 | columns as ( 69 | 70 | select 71 | 72 | table_catalog as "table_database", 73 | table_schema as "table_schema", 74 | table_name as "table_name", 75 | column_name as "column_name", 76 | ordinal_position as "column_index", 77 | data_type as "column_type", 78 | comment as "column_comment" 79 | 80 | from {{ information_schema }}.columns 81 | 82 | where ( 83 | {%- for schema in schemas -%} 84 | upper("table_schema") = upper('{{ schema }}') {%- if not loop.last %} or {% endif -%} 85 | {%- endfor -%} 86 | ) 87 | 88 | {%- if relations_in_project | length > 0 %} 89 | 90 | and coalesce(regexp_substr(table_name, '^(.+)_{1}[0-9]{8}$'), table_name) in ( 91 | {%- for rel in relations_in_project -%} upper('{{ rel }}') {%- if not loop.last %}, {% endif -%}{%- endfor -%} 92 | ) 93 | {% endif -%} 94 | 95 | ) 96 | 97 | select * from tables 98 | 99 | inner join columns using ("table_database", "table_schema", "table_name") 100 | 101 | order by "column_index" 102 | {%- endset -%} 103 | 104 | {%- endif -%} 105 | 106 | {%- do log(query) -%} 107 | {%- set results = run_query(query) -%} 108 | {%- do log(schemas ~ ' - rows returned: ' ~ results | length, True) -%} 109 | 110 | {{ return(results) }} 111 | 112 | {%- endmacro %} 113 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | snowflake-snowpark-python 2 | pandas 3 | polars 4 | plotly 5 | matplotlib 6 | seaborn 7 | SQLAlchemy 8 | ipykernel 9 | scikit-learn --------------------------------------------------------------------------------