├── Python
│   ├── Stack.ipynb
│   ├── connecting_to_sql_server_using_python.py
│   ├── connecting_to_snowflake_using_python.py
│   ├── compare_two_lists_for_differences.py
│   ├── Snowflake_Insert_Statements.py
│   ├── parse_xml_compare_differences.py
│   ├── load_json_to_snowflake.py
│   ├── pull_records_for_all_sql_tables.py
│   ├── sql_insert_statement_from_csv.py
│   ├── sql_style_join_csv.py
│   ├── Snowpark_Backload_API_Data.py
│   ├── determine_sql_field_length.py
│   ├── read_sql_server_write_snowflake.py
│   ├── Snowpark_Create_Stored_Procedure.py
│   └── Snowpark_Example_Backload_SQL_Server_Data.py
├── Docker
│   ├── requirements.txt
│   ├── docker-compose.yml
│   ├── Dockerfile
│   └── Populate_SQL_Server_Docker_Container.py
├── Shell
│   ├── Search_specific_branch_name.sh
│   ├── Search_specific_branch_name.ps1
│   ├── Create_gitignore_and_add_lines.sh
│   ├── create_gitignore_and_add_lines.ps1
│   ├── run_all_python_files_in_dir.ps1
│   ├── run_groovy_script_in_Docker.sh
│   ├── Pass_secret_at_runtime_to_py_script.ps1
│   ├── git_mv_multiple_files.ps1
│   └── Microsoft.PowerShell_profile.ps1
├── SQL
│   ├── Snowflake_Find_Duplicates.sql
│   ├── Load_CSV_to_Snowflake
│   │   ├── PUT.sql
│   │   ├── snowsql.sh
│   │   └── Snowflake_Worksheet_Load_CSV.sql
│   ├── Snowflake_Cloning.sql
│   ├── Snowflake_Time_Travel.sql
│   ├── Snowflake_Find_Missing_Dates.sql
│   ├── Snowflake_Merge_Into_Example.sql
│   ├── Snowflake_Flatten_JSON_Example.sql
│   ├── Snowflake_Clean_Staging_Area.sql
│   ├── Snowflake_Shorten_Huge_Union_Queries.sql
│   ├── Snowflake_Python_Stored_Procedure_Example.sql
│   ├── Snowflake_Account_Setup.sql
│   ├── Snowflake_Data_Pipeline_From_Internal_Stage.sql
│   ├── Snowflake_ForLoop_GrantPermissions.sql
│   ├── Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql
│   └── Snowflake_Basic_CDC_Pipeline_Using_Streams_Tasks.sql
├── .gitignore
├── requirements.txt
├── .gitattributes
├── .devcontainer
│   ├── config.fish
│   ├── requirements.txt
│   ├── Microsoft.PowerShell_profile.ps1
│   ├── devcontainer.json
│   └── Dockerfile
├── .sqlfluffignore
├── .github
│   └── workflows
│       ├── yamllint-ci.yml
│       ├── sqlfluff.yml
│       └── ruff.yml
├── .yamllint
├── SnowSQL_CICD
│   ├── build.yml
│   ├── sqlfluff_pr_check.yml
│   ├── snowsql.yml
│   └── deploy.yml
├── dbt
│   ├── dbt_python_model_example.py
│   └── filter_dbt_catalog_query_snowflake.sql
├── CI_Examples
│   ├── yamllint-pr.yml
│   ├── sqlfluff-pr.yml
│   ├── python-pr.yml
│   ├── python-ci.yml
│   ├── yamllint-ci.yml
│   └── sqlfluff-ci.yml
├── .sqlfluff
├── README.md
└── Fivetran
    └── disable_tables_with_zero_rows_fivetran_api.py
/Python/Stack.ipynb:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Docker/requirements.txt:
--------------------------------------------------------------------------------
1 | pyodbc
2 | sqlalchemy
3 | pandas
4 | numpy
--------------------------------------------------------------------------------
/Shell/Search_specific_branch_name.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | git branch -a | grep -i your_string_here
--------------------------------------------------------------------------------
/SQL/Snowflake_Find_Duplicates.sql:
--------------------------------------------------------------------------------
1 | select * from my_table
2 | qualify count(*) over (partition by primary_key) > 1;
3 |
--------------------------------------------------------------------------------
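For the same duplicate check outside Snowflake, here is a small pandas sketch (not part of the repo); the DataFrame and the primary_key column are hypothetical stand-ins for my_table.

import pandas as pd

# Hypothetical stand-in for my_table; primary_key 101 is duplicated.
df = pd.DataFrame({"primary_key": [101, 101, 102, 103], "amount": [5, 5, 9, 12]})

# keep=False flags every row whose primary_key appears more than once,
# mirroring the qualify count(*) over (partition by primary_key) > 1 filter.
duplicates = df[df.duplicated(subset="primary_key", keep=False)]
print(duplicates)
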
/Shell/Search_specific_branch_name.ps1:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env pwsh
2 | git branch -a | Select-String "string_youre_looking_for"
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .vscode/
3 | /.ruff_cache
4 | /.ipynb_checkpoints
5 | Snowflake_Azure_Blob_Auto_ingest_Snowpipe.sql
6 |
--------------------------------------------------------------------------------
/SQL/Load_CSV_to_Snowflake/PUT.sql:
--------------------------------------------------------------------------------
1 | !set variable_substitution=true;
2 | put file://&{csv_path} @&{stage} auto_compress=true;
3 |
--------------------------------------------------------------------------------
/SQL/Load_CSV_to_Snowflake/snowsql.sh:
--------------------------------------------------------------------------------
1 | snowsql -c dev -s my_schema -f PUT.sql -D csv_path=your_csv_path\your_csv.csv -D stage=my_stage
--------------------------------------------------------------------------------
/Shell/Create_gitignore_and_add_lines.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | cd /Users/johndoe/documents
3 | touch .gitignore
4 | echo '.env' >> .gitignore
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | snowflake-snowpark-python
2 | pandas
3 | polars
4 | plotly
5 | matplotlib
6 | seaborn
7 | SQLalchemy
8 | ipykernel
9 | scikit-learn
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.sql linguist-detectable=true
2 | *.yml linguist-detectable=true
3 | *.yml linguist-language=YAML
4 | *.ipynb linguist-detectable=false
--------------------------------------------------------------------------------
/Shell/create_gitignore_and_add_lines.ps1:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env pwsh
2 | Set-Location /Users/johndoe/documents/
3 | New-Item .gitignore
4 | Add-Content .gitignore '.env'
--------------------------------------------------------------------------------
/Shell/run_all_python_files_in_dir.ps1:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env pwsh
2 | foreach ($file in Get-ChildItem -Path C:\your\directory\here\*.py) {
3 | python $file.FullName
4 | }
--------------------------------------------------------------------------------
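A cross-platform Python sketch of the same idea, for machines without PowerShell; the directory path is a placeholder.

import subprocess
import sys
from pathlib import Path

# Placeholder directory; mirrors the PowerShell loop above.
for script in sorted(Path("C:/your/directory/here").glob("*.py")):
    print(f"running {script}")
    # Use the current interpreter so each script runs in the same environment.
    subprocess.run([sys.executable, str(script)], check=False)
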
/Shell/run_groovy_script_in_Docker.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Run a Groovy script in Docker
3 |
4 | docker run --rm -v "$(pwd):/home/groovy/scripts" -w /home/groovy/scripts groovy:latest groovy your_script.groovy
--------------------------------------------------------------------------------
/.devcontainer/config.fish:
--------------------------------------------------------------------------------
1 | # Activate oh-my-posh prompt:
2 | oh-my-posh init fish --config $POSH_THEME | source
3 |
4 | # NOTE: You can override the above env vars from the devcontainer.json "args" under the "build" key.
--------------------------------------------------------------------------------
/Shell/Pass_secret_at_runtime_to_py_script.ps1:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env pwsh
2 | Set-Item Env:my_password "yoUr_str0Ng_paSswoRd_heRe"; py myscript.py
3 |
4 | # Your Python script must contain the following:
5 | # import os
6 | # my_password = os.getenv('my_password')
--------------------------------------------------------------------------------
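For reference, a minimal sketch of what the hypothetical myscript.py mentioned in the comments could look like; only the env-var read is taken from the script above, the rest is illustrative.

import os

# Read the secret that the PowerShell one-liner exported before launching this script.
my_password = os.getenv("my_password")

if my_password is None:
    raise SystemExit("my_password was not passed in via the environment")

print("secret received (not printing the value)")
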
/.sqlfluffignore:
--------------------------------------------------------------------------------
1 | # SQLFluff doesn't work well with Snowflake loops, functions
2 | # or Python stored procedures. Ignoring those files here
3 | Snowflake_ForLoop_GrantPermissions.sql
4 | Snowflake_Python_Stored_Procedure_Example.sql
5 | Snowflake_Shorten_Huge_Union_Queries.sql
6 | Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql
7 | PUT.sql
8 | Snowflake_Time_Travel.sql
9 |
--------------------------------------------------------------------------------
/.devcontainer/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==1.5.2
2 | prefect==2.7.7
3 | prefect-sqlalchemy==0.2.2
4 | prefect-gcp[cloud_storage]==0.2.4
5 | protobuf
6 | pyarrow==10.0.1
7 | pandas-gbq==0.18.1
8 | psycopg2-binary==2.9.5
9 | sqlalchemy==1.4.46
10 | ipykernel
11 | polars
12 | dbt-core
13 | dbt-bigquery
14 | dbt-postgres
15 | dbt-snowflake
16 | pyspark
17 | # confluent-kafka==1.9.2
18 | snowflake-snowpark-python
19 | scikit-learn
20 | ruff
21 | sqlfluff
--------------------------------------------------------------------------------
/Shell/git_mv_multiple_files.ps1:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env pwsh
2 | # An example shell script to 'git mv' multiple files at once
3 |
4 | # All files from one folder to new folder
5 | mkdir my_new_folder
6 | Set-Location ./folder_your_files_are_in
7 | foreach ($file in Get-ChildItem *.sql) { git mv $file.name .\my_new_folder }
8 |
9 | # Move all folders inside one folder to another folder
10 | mkdir my_new_folder
11 | Set-Location ./folder_your_files_are_in
12 | Get-ChildItem .\my_old_folder\ | % { git mv $_.FullName .\my_new_folder\ }
--------------------------------------------------------------------------------
/.github/workflows/yamllint-ci.yml:
--------------------------------------------------------------------------------
1 | name: yamllint testing
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | lint:
7 | runs-on: ubuntu-latest
8 |
9 | steps:
10 | - name: Checkout repository
11 | uses: actions/checkout@v2
12 |
13 | - name: Set up Python
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: 3.x
17 |
18 |     - name: Install yamllint
19 |       run: pip install yamllint
20 |
21 |     - name: Run yamllint
22 |       run: git ls-files | grep \.yml | yamllint .
23 |
--------------------------------------------------------------------------------
/.github/workflows/sqlfluff.yml:
--------------------------------------------------------------------------------
1 | name: SQLFluff Testing
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | lint:
7 | runs-on: ubuntu-latest
8 |
9 | steps:
10 | - name: Checkout repository
11 | uses: actions/checkout@v2
12 |
13 | - name: Set up Python
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: 3.x
17 |
18 |     - name: Install dependencies
19 |       run: pip install sqlfluff
20 |
21 |     - name: Run SQLFluff
22 |       run: git ls-files | grep '\.sql$' | xargs sqlfluff lint --dialect snowflake
23 |
--------------------------------------------------------------------------------
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
1 | name: Ruff Testing
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | lint:
7 | runs-on: ubuntu-latest
8 |
9 | steps:
10 | - name: Checkout repository
11 | uses: actions/checkout@v2
12 |
13 | - name: Set up Python
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: 3.x
17 |
18 |     - name: Install dependencies
19 |       run: pip install ruff
20 |
21 |     - name: Test Ruff installation
22 |       run: ruff --version
23 |
24 |     - name: Run ruff
25 |       run: ruff check ./Python/
26 |
--------------------------------------------------------------------------------
/Docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.9'
2 |
3 | services:
4 | SQL-Server:
5 | image: mcr.microsoft.com/mssql/server:2022-latest
6 | container_name: SQL_Server_Dev_Environment
7 | restart: unless-stopped
8 | ports:
9 | - "1433:1433"
10 | environment:
11 | - ACCEPT_EULA=Y
12 | - SA_PASSWORD=-Your-Strong!Password@Here%
13 |
14 | # I can't actually get this to work due to our Windows auth/Active Directory situation at Cooke...
15 | # i.e. from within the container, my script doesn't know how to authenticate to our production SQL Server
16 | # python:
17 | # container_name: SQL_Server_Python_Script
18 | # build: ./
19 | # command: python3 ./SQL_Server_ForLoop.py
20 |
--------------------------------------------------------------------------------
/Python/connecting_to_sql_server_using_python.py:
--------------------------------------------------------------------------------
1 | # import modules
2 | import pyodbc
3 | import pandas as pd
4 |
5 | # set all rows and columns visible
6 | # pd.set_option('display.max_columns', None)
7 | # pd.set_option('display.max_rows', None)
8 |
9 |
10 | # server credentials
11 | server = "server"
12 | database = "database"
13 |
14 | # sql connection - uses AD to authenticate
15 | cnxn = pyodbc.connect(
16 | Trusted_Connection="Yes", Driver="{SQL Server}", Server=server, Database=database
17 | )
18 | cursor = cnxn.cursor()
19 |
20 | # stick your query inside the triple quotes
21 | query = """select top 10 * from database.dbo.table"""
22 |
23 | # load query to dataframe
24 | df_sql = pd.read_sql(query, cnxn)
25 | df_sql.head()
26 |
--------------------------------------------------------------------------------
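If the query needs user-supplied values, here is a hedged sketch of parameterising it with pyodbc's "?" placeholders instead of string formatting; the server, database, table, and status column are placeholders, and the connection pattern is the same as in the script above.

import pandas as pd
import pyodbc

# Same trusted-connection pattern as above (placeholder names).
cnxn = pyodbc.connect(
    Trusted_Connection="Yes", Driver="{SQL Server}", Server="server", Database="database"
)

# pyodbc uses "?" markers; values are passed separately, which avoids quoting issues.
query = "select top 10 * from database.dbo.table where status = ?"
df_filtered = pd.read_sql(query, cnxn, params=["OPEN"])
print(df_filtered.head())
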
/SQL/Snowflake_Cloning.sql:
--------------------------------------------------------------------------------
1 | /* How to clone data in Snowflake
2 | By: Martin Palkovic
3 | Date: 2022-06-10
4 |
5 | Description: Zero copy cloning is one of the awesome features of Snowflake.
6 | I like to use this feature to quickly create a development environment for
7 | testing */
8 |
9 | use role sysadmin;
10 | use warehouse reporting_wh;
11 | use database production;
12 | use schema dbo;
13 |
14 | /* clone database */
15 | create database my_cloned_db clone my_db;
16 |
17 | /* clone schema */
18 | create schema my_cloned_schema clone analytics_inventory;
19 |
20 | /* clone table */
21 | create table my_cloned_table clone main_inventory_table;
22 |
23 | /* cloning with time travel */
24 | create or replace table my_cloned_table clone main_inventory_table
25 | at (timestamp => '2022-06-10 9:30'::timestamp_ltz);
26 |
--------------------------------------------------------------------------------
/.yamllint:
--------------------------------------------------------------------------------
1 | yaml-files:
2 | - '*.yml'
3 | - '*.yaml'
4 | - '.yamllint'
5 |
6 | rules:
7 | braces: enable
8 | brackets: enable
9 | colons: enable
10 | commas: enable
11 | comments:
12 | level: warning
13 | comments-indentation:
14 | level: warning
15 | document-end: disable
16 | document-start: disable
17 | empty-lines: enable
18 | empty-values: disable
19 | hyphens: enable
20 | indentation: enable
21 | key-duplicates: enable
22 | key-ordering: disable
23 | new-line-at-end-of-file: enable
24 | new-lines: disable
25 | octal-values: disable
26 | quoted-strings: disable
27 | trailing-spaces: enable
28 | truthy:
29 | level: warning
30 | # 120 chars should be enough, but don't fail if a line is longer
31 | line-length:
32 | max: 120
33 | level: warning
34 |
--------------------------------------------------------------------------------
/SnowSQL_CICD/build.yml:
--------------------------------------------------------------------------------
1 | parameters:
2 | - name: jobName
3 | default: 'SnowflakeBuild'
4 | - name: jobDisplay
5 | default: 'Build artifacts for Snowflake deployment'
6 | - name: artifactName
7 | default: 'SnowflakeTest'
8 | - name: vmImage
9 | default: 'ubuntu-latest'
10 | - name: environmentName
11 | default: 'DEV'
12 |
13 | jobs:
14 | - job: ${{ parameters.jobName }}
15 | displayName: ${{ parameters.jobDisplay }}
16 | timeoutInMinutes: 10
17 | pool:
18 | vmImage: ${{ parameters.vmImage }}
19 | workspace:
20 | clean: outputs
21 | steps:
22 | # Publish artifacts
23 | - publish: $(System.DefaultWorkingDirectory)
24 | artifact: ${{ parameters.artifactName }}
25 | name: Artifacts
26 | displayName: Publish pipeline artifacts
27 |
--------------------------------------------------------------------------------
/SQL/Snowflake_Time_Travel.sql:
--------------------------------------------------------------------------------
1 | /* Title: Snowflake Time Travel
2 | By: Martin Palkovic
3 | Date: 2022-06-07
4 | Description: Snowflake has great time travel functionality, were you can easily restore
5 | a table to its state at a previous point in time. I have used this functionality with
6 | great success when a production table with 2 million records was deleted on accident!
7 | */
8 |
9 | show tables history;
10 |
11 | /* Note that you may need to rename the table */
12 | alter table my_table rename to my_table_whoops;
13 |
14 | /* specify the time */
15 | select
16 | acct_number,
17 | date
18 | from my_table at (timestamp => '2022-06-01 6:00'::timestamp_ltz);
19 |
20 | /* specify an offset, ex. 1 hour ago*/
21 | select
22 | acct_number,
23 | date
24 | from my_table at (offset => -60*60); --offset is in seconds here
25 |
--------------------------------------------------------------------------------
/Python/connecting_to_snowflake_using_python.py:
--------------------------------------------------------------------------------
1 | """ Import Modules """
2 | import os
3 | from dotenv import load_dotenv
4 | from snowflake import connector
5 | # import pandas as pd
6 |
7 | load_dotenv()
8 |
9 | # establish connection to Snowflake using .env file
10 | connection = connector.connect(
11 | user=os.getenv("SNOWFLAKE_USER"),
12 | password=os.getenv("SNOWFLAKE_PASSWORD"),
13 | account=os.getenv("SNOWFLAKE_ACCT"),
14 | role=os.getenv("SNOWFLAKE_ROLE"),
15 | warehouse="REPORTING_WH",
16 | )
17 |
18 | # sample SQL query, paste whatever you'd like in here
19 | sql_query = "select * from database.schema.table limit 10;"
20 |
21 | # execute the query
22 | cursor = connection.cursor()
23 | cursor.execute(sql_query)
24 |
25 | # load the data in to Pandas
26 | df = cursor.fetch_pandas_all()
27 | df.head()
28 |
--------------------------------------------------------------------------------
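A small follow-on sketch showing the same query with the cursor and connection closed deterministically; the credentials, warehouse, and query are the same placeholders as in the script above.

import os

from dotenv import load_dotenv
from snowflake import connector

load_dotenv()

connection = connector.connect(
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASSWORD"),
    account=os.getenv("SNOWFLAKE_ACCT"),
    role=os.getenv("SNOWFLAKE_ROLE"),
    warehouse="REPORTING_WH",
)

try:
    # The connector's cursor supports the context-manager protocol, so it closes itself here.
    with connection.cursor() as cursor:
        cursor.execute("select * from database.schema.table limit 10;")
        df = cursor.fetch_pandas_all()
        print(df.head())
finally:
    connection.close()
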
/Python/compare_two_lists_for_differences.py:
--------------------------------------------------------------------------------
1 | """Compare two lists for differences
2 | By: Martin Palkovic
3 | Date: 2022-02-09"""
4 | # ------------------------------
5 | # a common work task is to compare two database ID fields
6 | # against each other to determine which records exist
7 | # in one table but not another. This operation can take 10+
8 | # minutes to run in SQl and is syntactically heavy, but is
9 | # fast and easy in Python.
10 |
11 |
12 | # Copy and paste your fields below
13 | # to identify records that are unique to one of the tables
14 |
15 | list1 = ["red", "blue", "yellow", 7, 25] # copy and paste your values into here
16 | list2 = ["yellow", 7, "blue", 1, 5.4]
17 |
18 | # returns items that are in list1 but not in list2
19 | list_difference = [item for item in list1 if item not in list2]
20 | print(list_difference)
21 |
--------------------------------------------------------------------------------
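When the ID lists get large, the list comprehension above becomes O(n*m); a hedged set-based sketch of the same comparison (same sample values, order not preserved):

list1 = ["red", "blue", "yellow", 7, 25]
list2 = ["yellow", 7, "blue", 1, 5.4]

# Set difference: items in list1 but not in list2.
only_in_list1 = set(list1) - set(list2)

# Symmetric difference: items that appear in exactly one of the two lists.
in_one_but_not_both = set(list1) ^ set(list2)

print(only_in_list1)
print(in_one_but_not_both)
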
/Python/Snowflake_Insert_Statements.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | df = pd.read_csv(r"your/file/here.csv")
5 | df = df.replace({np.nan: "NULL"})
6 |
7 | print("successfully read csv!\n")
8 |
9 |
10 | def sql_insert_statement_from_dataframe(source, target):
11 | print("insert into " + target + "(" + str(", ".join(source.columns)) + ") values ")
12 | for i, x in source.iterrows():
13 | values = x.values
14 | formatted_values = []
15 | for val in values:
16 | if val == "NULL":
17 | formatted_values.append(val)
18 | else:
19 | formatted_values.append("'" + str(val) + "'")
20 | if i == len(source) - 1:
21 | print("(" + str(", ".join(formatted_values)) + ");")
22 | else:
23 | print("(" + str(", ".join(formatted_values)) + "),")
24 |
25 |
26 | sql_insert_statement_from_dataframe(df, "my_db.my_schema.my_table")
27 |
--------------------------------------------------------------------------------
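One edge case the script above does not cover is values that contain single quotes; a hedged standalone helper showing the value-formatting step with the quotes doubled, per SQL string-literal rules (the function name is made up):

def format_sql_value(val) -> str:
    """Format one value for a SQL insert, escaping embedded single quotes."""
    if val == "NULL":
        return "NULL"
    return "'" + str(val).replace("'", "''") + "'"

# Example: a value with an apostrophe survives intact.
print(format_sql_value("St. John's"))   # 'St. John''s'
print(format_sql_value("NULL"))         # NULL
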
/.devcontainer/Microsoft.PowerShell_profile.ps1:
--------------------------------------------------------------------------------
1 | Import-Module posh-git
2 | Import-Module PSFzf -ArgumentList 'Ctrl+t', 'Ctrl+r'
3 | Import-Module z
4 | Import-Module Terminal-Icons
5 |
6 | Set-PSReadlineKeyHandler -Key Tab -Function MenuComplete
7 |
8 | $env:POSH_GIT_ENABLED=$true
9 | oh-my-posh init pwsh --config $env:POSH_THEME | Invoke-Expression
10 |
11 | # NOTE: You can override the above env var from the devcontainer.json "args" under the "build" key.
12 | function PassGen {
13 | param (
14 | [int]$Length = 20
15 | )
16 |
17 | $ValidCharacters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_-+='
18 | $Password = ''
19 |
20 | for ($i = 0; $i -lt $Length; $i++) {
21 | $RandomIndex = Get-Random -Minimum 0 -Maximum $ValidCharacters.Length
22 | $Password += $ValidCharacters[$RandomIndex]
23 | }
24 |
25 | return $Password
26 | }
27 |
28 | Set-Alias -Name pg -Value PassGen
29 | # Aliases
30 | Set-Alias -Name ac -Value Add-Content
--------------------------------------------------------------------------------
/Docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # I wrote this Dockerfile to run the Python script inside of my container, but it doesn't work because of our Windows auth at work
2 | FROM python:3.8-slim
3 |
4 | ENV DEBIAN_FRONTEND="noninteractive"\
5 | ACCEPT_EULA="y"
6 |
7 | # install system dependencies
8 | # Microsoft SQL Server Prerequisites
9 | RUN apt-get update -y \
10 | && apt-get install -y gcc curl gnupg build-essential\
11 | unixodbc unixodbc-dev tdsodbc freetds-common freetds-bin freetds-dev\
12 | && curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - \
13 | && curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list \
14 | && apt-get update \
15 | && apt-get install -y --no-install-recommends locales apt-transport-https\
16 | && apt-get -y --no-install-recommends install msodbcsql18 unixodbc-dev
17 |
18 | WORKDIR /usr/src/app
19 |
20 | COPY requirements.txt ./
21 |
22 | RUN pip install --no-cache-dir -r requirements.txt
23 |
24 | COPY . .
25 |
26 | CMD [ "python", "./SQL_Server_ForLoop.py" ]
27 |
--------------------------------------------------------------------------------
/SQL/Snowflake_Find_Missing_Dates.sql:
--------------------------------------------------------------------------------
1 | /* Query: find missing dates in a range of dates
2 | By: Martin Palkovic
3 | Date: 2022-08-19
4 | System: Snowflake
5 | Description: Say, for example, you have a report, and there is data missing for certain dates
6 | on that report. You can use this query to identify dates where you may have missing data
7 | */
8 |
9 | use role sysadmin;
10 | use warehouse my_warehouse;
11 | use database my_db;
12 | use schema my_schema;
13 |
14 | with find_date_gaps (my_date_field, rownum) as (
15 | select
16 | my_date_field,
17 | row_number() over (order by my_date_field asc) as rownum
18 | from your_table
19 | where my_date_field > 'yyyy-mm-dd'
20 | group by my_date_field
21 | )
22 |
23 | select
24 | dateadd(dd, 1, fdg1.my_date_field) as startofgap,
25 | dateadd(dd, -1, fdg2.my_date_field) as endofgap
26 | from find_date_gaps as fdg1
27 | inner join find_date_gaps as fdg2
28 | on fdg1.rownum = (fdg2.rownum - 1)
29 | where datediff(dd, fdg1.my_date_field, dateadd(dd, -1, fdg2.my_date_field)) != 0;
30 |
--------------------------------------------------------------------------------
/dbt/dbt_python_model_example.py:
--------------------------------------------------------------------------------
1 | def calculate_checksum_digit(sscc: str) -> str:
2 | """Calculates and concats a checksum digit
3 | to a 17 character string using modulus 10 """
4 |
5 | sscc = sscc.strip()
6 | if not sscc:
7 | return 'BAD INPUT'
8 |
9 | try:
10 | digits = [int(d) for d in str(sscc) if d.isdigit()]
11 | if not digits:
12 | return 'BAD INPUT'
13 |
14 | weighted_digits = [(d * 3 if i % 2 == 0 else d) for i, d in enumerate(digits)]
15 | total_weighted_digits = sum(weighted_digits)
16 | check_digit = (10 - (total_weighted_digits % 10)) % 10
17 | return (str(sscc) + str(check_digit))
18 |
19 | except(ValueError, TypeError):
20 | return 'BAD INPUT'
21 |
22 | def model(dbt, session):
23 | dbt.config(materialized = 'table',
24 | packages = ['pandas'])
25 |
26 | df = dbt.ref('my_upstream_model')
27 | df = df.to_pandas()
28 | df = df.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
29 |
30 | df['CHECKSUM'] = df['PRE_CHECKSUM'].apply(calculate_checksum_digit)
31 |
32 | return df
33 |
--------------------------------------------------------------------------------
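The checksum helper can be sanity-checked outside dbt; a quick hedged example, pasted alongside the calculate_checksum_digit function above (the input string is made up):

# 17 ones: the odd positions (weight 3) sum to 27, the rest to 8, total 35,
# so the check digit is (10 - 35 % 10) % 10 = 5.
assert calculate_checksum_digit("11111111111111111") == "111111111111111115"
assert calculate_checksum_digit("") == "BAD INPUT"
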
/SnowSQL_CICD/sqlfluff_pr_check.yml:
--------------------------------------------------------------------------------
1 | # This pipeline uses SQLFluff to lint Snowflake SQL code during a pull request
2 | # Note that this uses Azure DevOps flavoured YAML but could easily be modified to work with Github or GitLab
3 |
4 | name: Pull Request check using SQLFluff
5 |
6 | parameters:
7 | - name: jobName
8 | default: 'SnowflakeTest'
9 | - name: jobDisplay
10 | default: 'Lint repo with SQLFluff'
11 |
12 | pr:
13 | branches:
14 | include:
15 | - main
16 |
17 | pool:
18 | vmImage: 'ubuntu-latest'
19 |
20 | jobs:
21 | - job: ${{ parameters.jobName }}
22 | timeoutInMinutes: 10
23 | displayName: ${{ parameters.jobDisplay }}
24 |
25 | workspace:
26 | clean: outputs
27 |
28 | steps:
29 | # Checkout repo
30 | - checkout: self
31 | fetchDepth: 10
32 | clean: true
33 |
34 |   # Install SQLFluff
35 |   - script: |
36 |       pip install --upgrade pip
37 |       pip install sqlfluff
38 |     displayName: Download and Install SQLFluff
39 |
40 |   # Lint SQL
41 |   - script: |
42 |       git ls-files | grep '\.sql$' | xargs sqlfluff lint --dialect snowflake
43 | displayName: Analyzing the code with SQLFluff
44 |
--------------------------------------------------------------------------------
/CI_Examples/yamllint-pr.yml:
--------------------------------------------------------------------------------
1 | # Azure CI pipeline that lints all YAML files during a PR.
2 | # See the yamllint GitHub for more info: [yamllint](https://github.com/adrienverge/yamllint)
3 | name: yamllint PR Check
4 |
5 | parameters:
6 | - name: jobName
7 | default: 'yamllintPR'
8 | - name: jobDisplay
9 | default: 'Lint .yml files with yamllint'
10 |
11 | trigger:
12 | branches:
13 | include:
14 | - main
15 |
16 | pool:
17 | vmImage: 'ubuntu-latest'
18 |
19 | jobs:
20 | - job: ${{ parameters.jobName}}
21 | timeoutInMinutes: 10
22 | displayName: ${{ parameters.jobDisplay }}
23 |
24 | workspace:
25 | clean: outputs
26 |
27 | steps:
28 | # Checkout repo
29 | - checkout: self
30 | fetchDepth: 1
31 | clean: true
32 |
33 | # Install yamllint
34 | - script: |
35 | pip install yamllint
36 | displayName: Download yamllint
37 |
38 | # Test yamllint installation and list all .yml files in the repo
39 | - script: |
40 | yamllint --version
41 | git ls-files | grep \.yml
42 | displayName: Test yamllint Install, list all .yml files
43 |
44 | # Lint YAML
45 | - script: |
46 | git ls-files | grep \.yml | yamllint .
47 | displayName: Lint .yml files
48 |
--------------------------------------------------------------------------------
/CI_Examples/sqlfluff-pr.yml:
--------------------------------------------------------------------------------
1 | # Azure CI pipeline that lints all SQL files during a PR.
2 | # See the SQLFluff GitHub for more info: [SQLFluff](https://github.com/sqlfluff/sqlfluff)
3 | name: SQLFluff PR Check
4 |
5 | parameters:
6 | - name: jobName
7 | default: 'SQLFluffPR'
8 | - name: jobDisplay
9 | default: 'Lint repo with SQLFluff'
10 |
11 | trigger:
12 | branches:
13 | include:
14 | - main
15 |
16 | pool:
17 | vmImage: 'ubuntu-latest'
18 |
19 | jobs:
20 | - job: ${{ parameters.jobName }}
21 | timeoutInMinutes: 30
22 | displayName: ${{ parameters.jobDisplay }}
23 |
24 | workspace:
25 | clean: outputs
26 |
27 | steps:
28 | # Checkout repo
29 | - checkout: self
30 | fetchDepth: 1
31 | clean: true
32 |
33 | # Install SQLFluff
34 | - script: |
35 | pip install sqlfluff
36 | displayName: Download and Install SQLFluff
37 |
38 | # Test SQLFluff Installation and list all files in repo
39 | - script: |
40 | sqlfluff --version
41 | git ls-files | grep \.sql
42 | displayName: Test SQLFluff Install, List all files for CI
43 |
44 | # Lint SQL
45 | - script: |
46 |       git ls-files | grep '\.sql$' | xargs sqlfluff lint --dialect snowflake
47 | displayName: Analyzing the code with SQLFluff
48 |
--------------------------------------------------------------------------------
/SQL/Snowflake_Merge_Into_Example.sql:
--------------------------------------------------------------------------------
1 | /* Title: Example MERGE INTO statement for incremental loading into Snowflake
2 | By: Martin Palkovic
3 | Date: 2022-10-20
4 | Description: With large datasets, you'll often want to implement an incremental load to
5 | improve performance in your data pipeline. The code below will prevent duplicates in your load,
6 | while only adding new records and updating existing records if changes exist. Note that this code excludes
7 | the database name from the fully qualified table name - that is deliberate so that this code can be run against
8 | a development database first. The database name is set in the environment extensions of your pipeline tool.
9 |
10 | -- This is a minimum reproducible example of code I've used in production.
11 | */
12 |
13 | merge into
14 | my_schema.my_table as destination
15 |
16 | using (
17 | select *
18 | from my_schema.my_staging_table
19 | qualify row_number() over (
20 | partition by my_unique_sk
21 | order by created_date desc
22 | ) = 1
23 | ) as source
24 | on (source.my_unique_sk = destination.my_unique_sk)
25 |
26 | when matched then
27 | update
28 | set
29 | destination.my_unique_sk = source.my_unique_sk,
30 | destination.order_id = source.order_id,
31 | destination.ship_date = source.ship_date
32 |
33 | when not matched
34 | then insert
35 | (
36 | my_unique_sk,
37 | order_id,
38 | ship_date
39 | )
40 | values
41 | (
42 | source.my_unique_sk,
43 | source.order_id,
44 | source.ship_date
45 | );
46 |
--------------------------------------------------------------------------------
/Python/parse_xml_compare_differences.py:
--------------------------------------------------------------------------------
1 | """
2 | Name: Parse XML, extract a field, compare that field to a field from a csv for diffs
3 | By: Martin Palkovic
4 | Date: 2022-08-18
5 | Description:
6 | """
7 |
8 | # Import Modules
9 | import pandas as pd
10 |
11 | # Paste your xml here
12 | xml = """
13 |
14 |
15 |
16 |
17 | 1
18 | warehouse1
19 | 1
20 | 127
21 | 9.16
22 | 08/16/2022 15:38:55
23 |
24 |
25 |
26 |
27 | 2
28 | warehouse2
29 | 2
30 | 450
31 | 13.3
32 | 08/17/2022 15:39:26
33 |
34 |
35 |
36 |
37 | """
38 |
39 | # Parse XML
40 | df = pd.read_xml(xml, xpath=".//Property")
41 |
42 | # Extract only the columns we need from the XML
43 | df_pallet = df.loc[df["name"] == "Pallet"]
44 |
45 | # Read CSV
46 | df_csv = pd.read_csv(r"your_csv_here.csv")
47 |
48 | # Convert values to Python list, cast to integer
49 | pallet = df_pallet["Property"].tolist()
50 | pallet = [int(i) for i in pallet]
51 | csv = df_csv["Pallet"].tolist()
52 |
53 | # Compare differences
54 | print([i for i in pallet if i not in csv])
55 |
--------------------------------------------------------------------------------
/Shell/Microsoft.PowerShell_profile.ps1:
--------------------------------------------------------------------------------
1 | # This is an example of my Microsoft PowerShell profile. It sets up the Oh-My-Posh terminal theme,
2 | # and contains the following user defined functions:
3 | # PassGen: Generates random strong passwords
4 | # Create-OpenInVSCode: Creates and opens a file in VS Code using one simple command
5 |
6 | Set-Item -Path Env:TERMINAL_THEME -Value "https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/night-owl.omp.json"
7 |
8 | Import-Module Terminal-Icons
9 |
10 | Set-PSReadlineKeyHandler -Key Tab -Function MenuComplete
11 |
12 | oh-my-posh init pwsh --config $env:TERMINAL_THEME | Invoke-Expression
13 |
14 | # Password Generator
15 | function PassGen {
16 | param (
17 | [int]$Length = 20
18 | )
19 |
20 | $ValidCharacters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()_-+='
21 | $Password = ''
22 |
23 | for ($i = 0; $i -lt $Length; $i++) {
24 | $RandomIndex = Get-Random -Minimum 0 -Maximum $ValidCharacters.Length
25 | $Password += $ValidCharacters[$RandomIndex]
26 | }
27 |
28 | return $Password
29 | }
30 |
31 | # Alias for PassGen
32 | Set-Alias -Name pg -Value PassGen
33 |
34 | # ---
35 |
36 | # Create and open file in VS Code
37 | function Create-OpenInVSCode {
38 | param (
39 | [Parameter(Mandatory = $true)]
40 | [String]$newfile
41 | )
42 |
43 | code (new-item $newfile)
44 | }
45 |
46 | # Aliases for Create-OpenInVSCode
47 | Set-Alias -Name new-file -Value Create-OpenInVSCode
48 | Set-Alias -Name nf -Value Create-OpenInVSCode
49 |
--------------------------------------------------------------------------------
/Python/load_json_to_snowflake.py:
--------------------------------------------------------------------------------
1 | """Example script to load multiple JSONs to a named Snowflake staging area,
2 | then copy the JSONs into a Snowflake table
3 | By: Martin Palkovic
4 | Date: 2022-07-28
5 | Description: Sometimes in a dev environment,
6 | I need to manipulate a JSON file to see the effect those changes
7 | will have on my data pipeline. Here's a quick script I wrote
8 | to batch load json files into Snowflake, after I've altered some of the fields
9 | """
10 |
11 | import os
12 | from snowflake import connector
13 |
14 | from dotenv import load_dotenv
15 |
16 | load_dotenv()
17 |
18 | # folder containing your json files
19 | root = r"C:\Directory\containing\JSON\files"
20 |
21 | # Connect to your Snowflake account
22 | cnxn = connector.connect(
23 | user=os.getenv("SNOWFLAKE_USER"),
24 | password=os.getenv("SNOWFLAKE_PASSWORD"),
25 | account=os.getenv("SNOWFLAKE_ACCT"),
26 | role=os.getenv("SNOWFLAKE_ROLE"),
27 | warehouse="REPORTING_WH",
28 | )
29 |
30 | cursor = cnxn.cursor()
31 | cursor.execute("create or replace stage MY_STAGE;")
32 | cursor.execute("use role SYSADMIN;")
33 |
34 | for file in os.listdir(root):
35 | full_path = os.path.join(root, file)
36 | cursor.execute(f"put file://{full_path} @MY_STAGE;")
37 |
38 | copy_statement = file + ".gz"
39 | cursor.execute(
40 | f"""copy into EXAMPLE_TABLE (JSON_DATA, INSERT DATE)
41 | from (select t.$1,
42 | current_timestamp()
43 | from @MY_STAGE/{copy_statement} t)
44 | file_format = (type = JSON);"""
45 | )
46 | cursor.close()
47 | cnxn.close()
48 |
--------------------------------------------------------------------------------
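os.listdir() returns everything in the folder, not just JSON; a hedged tweak of the file-selection step that only picks up .json files (same placeholder directory, and in the real script each path would feed the PUT statement above):

from pathlib import Path

root = Path(r"C:\Directory\containing\JSON\files")

# Only stage actual .json files, skipping anything else sitting in the folder.
for json_file in sorted(root.glob("*.json")):
    print(f"would PUT file://{json_file} @MY_STAGE;")
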
/SQL/Snowflake_Flatten_JSON_Example.sql:
--------------------------------------------------------------------------------
1 | /**********************************************************************************************************/
2 | -- Query: Flatten JSON to analytics view in Snowflake
3 | -- CreateBy: Martin Palkovic
4 | -- Create date: 2021-05-03
5 | -- Description: SQL code for creating a materialized view in Snowflake from a JSON in your staging area
6 | -- Modified by:
7 | -- Modify date:
8 | -- Mod Reason:
9 | /***********************************************************************************************************/
10 |
11 | create or replace materialized view my_db.schema.my_view
12 | as
13 | select
14 | jsn.value:Id::string as id,
15 | jsn.value:TotalAmount::number(10, 2) as total_amount,
16 | jsn.value:Cash::boolean as cash,
17 | jsn.value:TransactionDate::date as transaction_date
18 | from staging_area.schema.my_table,
19 | lateral flatten(input => json_data) as jsn
20 |
21 | qualify row_number()
22 | over (
23 | partition by jsn.value:Id
24 | order by jsn.value:Id
25 | )
26 | = 1;
27 |
28 | /*
29 | Input:
30 | Row JSON_DATA
31 | 1 [{"Id": 1,"TotalAmount": 42.75, "Cash": true,"TransactionDate": "2022-03-25T18:44:46.54"}]
32 | 2 [{"Id":2, "TotalAmount": 57.99, "Cash": false, "TransactionDate": "2022-03-28T12:24:33.12"}]
33 | 3 [{"Id": 1,"TotalAmount": 42.75, "Cash": true,"TransactionDate": "2022-03-25T18:44:46.54"}]
34 | 4 [{"Id": 3, "TotalAmount": 100.25, "Cash": false, "TransactionDate": "2022-04-01T06:10:15.30"}]
35 |
36 | Output:
37 | ID Total_Amount Cash Transaction_Date
38 | 1 42.75 True 2022-03-25
39 | 2 57.99 False 2022-03-28
40 | 3 100.25 False 2022-04-01
41 | */
42 |
--------------------------------------------------------------------------------
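The same flatten-and-dedupe logic can be prototyped in pandas before building the view; a hedged sketch using the sample rows from the comment block above:

import pandas as pd

# The four sample records from the comment block above.
records = [
    {"Id": 1, "TotalAmount": 42.75, "Cash": True, "TransactionDate": "2022-03-25T18:44:46.54"},
    {"Id": 2, "TotalAmount": 57.99, "Cash": False, "TransactionDate": "2022-03-28T12:24:33.12"},
    {"Id": 1, "TotalAmount": 42.75, "Cash": True, "TransactionDate": "2022-03-25T18:44:46.54"},
    {"Id": 3, "TotalAmount": 100.25, "Cash": False, "TransactionDate": "2022-04-01T06:10:15.30"},
]

df = pd.DataFrame(records)
df["TransactionDate"] = pd.to_datetime(df["TransactionDate"]).dt.date

# Mirrors the qualify row_number() = 1 filter: keep one row per Id.
deduped = df.drop_duplicates(subset="Id").reset_index(drop=True)
print(deduped)
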
/Python/pull_records_for_all_sql_tables.py:
--------------------------------------------------------------------------------
1 | """Title: Data Pull for all views in SQL database
2 | By: Martin Palkovic
3 | Date: 2022-11-08
4 | Description: Script to loop through every view in my_db and pull 100 records.
5 | The Business Analyst for a project at work asked for the structure of
6 | each my_db table, this was the fastest way to do it
7 | """
8 |
9 | # import modules
10 | from sqlalchemy.engine import URL
11 | from sqlalchemy import create_engine
12 |
13 | import pandas as pd
14 |
15 | # SQL Server Connection - uses Active Directory to authenticate
16 | driver = "SQL Server"
17 | server = "my_server"
18 | database = "my_db"
19 | schema = "dbo"
20 |
21 |
22 | # Define connection function
23 | def sqlalchemy_cnxn(driver, server, db):
24 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}"
25 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection})
26 | engine = create_engine(url)
27 | return engine
28 |
29 |
30 | engine = sqlalchemy_cnxn(driver, server, database)
31 |
32 | list_of_views = "SELECT name FROM sys.views"
33 |
34 | my_server_views = pd.read_sql(list_of_views, engine)
35 | list_of_sql_views = sorted(my_server_views["name"].to_list())
36 | list_of_sql_views = [
37 | x for x in list_of_sql_views if x != "DailySensorReadings"
38 | ]
39 | # I had one table with 50M + rows that was causing performance issues, I removed it here
40 |
41 | for view in list_of_sql_views:
42 | try:
43 | query = f"SELECT TOP 100 * FROM {database}.{schema}.{view}"
44 | results = engine.execute(query)
45 | df = pd.read_sql(query, engine)
46 | if len(df) > 0:
47 | df.to_csv(f"{view}.csv")
48 | else:
49 | pass
50 | except Exception:
51 | print(f"failed to generate data for view {view}")
52 |
--------------------------------------------------------------------------------
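One caveat with the script above: Engine.execute() was removed in SQLAlchemy 2.0, and the call is redundant next to pd.read_sql anyway. A hedged sketch of the loop body reworked to run on both 1.4 and 2.0 (the helper name is made up; engine, database, schema, and view match the names above):

import pandas as pd
from sqlalchemy import text

def dump_view_sample(engine, database: str, schema: str, view: str) -> None:
    # Run the query on an explicit connection instead of Engine.execute().
    query = text(f"SELECT TOP 100 * FROM {database}.{schema}.{view}")
    with engine.connect() as conn:
        df = pd.read_sql(query, conn)
    if len(df) > 0:
        df.to_csv(f"{view}.csv")
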
/SnowSQL_CICD/snowsql.yml:
--------------------------------------------------------------------------------
1 | # This pipeline uses the SnowSQL CLI to deploy code to Snowflake that is merged to main after PR approval.
2 | # Note that this is the 'parent' pipeline, which calls the build.yml and deploy.yml files
3 | # Note that this uses Azure DevOps flavoured YAML but could easily be modified to work with Github or GitLab
4 |
5 | name: Snowflake CD Pipeline
6 |
7 | variables:
8 | - group: SnowSQL
9 | - name: artifactName
10 | value: 'snowflakeTest'
11 | - name: vmImage
12 | value: 'ubuntu-latest'
13 |
14 | trigger:
15 | branches:
16 | include:
17 | - main
18 |
19 | stages:
20 | - stage: Build
21 | jobs:
22 | - template: build.yml
23 | parameters:
24 | jobName: 'BuildSnowflakeObjects'
25 | artifactName: $(artifactName)
26 | vmImage: $(vmImage)
27 |
28 | - stage: DEV
29 | variables:
30 | - name: database
31 | value: DEV
32 | - name: schema
33 | value: misc
34 | jobs:
35 | - template: deploy.yml
36 | parameters:
37 | jobName: DEV
38 | databaseName: $(database)
39 | vmImage: $(vmImage)
40 | environmentName: DEV
41 |
42 | - stage: QA
43 | variables:
44 | - name: database
45 | value: QA
46 | - name: schema
47 | value: misc
48 | jobs:
49 | - template: deploy.yml
50 | parameters:
51 | jobName: QA
52 | databaseName: $(database)
53 | vmImage: $(vmImage)
54 | environmentName: QA
55 |
56 | - stage: PROD
57 | variables:
58 | - name: database
59 | value: PROD
60 | jobs:
61 | - template: deploy.yml
62 | parameters:
63 | jobName: PROD
64 | databaseName: $(database)
65 | vmImage: $(vmImage)
66 | environmentName: PROD
67 |
--------------------------------------------------------------------------------
/CI_Examples/python-pr.yml:
--------------------------------------------------------------------------------
1 | name: Python Pull Request Check
2 |
3 | parameters:
4 | - name: jobName
5 | default: 'PythonCI'
6 | - name: jobDisplay
7 | default: 'Lint repo with Ruff + run all unit tests'
8 |
9 | trigger:
10 | branches:
11 | include:
12 | - main
13 |
14 | pool:
15 | vmImage: 'ubuntu-latest'
16 |
17 | jobs:
18 | - job: ${{ parameters.jobName }}
19 | timeoutInMinutes: 30
20 | displayName: ${{ parameters.jobDisplay }}
21 |
22 | workspace:
23 | clean: outputs
24 |
25 | steps:
26 | # Checkout repo
27 | - checkout: self
28 | fetchDepth: 1
29 | clean: true
30 |
31 | # Install Ruff
32 | - script: |
33 | pip install pytest pytest-azurepipelines pytest-cov ruff
34 | displayName: Install Pytest, Pytest Code Coverage and Ruff
35 |
36 | # Test Ruff Installation and list all files in repo
37 | - script: |
38 | echo "Ruff Version:" && ruff --version
39 | echo "Pytest Version:" && pytest --version
40 | echo "Pytest Coverage Version:" && pytest-cov --version
41 | echo "Pytest Azure Pipelines Version:" && pytest-azurepipelines --version
42 | git ls-files | grep '\.py$'
43 | displayName: Test Installs, List all files for CI
44 |
45 |   # Lint Python
46 |   - script: |
47 |       git ls-files | grep '\.py$' | xargs ruff check
48 | displayName: Analyzing the code with Ruff
49 | continueOnError: true
50 |
51 | - script: |
52 | pytest tests/ --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-report=xml
53 | displayName: Run all Python unit tests
54 | condition: always()
55 | continueOnError: false
56 |
57 | - task: PublishCodeCoverageResults@1
58 | inputs:
59 | codeCoverageTool: Cobertura
60 | summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml'
61 |
--------------------------------------------------------------------------------
/SQL/Snowflake_Clean_Staging_Area.sql:
--------------------------------------------------------------------------------
1 | /*******************************************************************/
2 | -- Procedure: sp_clean_stage
3 | -- Created By: Martin Palkovic
4 | -- Create date: 2022-08-16
5 | -- Organization: Cooke Inc.
6 | -- Summary: Delete files from a named Snowflake staging area
7 | -- Description: In data pipelines, we sometimes stick files in a named
8 | -- Snowflake internal staging area - occasionally, you'll want to purge the
9 | -- files from here. Append this stored procedure call as the last step in your pipeline
10 | -- to keep your staging area clean
11 | /*******************************************************************/
12 | use warehouse REPORTING_WH;
13 | use database STAGING_DEV;
14 | use schema NS_LANDING;
15 |
16 | create or replace procedure sp_clean_stage(
17 | stage_name varchar, DAYS number, DRY_RUN boolean
18 | )
19 | returns varchar
20 | language sql
21 | execute as caller
22 | as
23 | $$
24 | declare
25 | ListFiles resultset;
26 | LastModified date;
27 | RemovedCount number := 0;
28 | TotalCount number := 0;
29 | begin
30 | ListFiles := (execute immediate 'ls @' || stage_name );
31 | let C1 cursor for ListFiles;
32 | for files in C1 do
33 | TotalCount := TotalCount + 1;
34 | LastModified := to_date(left( files."last_modified", length(files."last_modified") - 4 ), 'DY, DD MON YYYY HH24:MI:SS' );
35 | if (LastModified <= dateadd( 'day', -1 * days, current_timestamp())) then
36 | RemovedCount := RemovedCount + 1;
37 | if (not dry_run) then
38 | execute immediate 'rm @' || files."name";
39 | end if;
40 | end if;
41 | end for;
42 | return RemovedCount || ' of ' || TotalCount || ' files ' || iff(dry_run,'will be','were') || ' deleted.';
43 | end;
44 | $$;
45 |
46 | -- Run Stored Procedure
47 | -- use database my_db;
48 | -- call sp_clean_stage('my_stage', 14, false);
49 |
--------------------------------------------------------------------------------
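For completeness, a hedged Python sketch of calling the procedure from the snowflake connector, reusing the env-var pattern from the Python folder; the database, stage name, and retention days mirror the commented call above and are placeholders.

import os

from dotenv import load_dotenv
from snowflake import connector

load_dotenv()

cnxn = connector.connect(
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASSWORD"),
    account=os.getenv("SNOWFLAKE_ACCT"),
    role=os.getenv("SNOWFLAKE_ROLE"),
    warehouse="REPORTING_WH",
)

with cnxn.cursor() as cursor:
    cursor.execute("use database my_db;")
    # dry_run = true reports what would be removed without deleting anything.
    cursor.execute("call sp_clean_stage('my_stage', 14, true);")
    print(cursor.fetchone()[0])
cnxn.close()
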
/SQL/Snowflake_Shorten_Huge_Union_Queries.sql:
--------------------------------------------------------------------------------
1 | /**********************************************************************/
2 | -- Title: How to shorten a huge union query
3 | -- By: Martin Palkovic
4 | -- Date: 2022-11-25
5 | -- Description: Have you encountered a production SQL query with a large number of unions,
6 | -- and very little changing between the queries except perhaps the database and/or schema name?
7 | -- In this example, you can loop over the COMPANY_NAME field in MY_TABLE to create
8 | -- one select statement per 'COMPANY_NAME', union them together, and return the results
9 | -- in one go. The first implementation of this at work reduced a 300 line query to ~ 40 lines!
10 | /*********************************************************************/
11 |
12 | use role sysadmin;
13 | use warehouse my_wh;
14 | use database dev;
15 |
16 | -- Declare variables, loop over results of the 'organization' cursor variable
17 | declare
18 | small_sql varchar;
19 | big_sql varchar;
20 | organization cursor for (select COMPANY_NAME from MY_SCHEMA.MY_TABLE);
21 | my_results resultset;
22 | begin
23 | big_sql := '';
24 | -- In Snowflake, $$ is a multi-line string delimiter
25 | for company in organization do
26 | small_sql := $$select 'COMPANY_NAME' as Company
27 | , GL.ACTNUM as Account_Number
28 | , ACT.DESCRIPTION as Account_Name
29 | from COMPANY_NAME.General_Ledger_Table GL
30 |
31 | inner join COMPANY_NAME.Account_Name_Table ACT
32 | on ACT.ID = GL.ID
33 | $$;
34 | small_sql := replace(small_sql, 'COMPANY_NAME', company.COMPANY_NAME);
35 |
36 | if(big_sql != '') then
37 | big_sql := big_sql || ' union all ';
38 | end if;
39 |
40 | big_sql := big_sql || small_sql;
41 | end for;
42 |
43 | my_results := (execute immediate :big_sql);
44 | return table(my_results);
45 | end;
46 |
--------------------------------------------------------------------------------
/CI_Examples/python-ci.yml:
--------------------------------------------------------------------------------
1 | name: Python Continuous Integration
2 |
3 | parameters:
4 | - name: jobName
5 | default: 'PythonCI'
6 | - name: jobDisplay
7 | default: 'Lint .py files with Ruff'
8 |
9 | trigger:
10 | branches:
11 | include:
12 | - '*'
13 | exclude:
14 | - main
15 |
16 | pool:
17 | vmImage: 'ubuntu-latest'
18 |
19 | jobs:
20 | - job: ${{ parameters.jobName }}
21 | timeoutInMinutes: 10
22 | displayName: ${{ parameters.jobDisplay }}
23 |
24 | workspace:
25 | clean: outputs
26 |
27 | steps:
28 | # Checkout repo
29 | - checkout: self
30 | fetchDepth: 0
31 | clean: true
32 |
33 | # List Pipeline directory and Build Source Version
34 | - script: |
35 | ls -R $(System.DefaultWorkingDirectory)
36 | displayName: List directory contents
37 |
38 | - script: |
39 | echo "Build.SourceVersion: $(Build.SourceVersion)"
40 | displayName: Print Build.SourceVersion
41 |
42 | # Install Ruff
43 | - script: |
44 | pip install ruff
45 | displayName: Install Ruff
46 |
47 | # Test Ruff Installation and list all .py files in repo
48 | - script: |
49 | ruff --version
50 | echo "All changes in this commit:"
51 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.py$' ||
52 | echo "No Python files changed."
53 | displayName: Test Ruff Install, List all .py files
54 |
55 | # Lint Python
56 | - script: |
57 | changed=( $(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.py$') )
58 | if [[ "${#changed[@]}" -gt 0 ]]; then
59 | failed=false
60 | for filename in "${changed[@]}"; do
61 | if [[ -f "$filename" ]]; then
62 | echo "linting $filename"
63 | ruff check "$filename" || failed=true
64 | else
65 | echo "File not found: $filename"
66 | fi
67 | done
68 | if [[ $failed == true ]]; then
69 | exit 1
70 | fi
71 | fi
72 | displayName: Lint .py files
73 |
--------------------------------------------------------------------------------
/SQL/Load_CSV_to_Snowflake/Snowflake_Worksheet_Load_CSV.sql:
--------------------------------------------------------------------------------
1 | /*****************************************************/
2 | -- Worksheet: Loading a local csv to a Snowflake table
3 | -- Date: 2022-12-08
4 | /*****************************************************/
5 |
6 | /* Set session variables
7 | Enter the relevant database, schema, table and file format names here
8 | */
9 | set role_name = 'sysadmin';
10 | set wh = 'reporting_wh';
11 | set db = 'my_new_db';
12 | set sch = 'my_schema';
13 | set table_name = 'my_table';
14 | set fileformat = 'my_file_format';
15 | set stage_name = 'my_stage';
16 |
17 | /* initialize session */
18 | -- role, warehouse
19 | use role identifier($role_name);
20 | use warehouse identifier($wh);
21 |
22 | -- database
23 | create database if not exists identifier($db);
24 | use database identifier($db);
25 |
26 | -- schema
27 | create schema if not exists identifier($sch);
28 | use schema identifier($sch);
29 |
30 | -- file format
31 | create file format if not exists identifier($fileformat)
32 | type = csv
33 | field_delimiter = ','
34 | empty_field_as_null = true
35 | skip_header = 1
36 | comment = 'file format for loading csv files to Snowflake';
37 |
38 | -- stage
39 | create stage if not exists identifier($stage_name)
40 | file_format = $fileformat; --this may need to be typed out
41 | show stages;
42 |
43 | -- table;
44 | create table if not exists identifier($table_name) (
45 | field1 varchar,
46 | field2 number
47 | );
48 |
49 | /* the PUT command must be executed in the SnowSQL CLI!
50 | See the following documentation on this topic:
51 | https://docs.snowflake.com/en/user-guide/snowsql-install-config.html
52 | https://docs.snowflake.com/en/user-guide/data-load-internal-tutorial.html
53 |
54 | download link: https://developers.snowflake.com/snowsql/
55 | put file://c:\your\filepath\here\my_file.csv @my_stage;
56 | */
57 |
58 | /* confirm that the PUT command worked */
59 | list @my_stage;
60 |
61 | copy into identifier($table_name)
62 | from @my_stage/my_file.csv.gz --variables don't work in conjunction with the @ argument
63 | file_format = (format_name = $fileformat)
64 | on_error = 'skip_file';
65 |
66 | -- confirm the COPY INTO command worked
67 | select * from identifier($table_name);
68 |
--------------------------------------------------------------------------------
/SQL/Snowflake_Python_Stored_Procedure_Example.sql:
--------------------------------------------------------------------------------
1 | /************************************************************************/
2 | -- Script: Simple Python stored procedure in Snowflake
3 | -- Date: 2022-12-28
4 | -- Description: One thing I frequently do is compare one field to another,
5 | -- to determine if something exists in one dataset but not another. Does one table
6 | -- contain sales orders, pallet numbers, or report ID's that the other table
7 | -- does not?
8 |
9 | -- This stored procedure allows you to quickly determine that from within
10 | -- the Snowflake environment
11 | /************************************************************************/
12 |
13 | use role sysadmin;
14 | use warehouse reporting_wh;
15 | use database dev;
16 | use schema my_schema;
17 |
18 | create or replace table mytable (amount number comment 'fake amounts for testing', fruits string comment 'fake types of fruit for testing');
19 | create or replace table mytable2 like mytable;
20 |
21 | insert into mytable values (1, 'apple'), (2, 'orange'), (5, 'grape'), (7, 'cantelope'), (9, 'pineapple'), (17, 'banana'), (21, 'tangerine');
22 | insert into mytable2 values (1, 'apple'), (3, 'orange'), (5, 'grape'), (7, 'strawberry'), (10, 'pineapple'), (17, 'banana'), (22, 'raspberry');
23 |
24 | -- select * from mytable;
25 | -- select * from mytable2;
26 |
27 | create or replace procedure print_differences(TABLE1 string, TABLE2 string, FIELD1 string, FIELD2 string)
28 | returns array
29 | language python
30 | runtime_version = '3.8'
31 | packages = ('snowflake-snowpark-python', 'pandas')
32 | handler = 'print_differences'
33 | as
34 | $$
35 | import pandas as pd
36 |
37 | def print_differences(session, table1: str,table2: str,field1: str,field2: str):
38 |
39 | #read the tables into a snowpark dataframe
40 | table1 = session.table(table1)
41 | table2 = session.table(table2)
42 |
43 | #convert to pandas
44 | df1 = table1.to_pandas()
45 | df2 = table2.to_pandas()
46 |
47 |     # convert the fields of interest from each table to a list
48 | list1 = df1[field1].to_list()
49 | list2 = df2[field2].to_list()
50 |
51 | return [item for item in list1 if item not in list2]
52 | $$;
53 |
54 | call print_differences('MYTABLE2', 'MYTABLE', 'FRUITS', 'FRUITS');
55 |
56 | -- output:
57 | -- ["cantelope","tangerine"]
58 |
--------------------------------------------------------------------------------
/CI_Examples/yamllint-ci.yml:
--------------------------------------------------------------------------------
1 | # Azure CI pipeline that lints new/modified YAML files after every push to a git repository.
2 | # See the yamllint GitHub for more info: [yamllint](https://github.com/adrienverge/yamllint)
3 | name: yamllint Continuous Integration
4 |
5 | parameters:
6 | - name: jobName
7 | default: 'YAMLLintCI'
8 | - name: jobDisplay
9 | default: 'Lint .yml files with YAMLLint'
10 |
11 | trigger:
12 | branches:
13 | include:
14 | - '*'
15 | exclude:
16 | - main
17 |
18 | pool:
19 | vmImage: 'ubuntu-latest'
20 |
21 | jobs:
22 | - job: ${{ parameters.jobName}}
23 | timeoutInMinutes: 10
24 | displayName: ${{ parameters.jobDisplay }}
25 |
26 | workspace:
27 | clean: outputs
28 |
29 | steps:
30 | # Checkout repo
31 | - checkout: self
32 | fetchDepth: 0
33 | clean: true
34 |
35 | # List Pipeline directory and Build Source Version
36 | - script: |
37 | ls -R $(System.DefaultWorkingDirectory)
38 | displayName: List directory contents
39 |
40 | - script: |
41 | echo "Build.SourceVersion: $(Build.SourceVersion)"
42 | displayName: Print Build.SourceVersion
43 |
44 | # Install yamllint
45 | - script: |
46 | pip install yamllint
47 | displayName: Install yamllint
48 |
49 | # Test yamllint Installation and list all .yml files in repo
50 | - script: |
51 | yamllint --version
52 | echo "All changes in this commit:"
53 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.yml$' ||
54 | echo "No YAML files changed."
55 | displayName: Test yamllint Install, List all .yml files
56 |
57 | # Lint YAML
58 | - script: |
59 | changed=( $(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.yml$') )
60 | if [[ "${#changed[@]}" -gt 0 ]]; then
61 | failed=false
62 | for filename in "${changed[@]}"; do
63 | if [[ -f "$filename" ]]; then
64 | echo "linting $filename"
65 | yamllint "$filename" || failed=true
66 | else
67 | echo "File not found: $filename"
68 | fi
69 | done
70 | if [[ $failed == true ]]; then
71 | exit 1
72 | fi
73 | fi
74 | displayName: Lint .yml files
75 |
--------------------------------------------------------------------------------
/SQL/Snowflake_Account_Setup.sql:
--------------------------------------------------------------------------------
1 | /******************************************************************************/
2 | -- Script: Account Setup in Snowflake
3 | -- CreateBy: Martin Palkovic
4 | -- Create date: 2022-11-01
5 | -- Description: Script to set up a warehouse,
6 | -- role and user with basic privileges
7 | /******************************************************************************/
8 |
9 | /* Set session variables*/
10 | set role_name = 'my_role';
11 | set user_name = 'my_user';
12 | set wh_name = 'my_warehouse';
13 | set db_name = 'my_db';
14 |
15 | /* Create warehouse for service account */
16 | use role sysadmin;
17 | create or replace warehouse identifier($wh_name)
18 | warehouse_size = xsmall
19 | auto_suspend = 60
20 | auto_resume = true
21 | min_cluster_count = 1
22 | max_cluster_count = 5
23 | scaling_policy = standard
24 | comment = 'Warehouse for service account to query the Snowflake API';
25 |
26 | /* Create role */
27 | use role securityadmin;
28 | create or replace role identifier($role_name)
29 | comment = 'Default role for service account my_user';
30 |
31 | /* Create user */
32 | use role accountadmin;
33 | create or replace user identifier($user_name)
34 | login_name = $user_name
35 | display_name = $user_name
36 | password = '********************'
37 | must_change_password = false
38 | default_role = $role_name
39 | default_warehouse = $wh_name
40 | comment = 'Service account for application to query the Snowflake API';
41 |
42 | /* grant account permissions */
43 | grant role identifier($role_name) to user identifier($user_name);
44 | grant usage on warehouse identifier($wh_name) to role identifier($role_name);
45 | grant usage on database identifier($db_name) to role identifier($role_name);
46 | grant usage on all schemas in database identifier($db_name) to role identifier(
47 | $role_name
48 | );
49 | grant select on all tables in database identifier($db_name) to role identifier(
50 | $role_name
51 | );
52 |
53 | /* Future Grants */
54 | grant select on future tables in database identifier(
55 | $db_name
56 | ) to role identifier($role_name);
57 | grant usage on future schemas in database identifier(
58 | $db_name
59 | ) to role identifier($role_name);
60 |
61 | /* Confirm access is correct */
62 | show grants to role identifier($role_name);
63 |
64 | show grants of role identifier($role_name);
65 | show grants to user identifier($user_name);
66 |
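67 | /* Optional: verify the new service account from a terminal with SnowSQL.
68 |    The account identifier is a placeholder - substitute your own. */
69 | -- snowsql -a <account_identifier> -u my_user -r my_role -w my_warehouse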
--------------------------------------------------------------------------------
/CI_Examples/sqlfluff-ci.yml:
--------------------------------------------------------------------------------
1 | # Azure CI pipeline that lints new/modified SQL files after every push to a git repository.
2 | # See the SQLFluff GitHub for more info: [SQLFluff](https://github.com/sqlfluff/sqlfluff)
3 | name: SQLFluff Continuous Integration
4 |
5 | parameters:
6 | - name: jobName
7 | default: 'SQLFluffCI'
8 | - name: jobDisplay
9 | default: 'Lint .sql files with SQLFluff'
10 |
11 | trigger:
12 | branches:
13 | include:
14 | - '*'
15 | exclude:
16 | - main
17 |
18 | pool:
19 | vmImage: 'ubuntu-latest'
20 |
21 | jobs:
22 | - job: ${{ parameters.jobName }}
23 | timeoutInMinutes: 10
24 | displayName: ${{ parameters.jobDisplay }}
25 |
26 | workspace:
27 | clean: outputs
28 |
29 | steps:
30 | # Checkout repo
31 | - checkout: self
32 | fetchDepth: 0
33 | clean: true
34 |
35 | # List Pipeline directory and Build Source Version
36 | - script: |
37 | ls -R $(System.DefaultWorkingDirectory)
38 | displayName: List directory contents
39 |
40 | - script: |
41 | echo "Build.SourceVersion: $(Build.SourceVersion)"
42 | displayName: Print Build.SourceVersion
43 |
44 | # Install SQLFluff
45 | - script: |
46 | pip install sqlfluff
47 | displayName: Install SQLFluff
48 |
49 | # Test SQLFluff Installation and list all .sql files in repo
50 | - script: |
51 | sqlfluff --version
52 | echo "All changes in this commit:"
53 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.sql$' ||
54 | echo "No SQL files changed."
55 | displayName: Test SQLFluff Install, List all .sql files
56 |
57 | # Lint SQL
58 | - script: |
59 | changed=( $(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep '\.sql$') )
60 | if [[ "${#changed[@]}" -gt 0 ]]; then
61 | failed=false
62 | for filename in "${changed[@]}"; do
63 | if [[ -f "$filename" ]]; then
64 | echo "linting $filename"
65 | sqlfluff lint "$filename" --dialect snowflake || failed=true
66 | else
67 | echo "File not found: $filename"
68 | fi
69 | done
70 | if [[ $failed == true ]]; then
71 | exit 1
72 | fi
73 | fi
74 | displayName: Lint .sql files
75 |
--------------------------------------------------------------------------------
/Python/sql_insert_statement_from_csv.py:
--------------------------------------------------------------------------------
1 | """Generate a SQL insert statement from a csv file
2 | By: Martin Palkovic
3 | Date: 2022-03-14"""
4 |
5 | import pandas as pd
6 |
7 | # Filepath for the csv
8 | df = pd.read_csv("my_file.csv")
9 |
10 | # In my case I only wanted after row 1022
11 | df = df.iloc[1022:]
12 |
13 | # There are some weird unicode characters in the excel sheet I received,
14 | # I removed them with this for loop:
15 | for column in df.columns:
16 | df[column] = df[column].str.split().str.join(" ")
17 |
18 |
19 | # Define Function
20 | def sql_insert_statement_from_dataframe(source, target):
21 | """This function generates a SQL insert statement"""
22 | for index, row in source.iterrows():
23 | # full insert statement:
24 | print(
25 | "insert into "
26 | + target
27 | + "("
28 | + str(", ".join(source.columns))
29 | + ") values "
30 | + str(tuple(row.values))
31 | + ";"
32 | )
33 |
34 |
35 | # Execute Function
36 | sql_insert_statement_from_dataframe(df, "database.schema.table")
37 | """
38 | #Full insert statement:
39 | insert into database.schema.table(code,
40 | expense_type,
41 | acct,
42 | company)
43 | values ('02113',
44 | 'Accounts Receivable, Other',
45 | '35400',
46 | 'An_Awesome_Company');
47 |
48 |
49 | insert into database.schema.table(code,
50 | expense_type,
51 | acct,
52 | company)
53 | values ('02114',
54 | 'Accounts Payable',
55 | '36500',
56 | 'A_Different_Company');
57 | insert into database.schema.table(code,
58 | expense_type,
59 | acct,
60 | company) values ('02115',
61 | 'Donations',
62 | '12220',
63 | 'Another_Company');
64 |
65 | #just the values:
66 | ('02113', 'Accounts Receivable, Other', '35400', 'An_Awesome_Company'),
67 | ('02114', 'Accounts Payable', '36500', 'A_Different_Company'),
68 | ('02115', 'Donations', '12220', 'Another_Company'),
69 | """
70 |
--------------------------------------------------------------------------------
/Python/sql_style_join_csv.py:
--------------------------------------------------------------------------------
1 | """Performing a SQL style join on two csv files
2 | By: Martin Palkovic
3 | Date: 2022-02-11
4 |
5 | Description: The inventory team is producing Excel sheets on a weekly basis
6 | and would like to move comments from one sheet to another. Inventory goes out,
7 | new inventory comes in, and they want the comments transferred on items that are
8 | still in stock. I wasn't sure how to do this in SQL without making new tables
9 | in the database and decided to use Python.
10 |
11 | Note that this program is specific to a workflow I do for the Inventory team,
12 | and you can't really make a one-size-fits-all program for this task, since you
13 | need to specify which fields you want to join. But hopefully it will give you
14 | an idea of how to do this if you encounter a similar task.
15 | """
16 |
17 | import os
18 | import pandas as pd
19 |
20 | old_csv = input("Enter filepath for the old csv: ")
21 | while not os.path.isfile(old_csv):
22 | print("Error: that is not a valid file, try again...")
23 | old_csv = input("Enter filepath for the old csv: ")
24 |
25 | new_csv = input("Enter filepath for the new csv: ")
26 | while not os.path.isfile(new_csv):
27 | print("Error: that is not a valid file, try again...")
28 | new_csv = input("Enter filepath for the new csv: ")
29 |
30 | try:
31 | df_old = pd.read_csv(old_csv, low_memory=False)
32 | df_new = pd.read_csv(new_csv, low_memory=False)
33 |
34 | # makes all column names lower case, ensuring they meet the join criteria
35 | # i.e if the user capitalizes one of the column names one week but not the next,
36 | # it doesn't matter with this block of code
37 | df_old.columns = map(str.lower, df_old.columns)
38 | df_new.columns = map(str.lower, df_new.columns)
39 |
40 | # removes any whitespace from the column names
41 | df_old = df_old.rename(columns=lambda x: x.strip())
42 | df_new = df_new.rename(columns=lambda x: x.strip())
43 |
44 | df_old = df_old.loc[:, df_old.columns.isin(["columns_you_want_to_keep"])]
45 | df_old = df_old.reset_index(drop=True)
46 |
47 | df_new = df_new.loc[:, ~df_new.columns.isin(["columns_you_want_to_keep"])]
48 | df_new = df_new.reset_index(drop=True)
49 |
50 | df = pd.merge(
51 | df_new,
52 | df_old.drop_duplicates(subset=["pallet"]),
53 | how="left",
54 | on=["pallet"],
55 | suffixes=("", "_drop"),
56 | )
57 |
58 | df = df.drop([c for c in df.columns if "drop" in c], axis=1)
59 | df.columns = map(str.capitalize, df.columns)
60 |
61 | file_name = input("Enter your file name (don't add the .csv extension): ")
62 | df.to_csv("{}.csv".format(file_name))
63 |
64 | except Exception as exception:
65 | print(f"An exception occurred: {exception}")
66 |
--------------------------------------------------------------------------------
/SQL/Snowflake_Data_Pipeline_From_Internal_Stage.sql:
--------------------------------------------------------------------------------
1 | /**********************************************************************************************************/
2 | -- Proc: Basic data pipeline from Snowflake internal stage
3 | -- CreateBy: Martin Palkovic
4 | -- Create date: 2022-10-31
5 | -- Description: Basic workflow for building the latter portions of a data pipeline within Snowflake.
6 | -- Note that this code assumes you have loaded a csv file into a Snowflake internal stage via a
7 | -- 3rd party or open source integration tool
8 | /***********************************************************************************************************/
9 |
10 | /* initialize environment */
11 | use role sysadmin;
12 | use warehouse reporting_wh;
13 | use database my_dev_database;
14 | use schema my_schema;
15 |
16 | /* Provides information for your third party/open source integration tool */
17 | desc table dimcustomer;
18 |
19 | /* create stage, if needed */
20 | show stages;
21 | -- create or replace stage my_stage;
22 | list @my_stage;
23 |
24 | /* create file format */
25 | create or replace file format my_file_format
26 | type = 'CSV'
27 | field_delimiter = ','
28 | replace_invalid_characters = true
29 | null_if = ('');
30 |
31 | /* create stored procedure */
32 | create or replace procedure dim_customer_pipeline()
33 | returns varchar
34 | language sql
35 | execute as caller
36 | as
37 | $$
38 | begin
39 | truncate table MY_SCHEMA.DIMCUSTOMER;
40 |
41 | copy into
42 | MY_SCHEMA.DIMCUSTOMER
43 | from
44 | ( select t1.$1
45 | ,t1.$2
46 | ,t1.$3
47 | ,nullif(t1.$4, '')
48 | from @MY_SCHEMA.MY_STAGE/Dim_Customer.csv.gz (file_format => 'my_file_format') t1
49 | )
50 | file_format=my_file_format ON_ERROR='SKIP_FILE';
51 |
52 | remove @MY_SCHEMA.MY_STAGE pattern='.*Customer.*';
53 |
54 | return 'Successfully loaded data into MY_DEV_DATABASE.MY_SCHEMA.DIMCUSTOMER';
55 | end;
56 | $$;
57 |
58 | /* create task */
59 | create or replace task dim_customer
60 | warehouse = load_wh
61 | schedule = 'using cron 30 9 * * * UTC'
62 | comment
63 | = 'Truncates MY_DEV_DATABASE.MY_SCHEMA.DIMCUSTOMER, loads all rows of the dimcustomer table from Azure SQL and deletes the csv from the staging area'
64 | as
65 | call dim_customer_pipeline();
66 |
67 | /* grant execute task privileges to role sysadmin */
68 | use role accountadmin;
69 | grant execute task on account to role sysadmin;
70 |
71 | /* tasks are created in a suspended state by default, you must 'resume' them to schedule them */
72 | use role sysadmin;
73 | alter task dim_customer resume;
74 |
75 | /* confirm that the tasks are working */
76 | show tasks;
77 | select * from table(information_schema.task_history()) order by scheduled_time;
78 |
--------------------------------------------------------------------------------
/Python/Snowpark_Backload_API_Data.py:
--------------------------------------------------------------------------------
1 | # **********************************************************************#
2 | # Title: Backload API data using Snowpark Python
3 | # By: Martin Palkovic
4 | # Date: 2022-11-18
5 | # Description: Here is another Snowpark example, where you can loop through
6 | # an API call and insert the JSON response for each day's worth of data
7 | # into a VARIANT table in Snowflake
8 | # *********************************************************************#
9 |
10 | # Import modules
11 | import os
12 | import json
13 | import requests
14 |
15 | from datetime import date, timedelta
16 | from snowflake.snowpark import Session
17 |
18 | from dotenv import load_dotenv
19 |
20 | load_dotenv()
21 |
22 | # Establish Snowflake Connection using Snowpark
23 | account = os.getenv("SNOWFLAKE_ACCT")
24 | user = os.getenv("SNOWFLAKE_USER")
25 | password = os.getenv("SNOWFLAKE_PASSWORD")
26 | role = os.getenv("SNOWFLAKE_ROLE")
27 | # role = "SYSADMIN"  # uncomment to override the role pulled from the environment
28 | warehouse = "MY_WH"
29 | database = "DEV"
30 | schema = "MY_SCHEMA"
31 | target_table = "MY_TABLE"
32 |
33 | api_key = os.getenv("MY_API_KEY")
34 |
35 |
36 | def snowpark_cnxn(account, user, password, role, warehouse, database, schema):
37 | connection_parameters = {
38 | "account": account,
39 | "user": user,
40 | "password": password,
41 | "role": role,
42 | "warehouse": warehouse,
43 | "database": database,
44 | "schema": schema,
45 | }
46 | session = Session.builder.configs(connection_parameters).create()
47 | return session
48 |
49 |
50 | session = snowpark_cnxn(account, user, password, role, warehouse, database, schema)
51 |
52 | print(
53 | session.sql(
54 | "SELECT CURRENT_WAREHOUSE(), CURRENT_DATABASE(), CURRENT_SCHEMA()"
55 | ).collect()
56 | )
57 |
58 | # API variables
59 | headers = {"APIKey": f"{api_key}"}
60 |
61 |
62 | # Define a function so we can loop over a date range
63 | def daterange(start_date, end_date):
64 | for n in range(int((end_date - start_date).days)):
65 | yield start_date + timedelta(n)
66 |
67 |
68 | start_date = date(2019, 1, 1)
69 | end_date = date(2022, 11, 18)
70 |
71 | # Loop through 4 years worth of API data, insert into Snowflake VARIANT table
72 | for single_date in daterange(start_date, end_date):
73 | url = f"https://api.mywebsite.com/api/data?startDate={single_date}&endDate={single_date}"
74 | response = requests.request("GET", url, headers=headers)
75 |
76 | formatted_json = json.loads(response.text)
77 | formatted_json = json.dumps(formatted_json, indent=4)
78 |
79 | # insert to Snowflake
80 | session.sql(
81 | f"""INSERT INTO {target_table} (JSON_DATA, INSERT_DATE)
82 | SELECT PARSE_JSON('{formatted_json}'),
83 | CURRENT_TIMESTAMP();"""
84 | ).collect()
85 |
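86 | # Note: the target VARIANT table is assumed to already exist. If it doesn't, something
87 | # like the following would create it (the TIMESTAMP_LTZ type is an assumption):
88 | # session.sql(
89 | #     f"CREATE TABLE IF NOT EXISTS {target_table} (JSON_DATA VARIANT, INSERT_DATE TIMESTAMP_LTZ)"
90 | # ).collect()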
--------------------------------------------------------------------------------
/.sqlfluff:
--------------------------------------------------------------------------------
1 | [sqlfluff]
2 |
3 | # Supported dialects https://docs.sqlfluff.com/en/stable/dialects.html
4 | # Or run 'sqlfluff dialects'
5 | dialect = snowflake
6 |
7 | # One of [raw|jinja|python|placeholder]
8 | templater = jinja
9 |
10 | # Comma separated list of rules to exclude, or None
11 | # See https://docs.sqlfluff.com/en/stable/configuration.html#enabling-and-disabling-rules
12 | # AM04 (ambiguous.column_count) and ST06 (structure.column_order) are
13 | # two of the more controversial rules included to illustrate usage.
14 | exclude_rules = ambiguous.column_count, structure.column_order
15 | warnings = LT05
16 |
17 | # The standard max_line_length is 80 in line with the convention of
18 | # other tools and several style guides. Many projects however prefer
19 | # something a little longer.
20 | # Set to zero or negative to disable checks.
21 | max_line_length = 120
22 |
23 | # CPU processes to use while linting.
24 | # The default is "single threaded" to allow easy debugging, but this
25 | # is often undesirable at scale.
26 | # If positive, just implies number of processes.
27 | # If negative or zero, implies number_of_cpus - specified_number.
28 | # e.g. -1 means use all processors but one. 0 means all cpus.
29 | processes = -1
30 |
31 | # If using the dbt templater, we recommend setting the project dir.
32 | ; [sqlfluff:templater:dbt]
33 | ; project_dir = ./
34 |
35 | [sqlfluff:indentation]
36 | # While implicit indents are not enabled by default. Many of the
37 | # SQLFluff maintainers do use them in their projects.
38 | allow_implicit_indents = true
39 |
40 | # The default configuration for aliasing rules is "consistent"
41 | # which will auto-detect the setting from the rest of the file. This
42 | # is less desirable in a new project and you may find this (slightly
43 | # more strict) setting more useful.
44 | [sqlfluff:rules:aliasing.table]
45 | aliasing = explicit
46 | [sqlfluff:rules:aliasing.column]
47 | aliasing = explicit
48 | [sqlfluff:rules:aliasing.length]
49 | min_alias_length = 3
50 |
51 | # The default configuration for capitalisation rules is "consistent"
52 | # which will auto-detect the setting from the rest of the file. This
53 | # is less desirable in a new project and you may find this (slightly
54 | # more strict) setting more useful.
55 | # Typically we find users rely on syntax highlighting rather than
56 | # capitalisation to distinguish between keywords and identifiers.
57 | # Clearly, if your organisation has already settled on uppercase
58 | # formatting for any of these syntax elements then set them to "upper".
59 | # See https://stackoverflow.com/questions/608196/why-should-i-capitalize-my-sql-keywords-is-there-a-good-reason
60 | [sqlfluff:rules:capitalisation.keywords]
61 | capitalisation_policy = lower
62 | [sqlfluff:rules:capitalisation.identifiers]
63 | capitalisation_policy = lower
64 | [sqlfluff:rules:capitalisation.functions]
65 | extended_capitalisation_policy = lower
66 | [sqlfluff:rules:capitalisation.literals]
67 | capitalisation_policy = lower
68 | [sqlfluff:rules:capitalisation.types]
69 | extended_capitalisation_policy = lower
70 |
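71 | # Example local usage (run from the repo root); this config supplies the dialect
72 | # and rule settings automatically:
73 | #   sqlfluff lint SQL/
74 | #   sqlfluff fix SQL/Snowflake_Find_Duplicates.sql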
--------------------------------------------------------------------------------
/SQL/Snowflake_ForLoop_GrantPermissions.sql:
--------------------------------------------------------------------------------
1 | /* ######################### */
2 | /* Script: Revoke/Grant permissions for reader accounts in Snowflake */
3 | /* Author: Martin Palkovic */
4 | /* Date: 2023-02-09 */
5 | /* Description: This script loops through query results from the information_schema and grants privileges only to tables */
6 | /* that have > 0 rows. This script was inspired by a database containing ~2,500 tables, 400 of which contained >= 1 row of data. */
7 | /* This script revokes all privileges and then grants select on tables with > 0 rows. Modify your cursor queries as needed to provide a */
8 | /* list of tables, schemas etc. to loop over. */
9 |
10 | -- Set session variables
11 | set db = 'my_db';
12 | set rl = 'accountadmin';
13 | set wh = 'my_wh';
14 | set role_var = '"My_Role"'; --the double quotes are required as this is a case sensitive string value!
15 | set share_name = 'ab12345.my_secure_share';
16 |
17 | -- Schemas to exclude. Set as desired, add as many as you need
18 | set exc1 = 'information_schema';
19 | set exc2 = 'my_schema1';
20 |
21 | use database identifier($db);
22 | use role identifier($rl);
23 | use warehouse identifier($wh);
24 |
25 | /* SHARE LEVEL - EXECUTED IN MAIN ACCOUNT */
26 | -- Revoke privileges
27 | declare
28 | iter_schema cursor for (select * from information_schema.schemata where schema_name not in ($exc1, $exc2));
29 | begin
30 | for s in iter_schema do
31 | execute immediate 'revoke select on all tables in schema ' || s.schema_name || ' from share identifier($share_name)';
32 | end for;
33 | return 'Permissions successfully revoked from secure share!';
34 | end;
35 |
36 | -- Add to share all tables that have > 0 rows
37 | declare
38 | iter_tables cursor for (select * from information_schema.tables
39 | where row_count > 0 and table_schema not in ($exc1, $exc2));
40 | begin
41 | for t in iter_tables do
42 | execute immediate 'grant select on table ' || t.table_schema || '.' || t.table_name || ' to share identifier($share_name)';
43 | end for;
44 | return 'Permissions successfully granted to secure share!';
45 | end;
46 |
47 | /* SHARE LEVEL - EXECUTED IN READER ACCOUNT BY ADMIN */
48 | -- Revoke privileges
49 | declare
50 | iter_schema cursor for (select * from information_schema.schemata where schema_name not in ($exc1, $exc2));
51 | begin
52 | for s in iter_schema do
53 | execute immediate 'revoke select on all tables in schema ' || s.schema_name || ' from role identifier($role_var)';
54 | end for;
55 | return 'Permissions successfully revoked!';
56 | end;
57 |
58 | -- Grant only permissions on tables that have > 0 rows
59 | declare
60 | iter_tables cursor for (select * from information_schema.tables
61 | where row_count > 0 and table_schema not in ($exc1, $exc2));
62 | begin
63 | for t in iter_tables do
64 | execute immediate 'grant select on table ' || t.table_schema || '.' || t.table_name || ' to role identifier($role_var)';
65 | end for;
66 | return 'Permissions successfully granted!';
67 | end;
68 |
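69 | /* Optional checks after running the blocks above - confirm the grants landed as expected */
70 | -- show grants to role identifier($role_var);
71 | -- show grants to share identifier($share_name);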
--------------------------------------------------------------------------------
/SQL/Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql:
--------------------------------------------------------------------------------
1 | /**********************************************************************/
2 | -- Title: Azure Blob Snowpipe setup
3 | -- By: Martin Palkovic
4 | -- Date: 2022-11-09
5 | -- Description: Snowflake set up of an auto-ingest snowpipe from Azure Blob Storage to Snowflake table.
6 | -- Documentation: https://docs.snowflake.com/en/user-guide/data-load-snowpipe-auto-azure.html
7 | /*********************************************************************/
8 |
9 | /* Set session variables */
10 | set session_role = 'sysadmin';
11 | set session_warehouse = 'reporting_wh';
12 | set session_database = 'dev';
13 | set session_table = 'my_table';
14 | set project_name = 'MY_PROJECT';
15 | set storage_loc = 'azure://your_blob_account_here.blob.core.windows.net/my_project';
16 | set tenant_id = 'a123b4c5-1234-123a-a12b-1a23b45678c9'; -- example tenant id from Snowflake docs
17 |
18 | /* Initialize Environment */
19 | use role identifier($session_role);
20 | use warehouse identifier($session_warehouse);
21 | use database identifier($session_database);
22 |
23 | create schema if not exists identifier($project_name);
24 | use schema identifier($project_name);
25 |
26 | /* Create storage integration for Snowflake to connect to Azure Blob.
27 | See the 'Configuring Secure Access to Cloud Storage' section in the url above */
28 | create storage integration if not exists identifier($project_name)
29 | type = external_stage
30 | storage_provider = 'AZURE'
31 | enabled = true
32 | azure_tenant_id = $tenant_id
33 | storage_allowed_locations = ($storage_loc)
34 | comment = 'Storage Integration for moving my_project data into Snowflake';
35 |
36 | /* The output of this command is needed for setup in the Azure Portal */
37 | desc storage integration identifier($project_name);
38 |
39 | /* Create notification integration to connect Snowflake to Azure Event Grid.
40 | See Step 2 of 'Configuring Automation With Azure Event Grid'*/
41 | create notification integration if not exists identifier($project_name)
42 | enabled = true
43 | type = queue
44 | notification_provider = azure_storage_queue
45 | azure_storage_queue_primary_uri = ''
46 | azure_tenant_id = $tenant_id
47 | comment = 'Notification Integration for moving my_project data into Snowflake';
48 |
49 | /* The output of this command is needed for setup in the Azure Portal */
50 | desc notification integration identifier($project_name);
51 |
52 | /* Create a Snowflake stage */
53 | create stage if not exists identifier($project_name)
54 | url = $storage_loc
55 | storage_integration = $project_name
56 | comment = 'Staging area for my_project data, between Azure Blob and Snowflake';
57 |
58 | -- show stages;
59 |
60 | /* Create a Snowpipe that will be notified via Azure Event Grid
61 | when a file is added to the Azure Blob instance specified above */
62 | create pipe if not exists identifier($project_name)
63 | auto_ingest = true
64 | integration = $project_name
65 | as
66 | copy into $session_table
67 | from @$project_name
68 | file_format = (type = 'csv')
69 | comment = 'Auto Ingest Snowpipe for moving data from Azure Blob to Snowflake. When a file is added to
70 | Azure Blob, this Snowpipe will automatically trigger';
71 |
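72 | /* Optional check: confirm the pipe exists and is listening for event notifications.
73 |    system$pipe_status takes the pipe name, which here is the value of $project_name. */
74 | -- show pipes;
75 | -- select system$pipe_status('MY_PROJECT');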
--------------------------------------------------------------------------------
/Docker/Populate_SQL_Server_Docker_Container.py:
--------------------------------------------------------------------------------
1 | """Title: Populate SQL Server Docker Container with production data
2 | By: Martin Palkovic
3 | Date: 2022-07-25
4 | Description: Recently I had a need for a small, lightweight SQL Server development
5 | environment where I could play around with data and not impact anything in production.
6 | This python script was my solution - it iteratively creates and populates tables
7 | in a test database that resides within a docker container.
8 |
9 | Due to our Windows auth at work, I couldn't get this to run in a docker-compose
10 | file (i.e within the container). The solution is to run docker-compose to initialize
11 | SQL Server in the container, and then run this script locally
12 |
13 | Exec in shell:
14 | cd your/file/location
15 | docker-compose up
16 | python3 Populate_SQL_Server_Docker_Container.py
17 | """
18 |
19 | #import modules
20 | import pandas as pd
21 | from sqlalchemy.engine import URL
22 | from sqlalchemy import create_engine
23 |
24 | #server credentials - prod
25 | prod_server = 'prod_server'
26 | prod_db = 'prod_db'
27 |
28 | #server credentials - docker
29 | docker_server = 'localhost'
30 | docker_db = 'test_db'
31 | username = 'sa'
32 | password = 'Your-Strong!Password@Here%'
33 | #-------------------------
34 | driver = 'SQL Server'
35 | schema = 'dbo'
36 |
37 | def sqlalchemy_cnxn(driver, server, db):
38 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}"
39 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection})
40 | engine = create_engine(url)
41 | return engine
42 |
43 | # SQLAlchemy for Prod
44 | prod_engine = sqlalchemy_cnxn(driver, prod_server, prod_db)
45 |
46 | # SQLAlchemy for Docker
47 | docker_engine = sqlalchemy_cnxn(driver, docker_server, docker_db)
48 |
49 | docker_engine.execute('''
50 | if not exists (select 1 from sys.databases where name = N'test_db')
51 | create database test_db;
52 | '''
53 | )
54 |
55 | """create a list of each table in the database,
56 | and remove table names from the list that contain numbers
57 | (i.e duplicates/backups with dates on the end)
58 | If you only want certain tables, you can manipulate this list however you like.
59 | Only table names on this list will be queried from your prod database in the
60 | for loop below"""
61 | prod_tables = prod_engine.table_names()
62 | prod_tables = [i for i in prod_tables if not any(char.isdigit() for char in i)]
63 |
64 | # This block is needed to connect to the db now that we have created it
65 | docker_engine = sqlalchemy_cnxn(driver, docker_server, docker_db)
66 |
67 | """iterate over each table to populate the Docker container
68 | Note that this takes ~1 min per 50 tables"""
69 | for table in prod_tables:
70 | try:
71 | #read
72 | query = f'select top 1000 * from {prod_db}.{schema}.{table}'
73 | results = prod_engine.execute(query)
74 | df_sql = pd.read_sql(query, prod_engine)
75 |
76 | #write
77 | df_sql.to_sql(table, schema=schema,
78 | con=docker_engine, chunksize=1,
79 | index=False, if_exists='replace')
80 | except Exception:
81 | print(f'failed to insert {table} to docker container')
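82 |
83 | # Note: engine.execute() and engine.table_names() above rely on the SQLAlchemy 1.x API;
84 | # both were removed in SQLAlchemy 2.0, so pin sqlalchemy<2.0 when running this as-is.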
--------------------------------------------------------------------------------
/SQL/Snowflake_Basic_CDC_Pipeline_Using_Streams_Tasks.sql:
--------------------------------------------------------------------------------
1 | /******************************************************************************/
2 | -- Script: Basic CDC Pipeline using Streams and Tasks in Snowflake
3 | -- CreateBy: Martin Palkovic
4 | -- Create date: 2022-11-01
5 | -- Description: Basic implementation of a Streams/Tasks workflow in Snowflake.
6 | -- Streams detect DML changes to one table and will update another table based
7 | -- on those changes
8 | /******************************************************************************/
9 |
10 | /* Set session variables */
11 | set role_name = 'sysadmin';
12 | set wh = 'my_wh';
13 | set db = 'my_db';
14 | set schema_name = 'my_schema';
15 | set dest_table = 'my_table';
16 | set stream_name = 'my_stream';
17 | set source_table = 'staging_db.staging_schema.staging_table';
18 | set proc_name = 'my_procedure';
19 | set task_name = 'push_my_table';
20 |
21 | /* Initialize Environment */
22 | use role identifier($role_name);
23 | use warehouse identifier($wh);
24 |
25 | create database if not exists identifier($db);
26 | create schema if not exists identifier($schema_name);
27 |
28 | use database identifier($db);
29 | use schema identifier($schema_name);
30 |
31 | create table if not exists identifier($dest_table)
32 | comment = 'JSON data from API, streaming from the staging database'
33 | clone identifier($source_table);
34 |
35 | create stream if not exists identifier($stream_name) on table identifier($source_table)
36 | comment = 'CDC stream from staging table to prod table';
37 |
38 | /* quick diagnostic check */
39 | show streams;
40 | select * from identifier($stream_name);
41 |
42 | create or replace procedure identifier($proc_name)()
43 | returns varchar
44 | language sql
45 | execute as owner
46 | as
47 | $$
48 | begin
49 | merge into my_table DEST using (
50 | select * from my_stream
51 | qualify row_number() over (
52 | partition by json_data:ID order by insert_date) = 1
53 | ) SOURCE
54 | on DEST.json_data:ID = SOURCE.json_data:ID
55 | when matched and metadata$action = 'INSERT' then
56 | update set DEST.json_data = SOURCE.json_data,
57 | DEST.insert_date = current_timestamp()
58 | when not matched and metadata$action = 'INSERT' then
59 | insert (DEST.json_data, DEST.insert_date)
60 | values(SOURCE.json_data, current_timestamp());
61 | return 'CDC records successfully inserted';
62 | end;
63 | $$;
64 |
65 | create or replace task identifier($task_name)
66 | warehouse = LOAD_WH
67 | schedule = '1 minute'
68 | comment = 'Change data capture task that pulls over new data once a minute'
69 | when system$stream_has_data ('my_stream')
70 | as
71 | call my_procedure();
72 |
73 | /* grant execute task privileges to role sysadmin */
74 | set role_name = 'accountadmin';
75 | use role identifier($role_name);
76 | grant execute task on account to role identifier($role_name);
77 |
78 | /* tasks are created in a suspended state by default, you must 'resume' them to schedule them */
79 | set role_name = 'sysadmin';
80 | use role identifier($role_name);
81 | alter task identifier($task_name) resume;
82 |
83 | select * from identifier($dest_table);
84 |
85 | show tasks;
86 | select * from table(information_schema.task_history()) order by SCHEDULED_TIME;
87 |
--------------------------------------------------------------------------------
/SnowSQL_CICD/deploy.yml:
--------------------------------------------------------------------------------
1 | parameters:
2 | - name: jobName
3 | default: 'SnowflakeDeploy'
4 | - name: jobDisplay
5 | default: 'Deploy Snowflake Objects'
6 | - name: databaseName
7 | default: ''
8 | - name: vmImage
9 | default: 'ubuntu-latest'
10 | - name: environmentName
11 | default: 'DEV'
12 |
13 | jobs:
14 | - deployment: ${{ parameters.jobName }}
15 | displayName: ${{ parameters.jobDisplay }}
16 | timeoutInMinutes: 10
17 | pool:
18 | vmImage: ${{ parameters.vmImage }}
19 | environment: ${{ parameters.environmentName }}
20 | workspace:
21 | clean: outputs
22 |
23 | strategy:
24 | runOnce:
25 | deploy:
26 | steps:
27 | # Checkout repo
28 | - checkout: self
29 | fetchDepth: 10
30 | clean: true
31 |
32 | # Download and Install SnowSQL CLI
33 | - script: |
34 | curl -O https://sfc-repo.snowflakecomputing.com/snowsql/bootstrap/1.2/linux_x86_64/snowsql-1.2.9-linux_x86_64.bash
35 | SNOWSQL_DEST=~/snowflake SNOWSQL_LOGIN_SHELL=~/.profile bash snowsql-1.2.9-linux_x86_64.bash
36 | name: SnowSQLSetup
37 | displayName: Download and Install SnowSQL
38 |
39 | # Test SnowSQL Installation
40 | - script: ~/snowflake/snowsql -v
41 | name: TestSnowSQL
42 | displayName: Test SnowSQL Installation
43 |
44 | - script: |
45 | echo "All changes in this commit:"
46 | git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion)
47 | name: detectingChanges
48 | displayName: 'Detecting changes'
49 |
50 | # Confirm Snowflake is properly connected
51 | - script: |
52 | # Test SnowSQL connection to our Snowflake instance
53 | ~/snowflake/snowsql -q "select current_account(), current_user(), current_role(), current_warehouse()"
54 |
55 | # Confirm that the pipeline is finding the changed SQL files
56 | files=$(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep \.sql)
57 |
58 | echo "Changed files:"
59 | echo "$files"
60 | env:
61 | SNOWSQL_ACCOUNT: $(SNOWSQL_ACCOUNT)
62 | SNOWSQL_USER: $(SNOWSQL_USER)
63 | SNOWSQL_PWD: $(SNOWSQL_PWD)
64 | SNOWSQL_ROLE: $(SNOWSQL_ROLE)
65 | name: TestSnowSQLConnection
66 | displayName: Test Snowflake Connection
67 |
68 | # Deploy code to Snowflake
69 | - script: |
70 | files=$(git diff-tree --no-commit-id --name-only -r $(Build.SourceVersion) | grep \.sql)
71 | for file in $files; do
72 | echo "Deploying $file"
73 | ~/snowflake/snowsql -d ${{ parameters.databaseName }} -f $file
74 | done
75 | env:
76 | SNOWSQL_ACCOUNT: $(SNOWSQL_ACCOUNT)
77 | SNOWSQL_USER: $(SNOWSQL_USER)
78 | SNOWSQL_PWD: $(SNOWSQL_PWD)
79 | SNOWSQL_ROLE: $(SNOWSQL_ROLE)
80 | name: Deploy
81 | displayName: Deploy code to Snowflake
82 |
--------------------------------------------------------------------------------
/Python/determine_sql_field_length.py:
--------------------------------------------------------------------------------
1 | """Determing the maximum Length of a field for database table design
2 | By: Martin Palkovic
3 | Date: 2022-02-04
4 |
5 | When building ETL/Integration jobs to Snowflake (or building any SQL table),
6 | you need to designate how many characters are allowed in a field. I like to use
7 | Python to quantitatively answer this question rather than manually counting or
8 | guessing how many characters to allow in a varchar field """
9 |
10 | #import modules
11 | import pyodbc
12 | import pandas as pd
13 |
14 | #set all rows and columns visible
15 | #pd.set_option('display.max_columns', None)
16 | #pd.set_option('display.max_rows', None)
17 |
18 | #server credentials
19 | server = 'server'
20 | database = 'database'
21 |
22 | #sql connection
23 | cnxn = pyodbc.connect(
24 | Trusted_Connection= 'Yes',
25 | Driver= '{SQL Server}',
26 | Server= server,
27 | Database= database
28 | )
29 | cursor = cnxn.cursor()
30 |
31 | """stick your query inside the triple quotes"""
32 |
33 | query = """SELECT * FROM """
34 |
35 | #load query to dataframe
36 | df_sql = pd.read_sql(query, cnxn)
37 | df_sql.head()
38 |
39 | """Example"""
40 | #Field of Interest
41 | foi = 'Item_Key'
42 | print('{} maximum record length ='.format(foi),
43 | max(df_sql[foi].astype(str).map(len)), 'characters')
44 | # Output: Item_Key maximum record length = 19 characters
45 |
46 | #Or run a for loop to get values for every column:
47 | for c in df_sql.columns:
48 | print('{} maximum record length ='.format(c),
49 | max(df_sql[c].astype(str).map(len)), 'characters',
50 | 'data type = {}'.format(df_sql[c].dtype))
51 |
52 | #object == varchar
53 | """
54 | Company maximum record length = 18 characters , data type = object
55 | Company_Key maximum record length = 4 characters , data type = object
56 | Site_Key maximum record length = 4 characters , data type = object
57 | Item_Key maximum record length = 19 characters , data type = object
58 | Item_Description maximum record length = 100 characters , data type = object
59 | Species maximum record length = 15 characters , data type = object
60 | Standard_Cost maximum record length = 8 characters , data type = float64
61 | Current_Cost maximum record length = 8 characters , data type = float64
62 | Category maximum record length = 16 characters , data type = object
63 | Sub_Category maximum record length = 22 characters , data type = object
64 | Size maximum record length = 8 characters , data type = object
65 | Grade maximum record length = 7 characters , data type = object
66 | Country_Of_Origin maximum record length = 15 characters , data type = object
67 | Pallet maximum record length = 10 characters , data type = object
68 | Bin maximum record length = 15 characters , data type = object
69 | Order_Allocation maximum record length = 15 characters , data type = object
70 | Production_Date maximum record length = 10 characters , data type = datetime64[ns]
71 | Production_Age maximum record length = 4 characters , data type = int64
72 | Lot_Date maximum record length = 10 characters , data type = datetime64[ns]
73 | Lot_Age maximum record length = 7 characters , data type = float64
74 | Weight maximum record length = 18 characters , data type = float64
75 | Cases maximum record length = 9 characters , data type = float64
76 | """
--------------------------------------------------------------------------------
/Python/read_sql_server_write_snowflake.py:
--------------------------------------------------------------------------------
1 | """Script to read data from SQL Server and write it to Snowflake
2 | By: Martin Palkovic
3 | Date: 2022-09-14
4 | Description: For a work task, I needed to add some historical exchange rate data
5 | to Snowflake for analytical reporting. This data existed on SQL server, so I wrote this
6 | Python script to read the data from SQL Server, transform it, and load it into
7 | Snowflake. I've modified this as a minimum reproducible example for the purposes of my
8 | project portfolio.
9 | """
10 |
11 | # Step 1: Read data from SQL Server
12 |
13 | # import modules
14 | import os
15 | import pyodbc
16 | import pandas as pd
17 |
18 | from snowflake import connector
19 | from dotenv import load_dotenv
20 | load_dotenv()
21 |
22 | # set all rows and columns visible
23 | # pd.set_option('display.max_columns', None)
24 | # pd.set_option('display.max_rows', None)
25 |
26 | # server credentials
27 | server = "my_server"
28 | database = "my_database"
29 |
30 | # sql connection
31 | cnxn = pyodbc.connect(
32 | Trusted_Connection="Yes", Driver="{SQL Server}", Server=server, Database=database
33 | )
34 | cursor = cnxn.cursor()
35 |
36 | # stick your query inside the triple quotes
37 | query = """select * from DATABASE.SCHEMA.EXCHANGERATES
38 | where EXCHDATE > '2021-09-03' and EXCHDATE < '2021-09-09'
39 | order by EXCHDATE asc"""
40 |
41 | # load query to dataframe
42 | df_fx = pd.read_sql(query, cnxn)
43 | print(df_fx.dtypes)
44 |
45 | # --------------------------------------------------------
46 |
47 | # Step 2: Create a dataframe that matches the Snowflake table we are inserting to
48 | df_sf = pd.DataFrame()
49 |
50 | # Create the from and to currency columns
51 | df_sf[["FROM_CURRENCY", "TO_CURRENCY"]] = df_fx["EXCHANGE_ID"].str.split(
52 | "-", 1, expand=True
53 | )
54 | df_sf = df_sf[
55 | df_sf["TO_CURRENCY"].str.contains("AVG") is False
56 | ] # drops rows that show avg - there are some GBP AVG
57 |
58 | # Create the start and stop date columns
59 | df_sf["EFFECTIVE_START"] = df_fx["EXCHDATE"].dt.strftime("%Y-%m-%d %H:%m:%s.%S")
60 | df_sf["EFFECTIVE_STOP"] = (
61 | df_fx["EXCHDATE"] + pd.DateOffset(days=7, hours=23, minutes=59)
62 | ).dt.strftime("%Y-%m-%d %H:%m:%s.%S")
63 |
64 | # Exchange Rate
65 | df_sf["RATE"] = df_fx["XCHGRATE"]
66 |
67 | # Get current datetime
68 | df_sf["STAGE_DATE"] = pd.Timestamp.now()
69 |
70 | # strip all whitespace from every field
71 | df_sf = df_sf.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
72 |
73 | # diagnostic check...number of rows, data types etc.
74 | print("Number of rows:", len(df_sf))
75 | print(df_sf.dtypes)
76 | # print(df_sf.head())
77 | df_sf.to_csv("FXRates.csv", header=False, index=False)
78 |
79 | # ------------------------------------------------------------------
80 | # Step 3: Write data to Snowflake
81 | # Establish connection to Cooke Snowflake
82 | cnxn = connector.connect(
83 | user=os.getenv("SNOWFLAKE_USER"),
84 | password=os.getenv("SNOWFLAKE_PASSWORD"),
85 | account=os.getenv("SNOWFLAKE_ACCT"),
86 | role=os.getenv("SNOWFLAKE_ROLE"),
87 | warehouse="REPORTING_WH",
88 | )
89 | # assign csv to variable
90 | csv = r"\FXRates.csv.csv"
91 | staged_file = os.path.basename(csv) + ".gz"
92 |
93 | # execute write operations
94 | cursor = cnxn.cursor()
95 | cursor.execute("use database STAGING_DEV;")
96 | cursor.execute("use schema MY_SCHEMA;")
97 | cursor.execute("create or replace stage FX_RATES;")
98 | cursor.execute(f"put file://{csv} @FX_RATES;")
99 | cursor.execute(
100 | f"""copy into CURRENCY_EXCHANGE_RATES(FROM_CURRENCY,
101 | TO_CURRENCY,
102 | EFFECTIVE_START,
103 | EFFECTIVE_STOP,
104 | RATE,
105 | STAGE_DATE)
106 | from @FX_RATES/{staged_file}
107 | file_format = (type = CSV)"""
108 | )
109 | cursor.execute('rm @MY_SCHEMA.FX_RATES pattern = ".*FX_RATES.*";')
110 |
111 | cursor.close()
112 | cnxn.close()
113 |
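114 | # For reference, the target table is assumed to look roughly like this
115 | # (the column names come from the COPY INTO above; the types are assumptions):
116 | # create table if not exists CURRENCY_EXCHANGE_RATES (
117 | #     FROM_CURRENCY varchar, TO_CURRENCY varchar,
118 | #     EFFECTIVE_START timestamp_ntz, EFFECTIVE_STOP timestamp_ntz,
119 | #     RATE float, STAGE_DATE timestamp_ntz);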
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Engineering Portfolio
2 |
3 |
6 |
7 | [](https://github.com/MartyC-137/Data-Engineering/actions/workflows/ruff.yml)
8 | [](https://github.com/MartyC-137/Data-Engineering/actions/workflows/sqlfluff.yml)
9 |
10 | ---
11 |
12 | ### Introduction
13 |
14 | This repository contains numerous examples of code from my day-to-day work as a data engineer, all of which have been modified into minimum reproducible examples. My favourite tools are Snowflake, Python, and dbt, and I also have an interest in DevOps as it pertains to data engineering.
15 |
16 |
21 |
22 | [](https://www.linkedin.com/in/mpalkovic/)
23 | [](https://my.visualcv.com/martin-palkovic/)
24 |
25 | ### Table of Contents
26 | * [Python Examples](https://github.com/MartyC-137/Data-Engineering/tree/main/Python)
27 | - [Snowpark example - backload data from SQL Server](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Snowpark_Example_Backload_SQL_Server_Data.py)
28 | - [Snowpark example - backload data from API](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/Snowpark_Backload_API_Data.py)
29 | - [Automated SQL insert statements from a CSV file](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/sql_insert_statement_from_csv.py)
30 | - [Extract data from SQL Server, transform, and load to Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/read_sql_server_write_snowflake.py)
31 | - [Batch load JSON files to Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/load_json_to_snowflake.py)
32 | - [SQL Server data pull - 100 records from every view in a database](https://github.com/MartyC-137/Data-Engineering/blob/main/Python/pull_records_for_all_sql_tables.py)
33 | * [SQL Examples](https://github.com/MartyC-137/Data-Engineering/tree/main/SQL)
34 | - [Only grant permissions on tables with > 0 rows of data - Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_ForLoop_GrantPermissions.sql)
35 | - [Auto Ingest Snowpipe from Azure Blob to Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Azure_Blob_Auto_Ingest_Snowpipe.sql)
36 | - [Shorten large union queries using Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Shorten_Huge_Union_Queries.sql)
37 | - [Basic Snowflake CDC Pipeline using Streams and Tasks](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Basic_CDC_Pipeline_Using_Streams_Tasks.sql)
38 | - [Find missing dates in a date field - Snowflake](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Find_Missing_Dates.sql)
39 | - [Snowflake data pipeline from internal stage](https://github.com/MartyC-137/Data-Engineering/blob/main/SQL/Snowflake_Data_Pipeline_From_Internal_Stage.sql)
40 | * [Snowflake CI/CD using Azure Pipelines - SQLFluff testing, build and deploy using SnowSQL](https://github.com/MartyC-137/Data-Engineering/tree/main/SnowSQL_CICD)
41 | * [SQLFluff and yamllint pipelines for a dbt project](https://github.com/MartyC-137/Data-Engineering/tree/main/CI_Examples)
42 |
43 | ---
44 |
45 | ### Usage
46 |
47 | ```bash
48 | # Clone the repository
49 | $ git clone https://github.com/MartyC-137/Data-Engineering.git
50 |
51 | # Connect to the repository
52 | $ cd Data-Engineering
53 | ```
54 |
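55 | To run the Python examples locally, something like the following should work (assumes Python 3 and uses the `requirements.txt` at the repository root):
56 |
57 | ```bash
58 | # Optional: create a virtual environment and install the Python dependencies
59 | $ python3 -m venv .venv && source .venv/bin/activate
60 | $ pip install -r requirements.txt
61 | ```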
--------------------------------------------------------------------------------
/Python/Snowpark_Create_Stored_Procedure.py:
--------------------------------------------------------------------------------
1 | # This only runs in a Python 3.8 environment
2 |
3 | # import modules
4 | import os
5 | import snowflake
6 | import pandas as pd
7 |
8 | from snowflake.snowpark import Session
9 | from snowflake.snowpark.types import StringType
10 |
11 | from dotenv import load_dotenv
12 |
13 | load_dotenv()
14 |
15 | # Establish Snowflake Connection
16 | account = os.getenv("SNOWFLAKE_ACCT")
17 | user = os.getenv("SNOWFLAKE_USER")
18 | password = os.getenv("SNOWFLAKE_PASSWORD")
19 | role = os.getenv("SNOWFLAKE_ROLE")
20 | warehouse = "REPORTING_WH"
21 | database = "STAGING_DEV"
22 | schema = "MISC"
23 |
24 |
25 | def snowpark_cnxn(account, user, password, role, warehouse, database, schema):
26 | connection_parameters = {
27 | "account": account,
28 | "user": user,
29 | "password": password,
30 | "role": role,
31 | "warehouse": warehouse,
32 | "database": database,
33 | "schema": schema,
34 | }
35 | session = Session.builder.configs(connection_parameters).create()
36 | return session
37 |
38 |
39 | print("Connecting to Snowpark...\n")
40 | session = snowpark_cnxn(account, user, password, role, warehouse, database, schema)
41 |
42 | print(
43 | session.sql(
44 | "select current_warehouse(), current_database(), current_schema()"
45 | ).collect(),
46 | "\n",
47 | )
48 | print("Connected!\n")
49 |
50 | session.sql(
51 | """create or replace table
52 | mytable(amount number comment 'fake amounts for testing',
53 | fruits string comment 'fake types of fruit for testing')"""
54 | ).show()
55 |
56 | session.sql("""create or replace table mytable2 like mytable""").show()
57 |
58 | session.sql(
59 | """insert into mytable values (1, 'apple'),
60 | (2, 'orange'),
61 | (5, 'grape'),
62 | (7, 'cantelope'),
63 | (9, 'pineapple'),
64 | (17, 'banana'),
65 | (21, 'tangerine')"""
66 | ).show()
67 |
68 | session.sql(
69 | """insert into mytable2 values (1, 'apple'),
70 | (3, 'orange'),
71 | (5, 'grape'),
72 | (7, 'strawberry'),
73 | (10, 'pineapple'),
74 | (17, 'banana'),
75 | (22, 'raspberry')"""
76 | ).show()
77 |
78 |
79 | def print_differences(
80 | session: snowflake.snowpark.Session,
81 | table1: str,
82 | table2: str,
83 | field1: str,
84 | field2: str,
85 | ):
86 | # read the tables into a snowpark dataframe
87 | table1 = session.table(table1)
88 | table2 = session.table(table2)
89 |
90 | # convert to pandas
91 | df1 = table1.to_pandas()
92 | df2 = table2.to_pandas()
93 |
94 | # convert the fields of interest from each table to a list
95 | list1 = df1[field1].to_list()
96 | list2 = df2[field2].to_list()
97 |
98 | return ", ".join(item for item in list1 if item not in list2)
99 |
100 |
101 | session.add_packages("snowflake-snowpark-python")
102 |
103 | print("Registering Stored Procedure with Snowflake...\n")
104 |
105 | session.sproc.register(
106 | func=print_differences,
107 | return_type=StringType(),
108 | input_types=[StringType(), StringType(), StringType(), StringType()],
109 | is_permanent=True,
110 | name="PRINT_DIFFERENCES",
111 | replace=True,
112 | stage_location="@UDF_STAGE",
113 | )
114 |
115 | print("Stored Procedure registered with Snowflake!\n")
116 |
117 | # You can return the results on one line using the sql() method:
118 | """session.sql('''call print_differences('MYTABLE',
119 | 'MYTABLE2',
120 | 'FRUITS',
121 | 'FRUITS')''').show()"""
122 |
123 | # Call stored procedure, print results as dataframe
124 | x = session.call("print_differences", "MYTABLE", "MYTABLE2", "FRUITS", "FRUITS")
125 | print(x, "\n")
126 |
127 | df = pd.DataFrame({"Differences": x.split(",")})
128 | print(df)
129 |
--------------------------------------------------------------------------------
/dbt/filter_dbt_catalog_query_snowflake.sql:
--------------------------------------------------------------------------------
1 | {% macro snowflake__get_catalog(information_schema, schemas) -%}
2 |
3 | {%- set relations_in_project = [] -%}
4 |
5 | {%- for node in graph.nodes.values() -%}
6 | {%- if node.resource_type == 'model' -%}
7 | {%- do relations_in_project.append(node.alias) -%}
8 | {%- endif -%}
9 | {%- endfor -%}
10 | {%- for source in graph.sources.values() -%}
11 | {%- do relations_in_project.append(source.name) -%}
12 | {%- endfor -%}
13 |
14 | {%- set relations_in_project = set(relations_in_project) | list -%}
15 |
16 | {%- if (schemas | length) == 0 -%}
17 | {%- set query = "select 1 as id limit 0" -%}
18 | {%- else -%}
19 |
20 | {% set query %}
21 |
22 | with tables as (
23 |
24 | select
25 |
26 | table_catalog as "table_database",
27 | table_schema as "table_schema",
28 | table_name as "table_name",
29 | table_type as "table_type",
30 | comment as "table_comment",
31 | table_owner as "table_owner",
32 | 'Clustering Key' as "stats:clustering_key:label",
33 | clustering_key as "stats:clustering_key:value",
34 | 'The key used to cluster this table' as "stats:clustering_key:description",
35 | (clustering_key is not null) as "stats:clustering_key:include",
36 | 'Row Count' as "stats:row_count:label",
37 | row_count as "stats:row_count:value",
38 | 'An approximate count of rows in this table' as "stats:row_count:description",
39 | (row_count is not null) as "stats:row_count:include",
40 | 'Approximate Size' as "stats:bytes:label",
41 | bytes as "stats:bytes:value",
42 | 'Approximate size of the table as reported by Snowflake' as "stats:bytes:description",
43 | (bytes is not null) as "stats:bytes:include",
44 | 'Last Modified' as "stats:last_modified:label",
45 | to_varchar(convert_timezone('UTC', last_altered), 'yyyy-mm-dd HH24:MI'||'UTC') as "stats:last_modified:value",
46 | 'The timestamp for last update/change' as "stats:last_modified:description",
47 | (last_altered is not null and table_type='BASE TABLE') as "stats:last_modified:include"
48 |
49 | from {{ information_schema }}.tables
50 |
51 | where row_count > 0
52 |
53 | and (
54 | {%- for schema in schemas -%}
55 | upper("table_schema") = upper('{{ schema }}') {%- if not loop.last %} or {% endif -%}
56 | {%- endfor -%}
57 | )
58 |
59 | {%- if relations_in_project | length > 0 %}
60 |
61 | and coalesce(regexp_substr(table_name, '^(.+)_{1}[0-9]{8}$'), table_name) in (
62 | {%- for rel in relations_in_project -%} upper('{{ rel }}') {%- if not loop.last %}, {% endif -%}{%- endfor -%}
63 | )
64 | {% endif -%}
65 |
66 | ),
67 |
68 | columns as (
69 |
70 | select
71 |
72 | table_catalog as "table_database",
73 | table_schema as "table_schema",
74 | table_name as "table_name",
75 | column_name as "column_name",
76 | ordinal_position as "column_index",
77 | data_type as "column_type",
78 | comment as "column_comment"
79 |
80 | from {{ information_schema }}.columns
81 |
82 | where (
83 | {%- for schema in schemas -%}
84 | upper("table_schema") = upper('{{ schema }}') {%- if not loop.last %} or {% endif -%}
85 | {%- endfor -%}
86 | )
87 |
88 | {%- if relations_in_project | length > 0 %}
89 |
90 | and coalesce(regexp_substr(table_name, '^(.+)_{1}[0-9]{8}$'), table_name) in (
91 | {%- for rel in relations_in_project -%} upper('{{ rel }}') {%- if not loop.last %}, {% endif -%}{%- endfor -%}
92 | )
93 | {% endif -%}
94 |
95 | )
96 |
97 | select * from tables
98 |
99 | inner join columns using ("table_database", "table_schema", "table_name")
100 |
101 | order by "c
102 | {%- endset -%}
103 |
104 | {%- endif -%}
105 |
106 | {%- do log(query) -%}
107 | {%- set results = run_query(query) -%}
108 | {%- do log(schemas ~ ' - rows returned: ' ~ results | length, True) -%}
109 |
110 | {{ return(results) }}
111 |
112 | {%- endmacro %}
113 |
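114 | {# Usage note (an assumption about a typical dbt setup): place this file in the
115 |    project's macros/ directory so dbt's dispatch picks it up in place of the built-in
116 |    snowflake__get_catalog; the catalog built by `dbt docs generate` then only includes
117 |    relations that exist in the project and contain at least one row. #}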
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/go
3 | {
4 | "name": "oh-my-posh",
5 | "build": {
6 | "dockerfile": "Dockerfile",
7 | "args": {
8 | // Update the VARIANT arg to pick a version of Go: 1, 1.16, 1.17
9 | // Append -bullseye or -buster to pin to an OS version.
10 | // Use -bullseye variants on local arm64/Apple Silicon.
11 | "VARIANT": "1.19-bullseye",
12 | // Options:
13 |
14 | "POSH_THEME": "https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/clean-detailed.omp.json",
15 |
16 | // Override me with your own timezone:
17 | "TZ": "America/Moncton",
18 | // Use one of the "TZ database name" entries from:
19 | // https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
20 |
21 | "NODE_VERSION": "lts/*",
22 | //Powershell version
23 | "PS_VERSION": "7.2.1"
24 | }
25 | },
26 | "runArgs": ["--cap-add=SYS_PTRACE",
27 | "--security-opt",
28 | "seccomp=unconfined"
29 | ],
30 |
31 | "features": {
32 | "ghcr.io/devcontainers/features/azure-cli:1": {
33 | "version": "latest"
34 | },
35 | "ghcr.io/devcontainers/features/python:1": {
36 | "version": "3.8"
37 | },
38 | "ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {},
39 | "ghcr.io/devcontainers-contrib/features/terraform-asdf:2": {},
40 | "ghcr.io/devcontainers-contrib/features/yamllint:2": {},
41 | "ghcr.io/devcontainers/features/docker-in-docker:2": {},
42 | "ghcr.io/devcontainers/features/docker-outside-of-docker:1": {},
43 | "ghcr.io/devcontainers/features/github-cli:1": {},
44 | "ghcr.io/devcontainers-contrib/features/spark-sdkman:2": {
45 | "jdkVersion": "11"
46 | },
47 | "ghcr.io/dhoeric/features/google-cloud-cli:1": {
48 | "version": "latest"
49 | }
50 | },
51 |
52 | // Set *default* container specific settings.json values on container create.
53 | "customizations": {
54 | "vscode": {
55 | "settings": {
56 | "go.toolsManagement.checkForUpdates": "local",
57 | "go.useLanguageServer": true,
58 | "go.gopath": "/go",
59 | "go.goroot": "/usr/local/go",
60 | "terminal.integrated.profiles.linux": {
61 | "bash": {
62 | "path": "bash"
63 | },
64 | "zsh": {
65 | "path": "zsh"
66 | },
67 | "fish": {
68 | "path": "fish"
69 | },
70 | "tmux": {
71 | "path": "tmux",
72 | "icon": "terminal-tmux"
73 | },
74 | "pwsh": {
75 | "path": "pwsh",
76 | "icon": "terminal-powershell"
77 | }
78 | },
79 | "terminal.integrated.defaultProfile.linux": "pwsh",
80 | "terminal.integrated.defaultProfile.windows": "pwsh",
81 | "terminal.integrated.defaultProfile.osx": "pwsh",
82 | "tasks.statusbar.default.hide": true,
83 | "terminal.integrated.tabs.defaultIcon": "terminal-powershell",
84 | "terminal.integrated.tabs.defaultColor": "terminal.ansiBlue",
85 | "workbench.colorTheme": "GitHub Dark Dimmed",
86 | "workbench.iconTheme": "material-icon-theme"
87 | },
88 |
89 | // Add the IDs of extensions you want installed when the container is created.
90 | "extensions": [
91 | "ms-mssql.mssql",
92 | "snowflake.snowflake-vsc",
93 | "golang.go",
94 | "ms-vscode.powershell",
95 | "ms-python.python",
96 | "ms-python.vscode-pylance",
97 | "redhat.vscode-yaml",
98 | "ms-vscode-remote.remote-containers",
99 | "ms-toolsai.jupyter",
100 | "eamodio.gitlens",
101 | "yzhang.markdown-all-in-one",
102 | "davidanson.vscode-markdownlint",
103 | "editorconfig.editorconfig",
104 | "esbenp.prettier-vscode",
105 | "github.vscode-pull-request-github",
106 | "akamud.vscode-theme-onedark",
107 | "PKief.material-icon-theme",
108 | "GitHub.github-vscode-theme",
109 | "actboy168.tasks",
110 | "bastienboutonnet.vscode-dbt",
111 | "innoverio.vscode-dbt-power-user",
112 | "redhat.vscode-xml",
113 | "adpyke.vscode-sql-formatter",
114 | "inferrinizzard.prettier-sql-vscode",
115 | "github.vscode-github-actions",
116 | "ms-python.black-formatter"
117 | ]
118 | }
119 | },
120 |
121 | // Use 'forwardPorts' to make a list of ports inside the container available locally.
122 | // "forwardPorts": [3000],
123 |
124 | // Use 'postCreateCommand' to run commands after the container is created.
125 | "postCreateCommand": "pip3 install --user -r .devcontainer/requirements.txt --use-pep517",
126 |
127 | // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
128 | "remoteUser": "vscode"
129 | }
130 |
--------------------------------------------------------------------------------
/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/go/.devcontainer/base.Dockerfile
2 |
3 | # [Choice] Go version (use -bullseye variants on local arm64/Apple Silicon): 1, 1.16, 1.17, 1-bullseye, 1.16-bullseye, 1.17-bullseye, 1-buster, 1.16-buster, 1.17-buster
4 | ARG VARIANT=1-bullseye
5 | FROM mcr.microsoft.com/vscode/devcontainers/go:0-${VARIANT}
6 |
7 | # [Choice] Node.js version: none, lts/*, 16, 14, 12, 10
8 | ARG NODE_VERSION="none"
9 | RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi
10 |
11 | # Install PowerShell
12 | ARG PS_VERSION="7.2.1"
13 | # Release assets are named like powershell-<version>-linux-x64.tar.gz
14 | # and powershell-<version>-linux-arm64.tar.gz
15 | RUN ARCH="$(dpkg --print-architecture)"; \
16 | if [ "${ARCH}" = "amd64" ]; then \
17 | PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-x64.tar.gz"; \
18 | elif [ "${ARCH}" = "arm64" ]; then \
19 | PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-arm64.tar.gz"; \
20 | elif [ "${ARCH}" = "armhf" ]; then \
21 | PS_BIN="v$PS_VERSION/powershell-$PS_VERSION-linux-arm32.tar.gz"; \
22 | fi; \
23 | wget https://github.com/PowerShell/PowerShell/releases/download/$PS_BIN -O pwsh.tar.gz; \
24 | mkdir /usr/local/pwsh && \
25 | tar Cxvfz /usr/local/pwsh pwsh.tar.gz && \
26 | rm pwsh.tar.gz
27 |
28 | ENV PATH=$PATH:/usr/local/pwsh
29 |
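# [Optional] A minimal sanity check (not part of the original build) to confirm the
# extracted PowerShell binary is on the PATH -- uncomment during local testing if useful:
# RUN pwsh -NoLogo -Command '$PSVersionTable.PSVersion'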
30 | RUN echo 'deb http://download.opensuse.org/repositories/shells:/fish:/release:/3/Debian_11/ /' | tee /etc/apt/sources.list.d/shells:fish:release:3.list; \
31 | curl -fsSL https://download.opensuse.org/repositories/shells:fish:release:3/Debian_11/Release.key | gpg --dearmor | tee /etc/apt/trusted.gpg.d/shells_fish_release_3.gpg > /dev/null; \
32 | apt-get update && export DEBIAN_FRONTEND=noninteractive \
33 | && apt-get install -y --no-install-recommends \
34 | fish \
35 | tmux \
36 | fzf \
37 | && apt-get clean
38 |
39 | ARG USERNAME=vscode
40 |
41 | # Download the oh-my-posh binary
42 | RUN mkdir /home/${USERNAME}/bin; \
43 | wget https://github.com/JanDeDobbeleer/oh-my-posh/releases/latest/download/posh-linux-$(dpkg --print-architecture) -O /home/${USERNAME}/bin/oh-my-posh; \
44 | chmod +x /home/${USERNAME}/bin/oh-my-posh; \
45 | chown ${USERNAME}: /home/${USERNAME}/bin;
46 |
47 | # NOTE: devcontainers are Linux-only at this time, but when
48 | # Windows or Darwin is supported, someone will need to improve
49 | # the architecture logic above.
50 |
51 | # Set up a neat little PowerShell experience
52 | RUN pwsh -Command Install-Module posh-git -Scope AllUsers -Force; \
53 | pwsh -Command Install-Module z -Scope AllUsers -Force; \
54 | pwsh -Command Install-Module PSFzf -Scope AllUsers -Force; \
55 | pwsh -Command Install-Module Terminal-Icons -Scope AllUsers -Force;
56 |
57 | # add the oh-my-posh path to the PATH variable
58 | ENV PATH "$PATH:/home/${USERNAME}/bin"
59 |
60 | # Add vscode default dir to the PATH variable
61 | ENV PATH "$PATH:/home/vscode/.local/bin"
62 |
63 | # Can be used to override the devcontainer prompt default theme:
64 | ENV POSH_THEME="https://raw.githubusercontent.com/JanDeDobbeleer/oh-my-posh/main/themes/night-owl.omp.json"
65 |
66 | # Deploy oh-my-posh prompt to Powershell:
67 | COPY Microsoft.PowerShell_profile.ps1 /home/${USERNAME}/.config/powershell/Microsoft.PowerShell_profile.ps1
68 |
69 | # Deploy oh-my-posh prompt to Fish:
70 | COPY config.fish /home/${USERNAME}/.config/fish/config.fish
71 |
72 | # Everything runs as root during build time, so we want
73 | # to make sure the vscode user can edit these paths too:
74 | RUN chmod 777 -R /home/${USERNAME}/.config
75 |
76 | # Override vscode's own Bash prompt with oh-my-posh:
77 | RUN sed -i 's/^__bash_prompt$/#&/' /home/${USERNAME}/.bashrc && \
78 | echo "eval \"\$(oh-my-posh init bash --config $POSH_THEME)\"" >> /home/${USERNAME}/.bashrc
79 |
80 | # Override vscode's own ZSH prompt with oh-my-posh:
81 | RUN echo "eval \"\$(oh-my-posh init zsh --config $POSH_THEME)\"" >> /home/${USERNAME}/.zshrc
82 |
83 | # Set container timezone:
84 | ARG TZ="UTC"
85 | RUN ln -sf /usr/share/zoneinfo/${TZ} /etc/localtime
86 |
87 | # Required for Python - Confluent Kafka on M1 Silicon
88 | RUN apt update && apt -y install software-properties-common gcc
89 | RUN git clone https://github.com/edenhill/librdkafka
90 | RUN cd librdkafka && ./configure && make && make install && ldconfig
91 |
92 | # [Optional] Uncomment the next line to use go get to install anything else you need
93 | # RUN go get -x github.com/JanDeDobbeleer/battery
94 |
95 | # [Optional] Uncomment this line to install global node packages.
96 | # RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g " 2>&1
--------------------------------------------------------------------------------
/Fivetran/disable_tables_with_zero_rows_fivetran_api.py:
--------------------------------------------------------------------------------
1 | """ Import Modules """
2 | import os
3 | import json
4 | import requests
5 | import pandas as pd
6 |
7 | from sqlalchemy.engine import URL
8 | from sqlalchemy import create_engine
9 | from dotenv import load_dotenv
10 |
11 | load_dotenv()
12 |
13 | # Retrieve Fivetran secrets
14 | fivetran_key = os.getenv("FIVETRAN_KEY")
15 | fivetran_secret = os.getenv("FIVETRAN_SECRET")
16 |
17 | # --------------------------------------------
18 | """ Retrieve list of Fivetran connector IDs"""
19 |
20 | # Define API variables
21 | group_id = "my_fivetran_group_id"
22 | url = "https://api.fivetran.com/v1/groups/" + group_id + "/connectors"
23 | headers = {"Accept": "application/json"}
24 |
25 | # API GET request
26 | response = requests.get(url, headers=headers, auth=(fivetran_key, fivetran_secret))
27 | data = response.json()
28 |
29 | # Save Fivetran connector list to file
30 | with open("fivetran_connector_list.json", "w") as file:
31 | json.dump(data, file, indent=4)
32 |
33 | # Create a dictionary containing the database name (key) and connector ID (value)
34 | connector_id_dict = {
35 | item["schema"].upper()
36 | if item["schema"] != "db_name_you_want_capitalized"
37 | else item["schema"].capitalize(): item["id"]
38 | for item in data["data"]["items"]
39 | }
40 |
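# For illustration only -- with hypothetical schema names, the comprehension above
# would produce something like:
# {"SALES_DB": "connector_id_1", "Finance_db": "connector_id_2"}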
41 | print(
42 |     f"""Dictionary of connector IDs for Fivetran databases:
43 | {connector_id_dict} \n"""
44 | )
45 |
46 | # ------------------------------------------------------------------
47 | """ Establish SQL Server Connection"""
48 |
49 | # Define variables
50 | driver = "SQL Server"
51 | server = "my_server"
52 |
53 | # Define connection function
54 | def sqlalchemy_cnxn(driver, server, db):
55 | """ Function for connecting to SQL Server via SQLAlchemy """
56 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}"
57 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection})
58 | engine = create_engine(url)
59 | return engine
60 |
61 | # ------------------------------------------------------------
62 | """ Loop over list of databases/connector IDs to retrive tables
63 | with 0 rows from SQL server, and call a PATCH request with the Fivetran API
64 | to disable tables with 0 rows for that connector"""
65 |
66 | for database in connector_id_dict.keys():
67 | engine = sqlalchemy_cnxn(driver, server, database)
68 |
69 |     print(f"Successfully connected to {server}.{database}!\n")
70 | print() # new line
71 |
72 | # Query the sys schema for the database to get tables with 0 rows of data
73 | query = f"""
74 | SELECT
75 | t.NAME AS TableName,
76 | p.rows AS RowCounts
77 | FROM {database}.sys.tables AS t
78 |
79 | INNER JOIN {database}.sys.partitions AS p
80 | ON t.object_id = p.OBJECT_ID
81 |
82 | WHERE
83 | t.NAME NOT LIKE 'dt%'
84 | AND t.is_ms_shipped = 0
85 | AND p.rows = 0
86 |
87 | GROUP BY
88 | t.Name, p.Rows
89 |
90 | ORDER BY
91 | t.Name
92 | """
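    # Note: sys.partitions returns one row per index/partition, so the GROUP BY
    # above collapses those duplicates into a single row per empty table.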
93 |
94 | # load results of query to Pandas dataframe
95 | df = pd.read_sql(query, engine)
96 |
97 | print(f"tables with 0 rows of data in {database} database: {len(df)}\n")
98 |
99 | tables_to_unsync = df["TableName"].tolist()
100 |
101 | # Create a JSON payload of tables to disable
102 | tables_payload = {table_name: {"enabled": False} for table_name in tables_to_unsync}
103 | payload = {"enabled": True, "tables": tables_payload}
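    # For illustration only -- with hypothetical table names, the payload dict
    # sent to Fivetran looks like:
    # {"enabled": True, "tables": {"empty_table_a": {"enabled": False},
    #                              "empty_table_b": {"enabled": False}}}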
104 |
105 | # For testing, if needed
106 | # with open(f"{database}_payload.json", "w") as file:
107 | # json.dump(payload, file, indent = 4)
108 |
109 |     # ------------------------------------------------------------
110 |     """ Fivetran API Call to disable tables """
111 |
112 | connector_id = connector_id_dict[database]
113 | print(f"Connector ID for {database}: {connector_id}\n")
114 |
115 | schema_name = "dbo"
116 | url = (
117 | "https://api.fivetran.com/v1/connectors/"
118 | + connector_id
119 | + "/schemas/"
120 | + schema_name
121 | )
122 |
123 | headers = {"Content-Type": "application/json", "Accept": "application/json"}
124 |
125 | """Fivetran API call - comment this block if you are testing the script"""
126 |     response = requests.patch(url,
127 |                               json=payload,
128 |                               headers=headers,
129 |                               auth=(fivetran_key, fivetran_secret))
130 |
131 | data = response.json()
132 | print(f"Successfully called the Fivetran API for the {connector_id} connector!\n")
133 |
134 | # For testing, if needed
135 | # with open('fivetran_api_response.json', 'w') as file:
136 | # file.write(str(data))
137 | # print(f"Successfully saved logs to file!\n")
138 |
139 | # break #LEAVE THIS IN IF YOU ARE TESTING
140 |
--------------------------------------------------------------------------------
/Python/Snowpark_Example_Backload_SQL_Server_Data.py:
--------------------------------------------------------------------------------
1 | # **********************************************************************#
2 | # Title: Basic Snowpark Example for backloading data to Snowflake
3 | # By: Martin Palkovic
4 | # Date: 2022-11-18
5 | # Description: Recently I needed to backload some exchange rate data into Snowflake from
6 | # SQL Server, and was excited because I got to test out Snowpark! It is a really nice
7 | # way to interact with Snowflake using Python.
8 | # *********************************************************************#
9 |
10 | # Import modules
11 | import os
12 | from sqlalchemy.engine import URL
13 | from sqlalchemy import create_engine
14 |
15 | import pandas as pd
16 |
17 | from snowflake.snowpark import Session
18 |
19 | from dotenv import load_dotenv
20 |
21 | load_dotenv()
22 |
23 | # Establish SQL Server Connection
24 | driver = "SQL Server"
25 | server = "my_server"
26 | database = "my_db"
27 | schema = "dbo"
28 | table = "Daily_Exchange_Rates"
29 |
30 |
31 | # Define connection function
32 | def sqlalchemy_cnxn(driver, server, db):
33 | connection = f"DRIVER={driver};SERVER={server};DATABASE={db}"
34 | url = URL.create("mssql+pyodbc", query={"odbc_connect": connection})
35 | engine = create_engine(url)
36 | return engine
37 |
38 |
39 | engine = sqlalchemy_cnxn(driver, server, database)
40 |
41 | # If you're not performing any data transformation at the
42 | # SQL Server level, this is a great way to parameterize column names
43 | columns = f"""SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS
44 | WHERE TABLE_NAME LIKE N'{table}'"""
45 |
46 | df_cols = pd.read_sql(columns, engine)
47 | columns = ", ".join(df_cols["COLUMN_NAME"].to_list())
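# For illustration only (hypothetical column list): at this point `columns` is a
# comma-separated string such as "EXGTBLID_TRANSFORMED, EXCHDATE, XCHGRATE",
# ready to be dropped into the SELECT below.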
48 |
49 | query = f"""SELECT {columns} FROM {database}.{schema}.{table}"""
50 |
51 | # load query to dataframe
52 | df_fx = pd.read_sql(query, engine)
53 | print("Total records from SQL Server:", len(df_fx))
54 |
55 | # --------------------------------------------
56 |
57 | # Establish Snowpark Connection
58 | account = os.getenv("SNOWFLAKE_ACCT")
59 | user = os.getenv("SNOWFLAKE_USER")
60 | password = os.getenv("SNOWFLAKE_PASSWORD")
61 | role = os.getenv("SNOWFLAKE_ROLE")
62 | warehouse = "REPORTING_WH"
63 | database = "DEV"
64 | schema = "MY_SCHEMA"
65 | target_table = "CURRENCY_EXCHANGE_RATES"
66 | temp_table = "FX_RATE_TEMP"
67 |
68 |
69 | def snowpark_cnxn(account, user, password, role, warehouse, database, schema):
70 | connection_parameters = {
71 | "account": account,
72 | "user": user,
73 | "password": password,
74 | "role": role,
75 | "warehouse": warehouse,
76 | "database": database,
77 | "schema": schema,
78 | }
79 | session = Session.builder.configs(connection_parameters).create()
80 | return session
81 |
82 |
83 | session = snowpark_cnxn(account, user, password, role, warehouse, database, schema)
84 |
85 | print(
86 | session.sql(
87 | "select current_warehouse(), current_database(), current_schema()"
88 | ).collect()
89 | )
90 |
91 | # ---------------------------------------------------------------------
92 |
93 | # Transform the data (if needed) to match the format required for Snowflake.
94 | # In my case, the data in the source system did not match the format I needed
95 | # for Snowflake.
96 |
97 | df_sf = pd.DataFrame()
98 |
99 | df_sf[["FROM_CURRENCY", "TO_CURRENCY"]] = df_fx["EXGTBLID_TRANSFORMED"].str.split(
100 |     "-", n=1, expand=True
101 | )
102 | df_sf = df_sf[
103 |     ~df_sf["TO_CURRENCY"].str.contains("|".join(["AVG", "BUY", "SELL", "ALL"]))
104 | ] # drops rows that contain junk data
105 |
106 | df_sf["EFFECTIVE_START"] = df_fx["EXCHDATE"].dt.strftime("%Y-%m-%d %H:%m:%s.%S")
107 | df_sf["EFFECTIVE_STOP"] = (
108 | df_fx["EXCHDATE"] + pd.DateOffset(days=7, hours=23, minutes=59)
109 | ).dt.strftime("%Y-%m-%d %H:%M:%S.%f")
110 |
111 | df_sf["RATE"] = df_fx["XCHGRATE"]
112 |
113 | # Get current datetime
114 | df_sf["STAGE_DATE"] = pd.Timestamp.now()
115 | df_sf["STAGE_DATE"] = df_sf["STAGE_DATE"].dt.strftime("%Y-%m-%d %H:%m:%s.%S")
116 |
117 | # strip all whitespace from every field
118 | df_sf = df_sf.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
119 | print("Total records after transformations:", len(df_sf))
120 |
121 | columns = ", ".join(df_sf.columns)
122 |
123 | # Create Snowpark DataFrame
124 | df = session.create_dataframe(df_sf)
125 |
126 | df.write.mode("overwrite").save_as_table(
127 | f"{temp_table}", column_order="name", table_type="temporary"
128 | )
129 |
130 | session.sql(f"SELECT COUNT(*) FROM {temp_table}").collect()
131 |
132 | # OPTION 1: Overwrite + insert new data
133 | session.sql(
134 | f"""INSERT OVERWRITE INTO {target_table} ({columns})
135 | SELECT {columns} FROM {temp_table}"""
136 | ).collect()
137 |
138 | # -------------------------------------------------------------
139 |
140 | # OPTION 2: Incremental load
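# (MY_KEY and DATE below are placeholders -- swap in your table's actual key and
# date columns before running the MERGE.)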
141 | session.sql(
142 | f"""MERGE INTO {target_table} Dest
143 | USING (
144 | SELECT {columns} FROM {temp_table}
145 | QUALIFY ROW_NUMBER() OVER (
146 | PARTITION BY MY_KEY
147 | ORDER BY DATE ASC) = 1
148 | ) Source
149 | ON Dest.MY_KEY = Source.MY_KEY
150 | AND Dest.FROM_CURRENCY = Source.FROM_CURRENCY
151 | AND Dest.TO_CURRENCY = Source.TO_CURRENCY
152 | WHEN MATCHED THEN UPDATE
153 | SET Dest.FROM_CURRENCY = Source.FROM_CURRENCY
154 | , Dest.TO_CURRENCY = Source.TO_CURRENCY
155 | , Dest.DATE = Source.DATE
156 | , Dest.RATE = Source.RATE
157 | , Dest.STAGE_DATE = Source.STAGE_DATE
158 |
159 | WHEN NOT MATCHED THEN INSERT(
160 | FROM_CURRENCY
161 | , TO_CURRENCY
162 | , DATE
163 | , RATE
164 | , STAGE_DATE
165 | )
166 | VALUES(
167 | Source.FROM_CURRENCY
168 | , Source.TO_CURRENCY
169 | , Source.EFFECTIVE_START
170 | , Source.RATE
171 | , Source.STAGE_DATE
172 | )
173 | """
174 | ).collect()
175 |
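# Not part of the original workflow, but if you are finished with the session it
# can be closed explicitly:
# session.close()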
--------------------------------------------------------------------------------