├── .devcontainer ├── devcontainer.json └── profiles.yml ├── .github └── workflows │ ├── build-image.yml │ ├── docs.yml │ ├── pre-commit.yml │ ├── submit-job.yml │ └── terraform-validation.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .prettierrc.yml ├── .sqlfluff ├── .terraform-docs.yml ├── .tflint.hcl ├── .yamllint ├── LICENSE ├── README.md ├── airflow ├── .gitignore ├── aws_cli.py ├── dags │ ├── common │ │ ├── __init__.py │ │ ├── defaults.py │ │ ├── requests.py │ │ └── slack.py │ ├── geo_reference │ │ └── load_building_footprints.py │ └── state_entities │ │ ├── base_entities.py │ │ └── budgets.py ├── deploy.sh ├── plugins │ └── .gitkeep └── requirements │ └── requirements.txt ├── ci └── profiles.yml ├── docs ├── code │ ├── azdevops-project-management.md │ ├── code-review.md │ ├── codespaces.md │ ├── github-project-management.md │ ├── local-setup.md │ ├── terraform-local-setup.md │ └── writing-documentation.md ├── data │ └── footprints.md ├── dbt │ ├── dbt-performance.md │ └── dbt.md ├── images │ ├── codespace-secrets.png │ ├── column-pruning.png │ ├── columnar-storage.png │ ├── consumers.png │ ├── create-new-codespace.png │ ├── databases.png │ ├── dbt_model_timing.png │ ├── dbt_run_summary.png │ ├── developer.png │ ├── devops │ │ ├── workitemdetails.png │ │ ├── workitemexamples.png │ │ └── workitemtypes.png │ ├── environment_variables.png │ ├── github │ │ ├── commit-changes.png │ │ ├── comparing-changes.png │ │ ├── conflict-markers.png │ │ ├── conflict-sections.png │ │ ├── create-a-pull-request-github.png │ │ ├── issues-pr-actions.png │ │ ├── merging-in-vs-code.png │ │ ├── merging-in-vs-code2.png │ │ ├── open-a-pr.png │ │ ├── page4.png │ │ ├── pm-eg-metaissue.png │ │ ├── pm-subtasks.png │ │ ├── pr-description.png │ │ ├── request-for-review.png │ │ ├── suggest-a-change.png │ │ └── version-control.png │ ├── initial-query.png │ ├── launch-codespace.png │ ├── nightly.png │ ├── odi-circle_logomark-blue.png │ ├── odi-square_logomark-blue.svg │ └── partition-pruning.png ├── index.md ├── infra │ ├── architecture.md │ ├── cloud-infrastructure.md │ └── snowflake.md ├── learning │ ├── cloud-data-warehouses.md │ ├── dbt.md │ ├── git.md │ ├── glossary.md │ ├── naming-conventions.md │ └── security.md ├── setup │ ├── dbt-setup.md │ ├── fivetran-setup.md │ ├── project-teardown.md │ ├── repo-setup.md │ ├── sentinel-setup.md │ ├── snowflake-service-accounts.md │ ├── snowflake-setup.md │ └── terraform-project-setup.md ├── static │ ├── AccessFootprintsArcPro.pdf │ └── Download.MS.Global.Footprints.zip └── stylesheets │ └── extra.css ├── images ├── Dockerfile └── environment.yml ├── jobs ├── __init__.py ├── geo │ ├── __init__.py │ ├── data │ │ └── california.geojson │ ├── load_global_ml_building_footprints.py │ ├── load_us_building_footprints.py │ ├── tiger.py │ └── write_building_footprints.py ├── test.py └── utils │ ├── __init__.py │ └── snowflake.py ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml ├── terraform ├── .gitignore ├── aws │ ├── README.md │ ├── environments │ │ └── dev │ │ │ ├── .terraform.lock.hcl │ │ │ ├── dse-infra-dev.tfbackend │ │ │ ├── main.tf │ │ │ └── remote-state │ │ │ ├── .terraform.lock.hcl │ │ │ ├── main.tf │ │ │ └── terraform.tfvars │ └── modules │ │ └── infra │ │ ├── airflow.tf │ │ ├── batch.tf │ │ ├── ecr.tf │ │ ├── iam.tf │ │ ├── main.tf │ │ ├── network.tf │ │ ├── outputs.tf │ │ ├── s3.tf │ │ ├── secrets.tf │ │ └── variables.tf ├── s3-remote-state │ ├── README.md │ └── main.tf └── snowflake │ ├── environments │ ├── dev │ │ ├── .terraform.lock.hcl │ │ ├── 
dse-snowflake-dev.tfbackend │ │ ├── main.tf │ │ ├── remote-state │ │ │ ├── .terraform.lock.hcl │ │ │ ├── main.tf │ │ │ └── terraform.tfvars │ │ └── terraform.tfvars │ └── prd │ │ ├── .terraform.lock.hcl │ │ ├── dse-snowflake-prd.tfbackend │ │ ├── main.tf │ │ ├── remote-state │ │ ├── .terraform.lock.hcl │ │ ├── main.tf │ │ └── terraform.tfvars │ │ └── terraform.tfvars │ └── modules │ ├── database │ ├── main.tf │ ├── outputs.tf │ └── variables.tf │ ├── elt │ ├── databases.tf │ ├── main.tf │ ├── roles.tf │ ├── users.tf │ ├── variables.tf │ └── warehouses.tf │ └── warehouse │ ├── main.tf │ ├── outputs.tf │ └── variables.tf └── transform ├── .gitignore ├── .sqlfluff ├── .sqlfluffignore ├── README.md ├── analyses └── .gitkeep ├── dbt_project.yml ├── macros ├── .gitkeep ├── _macros.yml ├── get_custom_schema.sql ├── map_class_fp.sql └── spatial_join_with_deduplication.sql ├── models ├── intermediate │ ├── snowflake_cost_tracking │ │ ├── _snowflake_cost_tracking.yml │ │ ├── int_automatic_clustering_history.sql │ │ ├── int_cortex_usage_daily_history.sql │ │ ├── int_materialized_view_refresh_history.sql │ │ ├── int_pipe_usage_history.sql │ │ ├── int_storage_daily_history.sql │ │ └── int_warehouse_metering_history.sql │ └── state_entities │ │ ├── _int_state_entities__models.yml │ │ ├── int_state_entities__active.sql │ │ ├── int_state_entities__budgets.sql │ │ └── int_state_entities__technical.sql ├── marts │ ├── geo_reference │ │ ├── _geo_reference__models.yml │ │ ├── geo_reference__global_ml_building_footprints_with_tiger.sql │ │ └── geo_reference__us_building_footprints_with_tiger.sql │ ├── snowflake_cost_tracking │ │ ├── _snowflake_cost_tracking.yml │ │ └── snowflake_costs_by_date.sql │ └── state_entities │ │ ├── _state_entities__models.yml │ │ └── dim_state_entities__agencies.sql ├── overview.md └── staging │ ├── department_of_finance │ ├── _department_of_finance__models.yml │ ├── stg_department_of_finance__entities.sql │ └── stg_ebudget__budgets.sql │ └── snowflake_cost_tracking │ ├── _snowflake_cost_tracking__models.yml │ ├── stg_automatic_clustering_history.sql │ ├── stg_cortex_usage_daily_history.sql │ ├── stg_data_transfer_history.sql │ ├── stg_database_storage_usage_history.sql │ ├── stg_materialized_view_refresh_history.sql │ ├── stg_metering_daily_history.sql │ ├── stg_pipe_usage_history.sql │ ├── stg_stage_storage_usage_history.sql │ ├── stg_storage_daily_history.sql │ └── stg_warehouse_metering_history.sql ├── package-lock.yml ├── packages.yml ├── seeds └── .gitkeep ├── snapshots └── .gitkeep └── tests └── .gitkeep /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dbt", 3 | "image": "mcr.microsoft.com/devcontainers/python:1-3.10-bookworm", 4 | "features": { 5 | "ghcr.io/devcontainers-contrib/features/poetry:2": {}, 6 | "ghcr.io/devcontainers/features/terraform:1": {} 7 | }, 8 | "customizations": { 9 | "vscode": { 10 | "settings": { 11 | "dbt.queryLimit": 50, 12 | "git.allowNoVerifyCommit": true, 13 | "python.defaultInterpreterPath": ".venv/bin/python" 14 | }, 15 | "extensions": [ 16 | "innoverio.vscode-dbt-power-user" 17 | ] 18 | } 19 | }, 20 | "secrets": { 21 | "SNOWFLAKE_USER": { 22 | "description": "Your Snowflake Username" 23 | }, 24 | "SNOWFLAKE_PASSWORD": { 25 | "description": "Your Snowflake Password" 26 | }, 27 | "DBT_SCHEMA": { 28 | "description": "The dev schema into which to build your dbt models (e.g. 
DBT_)" 29 | } 30 | }, 31 | "postCreateCommand": "poetry config virtualenvs.in-project true && poetry install && poetry run pre-commit install && mkdir -p ~/.dbt && cp .devcontainer/profiles.yml ~/.dbt" 32 | } 33 | -------------------------------------------------------------------------------- /.devcontainer/profiles.yml: -------------------------------------------------------------------------------- 1 | dse_snowflake: 2 | target: snowflake_dev 3 | outputs: 4 | snowflake_dev: 5 | type: snowflake 6 | account: heb41095 7 | authenticator: username_password_mfa 8 | user: "{{ env_var('SNOWFLAKE_USER') }}" 9 | password: "{{ env_var('SNOWFLAKE_PASSWORD') }}" 10 | role: TRANSFORMER_DEV 11 | warehouse: TRANSFORMING_XS_DEV 12 | database: TRANSFORM_DEV 13 | schema: "{{ env_var('DBT_SCHEMA') }}" 14 | threads: 4 15 | -------------------------------------------------------------------------------- /.github/workflows/build-image.yml: -------------------------------------------------------------------------------- 1 | name: Build Image 2 | 3 | on: push 4 | 5 | jobs: 6 | build-image: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Check out repository 10 | uses: actions/checkout@v3 11 | 12 | - name: Set up Docker Buildx 13 | uses: docker/setup-buildx-action@v2 14 | 15 | - name: Configure AWS Credentials 16 | uses: aws-actions/configure-aws-credentials@v2 17 | with: 18 | # TODO: use OIDC for auth: 19 | # https://github.com/aws-actions/configure-aws-credentials#assuming-a-role 20 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 21 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 22 | aws-region: us-west-2 23 | 24 | - name: Login to Amazon ECR 25 | id: login-ecr 26 | uses: aws-actions/amazon-ecr-login@v1 27 | 28 | - name: Build, tag, and push docker image to Amazon ECR 29 | uses: docker/build-push-action@v4 30 | env: 31 | REGISTRY: ${{ steps.login-ecr.outputs.registry }} 32 | REPOSITORY: dse-infra-dev-us-west-2-default 33 | IMAGE_TAG: ${{ github.ref == 'refs/heads/main' && 'latest' || 'test' }} 34 | with: 35 | push: true 36 | context: "." 
37 | file: "./images/Dockerfile" 38 | tags: ${{ env.REGISTRY }}/${{ env.REPOSITORY }}:${{ env.IMAGE_TAG }} 39 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | pull_request: 4 | push: 5 | branches: [main] 6 | 7 | permissions: 8 | contents: write 9 | 10 | env: 11 | DBT_PROFILES_DIR: ci 12 | SNOWFLAKE_PRIVATE_KEY: ${{ SECRETS.SNOWFLAKE_PRIVATE_KEY_DEV }} 13 | SNOWFLAKE_USER: GITHUB_ACTIONS_SVC_USER_DEV 14 | 15 | jobs: 16 | build-docs: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | - uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.10" 23 | - uses: snok/install-poetry@v1 24 | with: 25 | virtualenvs-path: .venv 26 | - name: Load cached venv 27 | id: cached-poetry-dependencies 28 | uses: actions/cache@v4 29 | with: 30 | path: .venv 31 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 32 | - name: Install dependencies 33 | run: poetry install --no-interaction --no-root 34 | - name: Build dbt docs 35 | run: | 36 | # Generate snowflake dbt docs 37 | poetry run dbt deps --project-dir=transform 38 | poetry run dbt docs generate --project-dir=transform 39 | cp -r transform/target docs/dbt_docs_snowflake 40 | - name: Deploy docs to GitHub Pages 41 | if: github.ref == 'refs/heads/main' 42 | run: poetry run mkdocs gh-deploy --force 43 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | env: 10 | DBT_PROFILES_DIR: ci 11 | SNOWFLAKE_PRIVATE_KEY: ${{ SECRETS.SNOWFLAKE_PRIVATE_KEY_DEV }} 12 | SNOWFLAKE_USER: GITHUB_ACTIONS_SVC_USER_DEV 13 | 14 | jobs: 15 | pre-commit: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.10" 22 | - uses: snok/install-poetry@v1 23 | with: 24 | virtualenvs-path: .venv 25 | - name: Load cached venv 26 | id: cached-poetry-dependencies 27 | uses: actions/cache@v4 28 | with: 29 | path: .venv 30 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 31 | - name: Install dependencies 32 | run: poetry install --no-interaction --no-root 33 | - name: Install dbt deps 34 | run: poetry run dbt deps --project-dir transform 35 | - uses: pre-commit/action@v3.0.0 36 | -------------------------------------------------------------------------------- /.github/workflows/submit-job.yml: -------------------------------------------------------------------------------- 1 | name: submit 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "23 12 * * *" 7 | 8 | jobs: 9 | submit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out repository 13 | uses: actions/checkout@v3 14 | 15 | - name: Configure AWS Credentials 16 | uses: aws-actions/configure-aws-credentials@v2 17 | with: 18 | # TODO: use OIDC for auth: 19 | # https://github.com/aws-actions/configure-aws-credentials#assuming-a-role 20 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 21 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 22 | aws-region: us-west-2 23 | - name: Submit batch job 24 | run: | 25 | aws batch submit-job \ 26 | --job-name test \ 
27 | --job-queue dse-infra-dev-us-west-2-default \ 28 | --job-definition dse-infra-dev-us-west-2-latest \ 29 | --container-overrides '{ 30 | "resourceRequirements": 31 | [{"value": "2", "type": "VCPU"}, {"value": "4096", "type": "MEMORY"}] 32 | }' 33 | -------------------------------------------------------------------------------- /.github/workflows/terraform-validation.yml: -------------------------------------------------------------------------------- 1 | name: terraform-validation 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | terraform-validation: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - name: Setup terraform 15 | uses: hashicorp/setup-terraform@v2 16 | with: 17 | terraform_version: v1.4.0 18 | - name: Install tflint 19 | run: | 20 | curl -s https://raw.githubusercontent.com/terraform-linters/\ 21 | tflint/master/install_linux.sh | bash 22 | 23 | - name: Run terraform fmt 24 | run: | 25 | terraform fmt 26 | - name: Run terraform validate 27 | run: | 28 | terraform validate 29 | - name: Run terraform tflint 30 | run: | 31 | tflint --chdir=terraform/ --recursive 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | docs/dbt_docs* 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | .DS_Store 165 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: check-yaml 6 | args: 7 | - --unsafe 8 | - id: check-json 9 | - id: end-of-file-fixer 10 | - id: trailing-whitespace 11 | - id: check-merge-conflict 12 | - id: detect-aws-credentials 13 | args: [--allow-missing-credentials] 14 | - repo: https://github.com/charliermarsh/ruff-pre-commit 15 | rev: v0.1.6 16 | hooks: 17 | - id: ruff 18 | args: [--show-source, --fix] 19 | - id: ruff-format 20 | - repo: https://github.com/codespell-project/codespell 21 | rev: v2.2.4 22 | hooks: 23 | - id: codespell 24 | types_or: [rst, markdown] 25 | files: docs 26 | - repo: https://github.com/pre-commit/mirrors-mypy 27 | rev: v1.1.1 28 | hooks: 29 | - id: mypy 30 | args: [--warn-unused-configs] 31 | additional_dependencies: 32 | # Type stubs 33 | - pandas-stubs==v1.5.3.230321 34 | - types-requests 35 | - numpy 36 | - repo: https://github.com/pre-commit/mirrors-prettier 37 | rev: v2.7.1 38 | hooks: 39 | - id: prettier 40 | types: [yaml] 41 | - repo: https://github.com/adrienverge/yamllint.git 42 | rev: v1.28.0 43 | hooks: 44 | - id: yamllint 45 | args: [] 46 | # Note: for SQLFluff we don't use the default pre-commit hook because 47 | # the pre-commit managed python environment can be difficult to install, 48 | # especially due to issues with pyarrow being brought in by Snowflake. 49 | # This keep things more predictable by using the poetry.lock environment. 
50 | - repo: local 51 | hooks: 52 | - id: sqlfluff 53 | name: sqlfluff 54 | language: system 55 | description: "Lints sql files with `SQLFluff`" 56 | types: [sql] 57 | require_serial: true 58 | entry: poetry run sqlfluff fix --show-lint-violations --nocolor --disable-progress-bar 59 | pass_filenames: true 60 | -------------------------------------------------------------------------------- /.prettierrc.yml: -------------------------------------------------------------------------------- 1 | endOfLine: auto 2 | proseWrap: "preserve" 3 | -------------------------------------------------------------------------------- /.sqlfluff: -------------------------------------------------------------------------------- 1 | [sqlfluff] 2 | # For some reason this can only be set in the root directory, cf 3 | # https://docs.sqlfluff.com/en/stable/configuration.html#nesting. 4 | # Other config parameters are set in the dbt project directory, as 5 | # that's where dbt cloud looks for them. 6 | templater = dbt 7 | 8 | [sqlfluff:templater:dbt] 9 | project_dir = ./transform 10 | -------------------------------------------------------------------------------- /.terraform-docs.yml: -------------------------------------------------------------------------------- 1 | formatter: markdown table 2 | recursive: 3 | enabled: false 4 | output: 5 | file: README.md 6 | mode: inject 7 | sort: 8 | enabled: true 9 | by: name 10 | -------------------------------------------------------------------------------- /.tflint.hcl: -------------------------------------------------------------------------------- 1 | plugin "aws" { 2 | enabled = true 3 | version = "0.22.1" 4 | source = "github.com/terraform-linters/tflint-ruleset-aws" 5 | } 6 | -------------------------------------------------------------------------------- /.yamllint: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | rules: 4 | document-start: disable 5 | line-length: {max: 120} 6 | quoted-strings: 7 | quote-type: double 8 | required: false 9 | truthy: 10 | allowed-values: ["true", "false"] 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Office of Data & Innovation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CalData Data Services and Engineering Infrastructure 2 | 3 | ![deploy](https://github.com/cagov/data-infrastructure/actions/workflows/deploy.yml/badge.svg?branch=main) 4 | ![docs](https://github.com/cagov/data-infrastructure/actions/workflows/docs.yml/badge.svg?branch=main) 5 | 6 | Documentation for this project can be found [here](https://cagov.github.io/data-infrastructure/). 7 | -------------------------------------------------------------------------------- /airflow/.gitignore: -------------------------------------------------------------------------------- 1 | aws-mwaa-local-runner 2 | -------------------------------------------------------------------------------- /airflow/aws_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python script to run airflow CLI commands in a MWAA environment. 3 | 4 | Adapted from sample code here: 5 | https://docs.aws.amazon.com/mwaa/latest/userguide/airflow-cli-command-reference.html#airflow-cli-command-examples 6 | """ 7 | import base64 8 | import json 9 | import sys 10 | 11 | import boto3 12 | import requests 13 | 14 | mwaa_env_name = "dse-infra-dev-us-west-2-mwaa-environment" 15 | 16 | client = boto3.client("mwaa") 17 | 18 | mwaa_cli_token = client.create_cli_token(Name=mwaa_env_name) 19 | 20 | mwaa_auth_token = "Bearer " + mwaa_cli_token["CliToken"] 21 | mwaa_webserver_hostname = f"https://{mwaa_cli_token['WebServerHostname']}/aws_mwaa/cli" 22 | raw_data = " ".join(sys.argv[1:]) 23 | 24 | mwaa_response = requests.post( 25 | mwaa_webserver_hostname, 26 | headers={"Authorization": mwaa_auth_token, "Content-Type": "text/plain"}, 27 | data=raw_data, 28 | ) 29 | 30 | print(mwaa_response.status_code) 31 | try: 32 | mwaa_std_err_message = base64.b64decode(mwaa_response.json()["stderr"]).decode( 33 | "utf8" 34 | ) 35 | mwaa_std_out_message = base64.b64decode(mwaa_response.json()["stdout"]).decode( 36 | "utf8" 37 | ) 38 | print(mwaa_std_err_message) 39 | print(mwaa_std_out_message) 40 | except json.decoder.JSONDecodeError: 41 | print(mwaa_response.text) 42 | -------------------------------------------------------------------------------- /airflow/dags/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Package for shared utility functions and classes 2 | -------------------------------------------------------------------------------- /airflow/dags/common/defaults.py: -------------------------------------------------------------------------------- 1 | """Shared default arguments for DAGs.""" 2 | from __future__ import annotations 3 | 4 | from datetime import timedelta 5 | from typing import Any 6 | 7 | from common.slack import post_to_slack_on_failure 8 | 9 | DEFAULT_ARGS: dict[str, Any] = { 10 | "owner": "CalData", 11 | "depends_on_past": False, 12 | "email": ["odi-caldata-dse@innovation.ca.gov"], 13 | "email_on_failure": False, 14 | "email_on_retry": False, 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": post_to_slack_on_failure, 18 | } 19 | -------------------------------------------------------------------------------- /airflow/dags/common/requests.py: -------------------------------------------------------------------------------- 1 | """Utilities for making HTTP requests.""" 2 | 3 | import backoff 4 | import requests 5 | 6 | 7 
| @backoff.on_exception( 8 | backoff.expo, 9 | requests.exceptions.RequestException, 10 | max_time=30, 11 | max_tries=4, 12 | ) 13 | def get(url): 14 | return requests.get(url) 15 | -------------------------------------------------------------------------------- /airflow/dags/common/slack.py: -------------------------------------------------------------------------------- 1 | from airflow.providers.slack.hooks.slack_webhook import SlackWebhookHook 2 | 3 | 4 | def post_to_slack_on_failure(context): 5 | hook = SlackWebhookHook( 6 | slack_webhook_conn_id="caldata-dataservices-bot-notifications" 7 | ) 8 | msg = f""" 9 | :x: Task Failed. 10 | *Task*: {context.get('task_instance').task_id} 11 | *Dag*: {context.get('task_instance').dag_id} 12 | *Execution Time*: {context.get('execution_date')} 13 | <{context.get('task_instance').log_url}|*Logs*> 14 | """ 15 | hook.send_text(msg) 16 | -------------------------------------------------------------------------------- /airflow/dags/geo_reference/load_building_footprints.py: -------------------------------------------------------------------------------- 1 | """Load building footprints to Snowflake.""" 2 | from __future__ import annotations 3 | 4 | import os 5 | from datetime import datetime 6 | 7 | from common.defaults import DEFAULT_ARGS 8 | 9 | from airflow.decorators import dag 10 | from airflow.providers.amazon.aws.operators.batch import BatchOperator 11 | from airflow.providers.amazon.aws.sensors.batch import BatchSensor 12 | from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator 13 | 14 | 15 | def _construct_batch_args(name: str, command: list[str]) -> dict: 16 | return { 17 | "task_id": name, 18 | "job_name": name, 19 | "job_queue": os.environ["AIRFLOW__CUSTOM__DEFAULT_JOB_QUEUE"], 20 | "job_definition": os.environ["AIRFLOW__CUSTOM__DEFAULT_JOB_DEFINITION"], 21 | "overrides": { 22 | "command": command, 23 | "resourceRequirements": [ 24 | {"type": "VCPU", "value": "8"}, 25 | {"type": "MEMORY", "value": "32768"}, 26 | ], 27 | }, 28 | "region_name": "us-west-2", # TODO: can we make this unnecessary? 29 | } 30 | 31 | 32 | @dag( 33 | description="Test DAG", 34 | start_date=datetime(2023, 5, 23), 35 | schedule_interval="@monthly", 36 | default_args=DEFAULT_ARGS, 37 | catchup=False, 38 | ) 39 | def building_footprints_dag(): 40 | """DAG for loading MS Building footprints dataset.""" 41 | load_us_footprints = BatchOperator( 42 | **_construct_batch_args( 43 | name="load_us_building_footprints", 44 | command=["python", "-m", "jobs.geo.load_us_building_footprints"], 45 | ) 46 | ) 47 | wait_for_us_footprints_load = BatchSensor( 48 | task_id="wait_for_us_footprints_load", 49 | job_id=load_us_footprints.output, 50 | region_name="us-west-2", # TODO: can we make this unnecessary? 51 | ) 52 | 53 | load_global_ml_footprints = BatchOperator( 54 | **_construct_batch_args( 55 | name="load_global_ml_building_footprints", 56 | command=["python", "-m", "jobs.geo.load_global_ml_building_footprints"], 57 | ) 58 | ) 59 | wait_for_global_ml_footprints_load = BatchSensor( 60 | task_id="wait_for_global_ml_footprints_load", 61 | job_id=load_global_ml_footprints.output, 62 | region_name="us-west-2", # TODO: can we make this unnecessary? 
63 | ) 64 | 65 | run_dbt_cloud_job = DbtCloudRunJobOperator( 66 | job_id=14, 67 | task_id="run_dbt_cloud_job", 68 | dbt_cloud_conn_id="dbt_cloud_default", 69 | wait_for_termination=True, 70 | timeout=1800, 71 | ) 72 | 73 | run_dbt_cloud_job.set_upstream(wait_for_us_footprints_load) 74 | run_dbt_cloud_job.set_upstream(wait_for_global_ml_footprints_load) 75 | 76 | unload_us_footprints = BatchOperator( 77 | **_construct_batch_args( 78 | name="unload_us_building_footprints", 79 | command=["python", "-m", "jobs.geo.write_building_footprints", "us"], 80 | ) 81 | ) 82 | _ = BatchSensor( 83 | task_id="wait_for_us_footprints_unload", 84 | job_id=unload_us_footprints.output, 85 | region_name="us-west-2", # TODO: can we make this unnecessary? 86 | ) 87 | 88 | unload_us_footprints.set_upstream(run_dbt_cloud_job) 89 | 90 | unload_global_ml_footprints = BatchOperator( 91 | **_construct_batch_args( 92 | name="unload_global_ml_building_footprints", 93 | command=["python", "-m", "jobs.geo.write_building_footprints", "global_ml"], 94 | ) 95 | ) 96 | _ = BatchSensor( 97 | task_id="wait_for_global_ml_footprints_unload", 98 | job_id=unload_global_ml_footprints.output, 99 | region_name="us-west-2", # TODO: can we make this unnecessary? 100 | ) 101 | 102 | unload_global_ml_footprints.set_upstream(run_dbt_cloud_job) 103 | 104 | 105 | run = building_footprints_dag() 106 | -------------------------------------------------------------------------------- /airflow/dags/state_entities/base_entities.py: -------------------------------------------------------------------------------- 1 | """Load state entities list from department of finance.""" 2 | from __future__ import annotations 3 | 4 | import io 5 | import re 6 | from datetime import datetime 7 | 8 | import pandas 9 | import requests 10 | from common.defaults import DEFAULT_ARGS 11 | from snowflake.connector.pandas_tools import write_pandas 12 | 13 | from airflow.decorators import dag, task 14 | from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook 15 | 16 | GBQ_DATASET = "state_entities" 17 | LEVEL_LABELS = ["A", "B", "1", "2", "3"] 18 | DATA_URL = ( 19 | "https://dof.ca.gov/wp-content/uploads/sites/352/Accounting/" 20 | "Policies_and_Procedures/Uniform_Codes_Manual/3orgstruc.pdf" 21 | ) 22 | 23 | 24 | def clean_name(name: str) -> str: 25 | """Strip leading/trailing whitespace and replace repeated spaces with single spaces.""" 26 | return re.sub(" {2,}", " ", name.strip()) 27 | 28 | 29 | @task 30 | def load_data() -> None: 31 | """### Load Department of Finance State Entities data.""" 32 | import pdfplumber 33 | 34 | hook = SnowflakeHook(snowflake_conn_id="raw") 35 | conn = hook.get_conn() 36 | 37 | # Regexes matching frontmatter and other lines we should skip 38 | skip = [ 39 | # Just white space 40 | r"^\s*$", 41 | # Header material 42 | r"REVISED(\s+)(\w+)(\s+)(\d+)", 43 | r"(\s*)DEPARTMENT(\s+)OF(\s+)FINANCE(\s*)", 44 | r"(\s*)UNIFORM(\s+)CODES(\s+)MANUAL(\s*)", 45 | r"(\s*)ORGANIZATION(\s+)CODES(\s*)", 46 | r"(\s*)BY(\s+)STRUCTURE(\s*)", 47 | # Column headers 48 | r"(\s*)A(_+)(\s+)B(_+)(\s+)1(_+)(\s*)", 49 | # Page number 50 | r"^(\s*)(\d+)(\s*)$", 51 | ] 52 | 53 | skip_re = re.compile("|".join(skip), flags=re.IGNORECASE) 54 | entity_re = re.compile(r"^( *)(\d+)\s+(.+)$") 55 | 56 | r = requests.get(DATA_URL) 57 | f = io.BytesIO(r.content) 58 | pdf = pdfplumber.open(f) # type: ignore 59 | 60 | levels: list[str | None] = [ 61 | None, 62 | ] * len(LEVEL_LABELS) 63 | indent = None 64 | ts = 5 65 | entities: list[tuple[str | None, ...]] = [] 66 | 
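    # Walk the PDF page by page. Entity rows look roughly like
    # "   1110  Department of Something": `entity_re` captures the leading
    # whitespace, the numeric org code, and the name, while `skip_re` filters
    # out headers, column labels, and page numbers. The amount of leading
    # whitespace encodes each entity's depth in the hierarchy (about `ts`
    # extra spaces per level), which is used to index into `levels`.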
67 | for page in pdf.pages: 68 | lines = page.extract_text(layout=True).split("\n") 69 | print(page) 70 | for line in lines: 71 | if skip_re.search(line): 72 | continue 73 | 74 | match = entity_re.match(line) 75 | if match is None: 76 | print( 77 | f'Unable to parse line "{clean_name(line)}", assigning to previous name' 78 | ) 79 | revised = list(entities[-1]) 80 | revised[-1] = revised[5] + " " + clean_name(line) # type: ignore 81 | entities[-1] = tuple(revised) 82 | continue 83 | 84 | # Get the raw matches 85 | spaces, code, name = match.groups() 86 | 87 | # Set the top-level indentation 88 | if indent is None: 89 | indent = len(spaces) 90 | 91 | # Strip excess whitespace from the name 92 | name = clean_name(name) 93 | 94 | # Get the level number from the whitespace 😬 95 | level_n = (len(spaces) - indent) // ts 96 | assert level_n <= len(LEVEL_LABELS) - 1 97 | 98 | # Fill the levels, null out everything after the current level 99 | levels[level_n] = code 100 | levels[level_n + 1 :] = [None] * (len(LEVEL_LABELS) - level_n - 1) 101 | 102 | entities.append((*levels, name)) 103 | 104 | df = ( 105 | pandas.DataFrame.from_records(entities, columns=[*LEVEL_LABELS, "name"]) 106 | .astype("string[python]") # type: ignore 107 | .rename(columns={"1": "L1", "2": "L2", "3": "L3"}) 108 | ) 109 | 110 | DB = conn.database 111 | SCHEMA = "STATE_ENTITIES" 112 | conn.cursor().execute(f"CREATE SCHEMA IF NOT EXISTS {DB}.{SCHEMA}") 113 | 114 | write_pandas( 115 | conn, 116 | df, 117 | database=DB, 118 | schema=SCHEMA, 119 | table_name="BASE_ENTITIES", 120 | auto_create_table=True, 121 | overwrite=True, 122 | ) 123 | 124 | 125 | @dag( 126 | description="Load department of finance state entities list", 127 | start_date=datetime(2022, 12, 19), 128 | schedule_interval="@monthly", 129 | default_args=DEFAULT_ARGS, 130 | catchup=False, 131 | ) 132 | def load_department_of_finance_state_entities(): 133 | load_data() 134 | 135 | 136 | run = load_department_of_finance_state_entities() 137 | -------------------------------------------------------------------------------- /airflow/dags/state_entities/budgets.py: -------------------------------------------------------------------------------- 1 | """Load state entity budgets from ebudget site.""" 2 | from __future__ import annotations 3 | 4 | import re 5 | from datetime import datetime 6 | 7 | import pandas 8 | from common.defaults import DEFAULT_ARGS 9 | from common.requests import get 10 | from snowflake.connector.pandas_tools import write_pandas 11 | 12 | from airflow.decorators import dag, task 13 | from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook 14 | 15 | PREFIX = "https://ebudget.ca.gov/budget/publication/admin" 16 | 17 | 18 | def camel_to_snake(s: str) -> str: 19 | """ 20 | Convert a camel-cased name to a snake-cased one. 21 | 22 | Snake-cased names are more appropriate for case-insensitive systems like 23 | data warehouse backends. 24 | """ 25 | return re.sub(r"(? 
2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /docs/images/partition-pruning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/docs/images/partition-pruning.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # CalData Data Services and Engineering Infrastructure 2 | 3 | This is the technical documentation for CalData's 4 | Data Services and Engineering (DSE) projects. 5 | It consists of processes, conventions, instructions, and architecture diagrams. 6 | -------------------------------------------------------------------------------- /docs/infra/cloud-infrastructure.md: -------------------------------------------------------------------------------- 1 | # Cloud infrastructure 2 | 3 | The DSE team [uses Terraform](../code/terraform-local-setup.md) to manage cloud infrastructure. 4 | Our stack includes: 5 | 6 | * An [AWS Batch](https://aws.amazon.com/batch/) environment for running arbitrary containerized jobs 7 | * A [Managed Workflows on Apache Airflow](https://aws.amazon.com/managed-workflows-for-apache-airflow/) environment for orchestrating jobs 8 | * A VPC and subnets for the above 9 | * An ECR repository for hosting Docker images storing code and libraries for jobs 10 | * A bot user for running AWS operations in GitHub Actions 11 | * An S3 scratch bucket 12 | 13 | ## Architecture 14 | 15 | ```mermaid 16 | flowchart TD 17 | subgraph AWS 18 | J[GitHub CD\nbot user] 19 | G[Artifact in S3] 20 | subgraph VPC 21 | subgraph Managed Airflow 22 | K1[Scheduler] 23 | K2[Worker] 24 | K3[Webserver] 25 | end 26 | F[AWS Batch Job\n on Fargate] 27 | end 28 | E[AWS ECR Docker\nRepository] 29 | end 30 | subgraph GitHub 31 | A[Code Repository] 32 | end 33 | E --> F 34 | A -- Code quality check\n GitHub action --> A 35 | A -- Job submission\nvia GitHub Action --> F 36 | A -- Docker build \nGitHub Action --> E 37 | A --> H[CalData\nadministrative\nuser] 38 | H -- Terraform -----> AWS 39 | K2 -- Job submission\nvia Airflow --> F 40 | K1 <--> K2 41 | K3 <--> K1 42 | K3 <--> K2 43 | F --> G 44 | J -- Bot Credentials --> A 45 | ``` 46 | -------------------------------------------------------------------------------- /docs/learning/dbt.md: -------------------------------------------------------------------------------- 1 | # dbt 2 | 3 | Many CalData projects use [dbt](https://www.getdbt.com/) 4 | for transforming and modeling data within our cloud data warehouses. 5 | dbt has become extremely popular over the last several years, 6 | popularizing the practice and position of "analytics engineering". 
7 | It has a number of features that makes it valuable for data stacks: 8 | 9 | * It works well with version control 10 | * It encourages modular, reusable SQL code 11 | * It makes it easier to track data lineage as it flows through your data warehouse 12 | * It has a large, active community with which you can share tips and techniques 13 | 14 | ## Learning dbt 15 | 16 | dbt provides a series of [free courses](https://courses.getdbt.com/collections) 17 | for learning how to use the project: 18 | 19 | * [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) 20 | * [Jinja, Macros, and Packages](https://courses.getdbt.com/courses/jinja-macros-packages) 21 | * [Advanced Materializations](https://courses.getdbt.com/courses/advanced-materializations) 22 | * [Refactoring SQL for Modularity](https://courses.getdbt.com/courses/refactoring-sql-for-modularity) 23 | -------------------------------------------------------------------------------- /docs/setup/dbt-setup.md: -------------------------------------------------------------------------------- 1 | # dbt project setup 2 | 3 | To set up a new project on dbt Cloud follow these steps: 4 | 5 | 1. Give your new project a name. 6 | 1. Click _Advanced settings_ and in the _Project subdirectory_ field, enter "transform" 7 | 1. Select a data warehouse connection. (e.g. Snowflake, BigQuery, Redshift) 8 | 1. For the _Development credentials_ section you'll want to choose between Snowflake OAuth or Key pair. In general, Snowflake OAuth is preferred for human users (which is what the development environment is for). It is also an enterprise dbt Cloud feature, so if working with a standard account, you'll need to use key pair. 9 | 10 | 1. For Snowflake OAuth: 11 | 12 | 1. Follow dbt's instructions for set up [here](https://docs.getdbt.com/docs/cloud/manage-access/set-up-snowflake-oauth) 13 | 14 | 1. For Key pair: 15 | 1. Under _Auth method_ select _Key pair_ 16 | 1. Enter your data warehouse username 17 | 1. Enter the private key and private key passphrase 18 | 1. For more guidance, read [dbt's docs on connecting to Snowflake via key pair](https://docs.getdbt.com/docs/cloud/connect-data-platform/connect-snowflake#key-pair) 19 | 20 | 1. Finally click the _Test Connection_ button. 21 | 1. Connect the appropriate repository. Read [dbt's docs on connecting to GitHub](https://docs.getdbt.com/docs/cloud/git/connect-github) or [dbt's docs on connecting to Azure DevOps](https://docs.getdbt.com/docs/cloud/git/setup-azure#register-an-azure-ad-app) and [Microsoft's docs on creating branch policies in DevOps](https://learn.microsoft.com/en-us/azure/devops/repos/git/pr-status-policy?view=azure-devops). To integrate dbtCloud with Azure DevOps, the service user (legacy) option must be used. Complete the steps found in the [documentation](https://docs.getdbt.com/docs/cloud/git/setup-azure#register-an-azure-ad-app). 22 | 23 | Once you're through the first five steps you can return to the dbt homepage and click the Settings button in the upper right corner. From there you can follow the steps to configure three environments for Continuous integration - CI, development, and production. Read [dbt's docs on CI in dbt Cloud](https://docs.getdbt.com/docs/deploy/continuous-integration). Read [dbt's docs on creating production (deployment) environments](https://docs.getdbt.com/docs/deploy/deploy-environments) and [dbt's docs on creating and scheduling deploy jobs](https://docs.getdbt.com/docs/deploy/deploy-jobs#create-and-schedule-jobs). 
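For reference, the same key pair credentials map directly onto a local `profiles.yml` target for dbt Core development. Below is a minimal sketch, assuming key pair auth and reusing the role/warehouse/database names from the `.devcontainer/profiles.yml` earlier in this repository (treat them and the account locator as placeholders for your own project):

```yaml
my_project:
  target: snowflake_dev
  outputs:
    snowflake_dev:
      type: snowflake
      account: "<account locator>" # placeholder
      user: "{{ env_var('SNOWFLAKE_USER') }}"
      role: TRANSFORMER_DEV
      warehouse: TRANSFORMING_XS_DEV
      database: TRANSFORM_DEV
      schema: "{{ env_var('DBT_SCHEMA') }}"
      private_key_path: "{{ env_var('SNOWFLAKE_PRIVATE_KEY_PATH') }}"
      private_key_passphrase: "{{ env_var('SNOWFLAKE_PRIVATE_KEY_PASSPHRASE') }}"
      threads: 4
```

dbt Cloud manages its own connections, so a file like this is only needed for local development or CI.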
24 | 25 | You'll also want to [configure notifications for job failures](https://docs.getdbt.com/docs/deploy/job-notifications). 26 | 27 | Pictured below is an example of environment variables you can set for each environment. For more guidance, read [dbt's docs on environment variables](https://docs.getdbt.com/docs/build/environment-variables). 28 | 29 | ![environment variables](../images/environment_variables.png) 30 | -------------------------------------------------------------------------------- /docs/setup/fivetran-setup.md: -------------------------------------------------------------------------------- 1 | # Fivetran project setup 2 | 3 | To set up a new project in Fivetran follow these steps: 4 | 5 | 1. First, ensure you have met the following pre-requisites: 6 | - You have set up a Snowflake Account for the project (follow all instructions from [here](./snowflake-setup.md)) 7 | - Ensure that your Snowflake project has a **LOADER_PRD** role with privileges to write data to the **RAW_PRD** database 8 | - You have created a Snowflake User called **FIVETRAN_SVC_USER_PRD** and ensured this user has the **LOADER_PRD** role 9 | - You have set up an auth key pair for this user and saved it to the ODI OnePass account 10 | 11 | 2. In Fivetran, navigate to Organization -> Accounts 12 | 3. Click _Add Acount_ 13 | 4. Choose an Account Name, select _Enterprise_ for Account Tier and _No restrictions_ for Required Authentication Type 14 | 5. Next, navigate to Destinations 15 | 6. Search for **Snowflake** and click _Select_ 16 | 7. To set up the Snowflake connector: 17 | 1. Name the destination **RAW_PRD** 18 | 2. Add the Snowflake URL for your project as the _Host_ 19 | 3. Add **FIVETRAN_SVC_USER_PRD** as the _User_ 20 | 4. Add **RAW_PRD** as the _Database_ 21 | 5. For _Auth_ select **KEY_PAIR** and enter the key pair details for the FIVETRAN_SVC_USER_PRD user 22 | 6. Add **LOADER_PRD** as the _Role_ 23 | 7. Optional: Most of the time, the cloud provider and region don't matter, but if a client is operating in a particular cloud/region and wants to minimize data transfer, it makes sense to select the client's _Cloud service provider_, _Cloud region_, and _Default Time Zone_ 24 | 8. Click the _Save & Test_ button 25 | 26 | Once you are through with these steps, you can proceed to creating and assigning permissions to Users in the Fivetran account. 27 | -------------------------------------------------------------------------------- /docs/setup/project-teardown.md: -------------------------------------------------------------------------------- 1 | # Tearing down a project 2 | 3 | Upon completion of a project (or if you just went through project setup for testing purposes) 4 | there are a few steps needed to tear down the infrastructure. 5 | 6 | 1. If the GitHub repository is to be handed off a client, transfer ownership of it to them. 7 | Otherwise, delete or archive the GitHub repository. 8 | If archiving, delete the GitHub actions secrets. 9 | 1. Open a Help Desk ticket with IT-Ops to remove Sentinel logging for the Snowflake account. 10 | 1. If the Snowflake account is to be handed off to a client, transfer ownership of it to them. 11 | Otherwise, [drop the account](https://docs.snowflake.com/en/user-guide/organizations-manage-accounts-delete). 
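For that last step, a minimal sketch of the drop itself, assuming you hold the `ORGADMIN` role; the account name is a placeholder and the grace period shown is illustrative:

```sql
use role orgadmin;
drop account my_project_account grace_period_in_days = 3;
```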
12 | -------------------------------------------------------------------------------- /docs/setup/repo-setup.md: -------------------------------------------------------------------------------- 1 | ## Create project git repository 2 | 3 | Create a new git repository from the CalData Infrastructure Template 4 | following the instructions [here](https://github.com/cagov/caldata-infrastructure-template#usage). 5 | 6 | Once you have created the repository, push it to a remote repository in GitHub. 7 | There are some GitHub actions that will fail because the repository is not yet 8 | configured to work with the new Snowflake account. 9 | 10 | ## Set up CI in GitHub 11 | 12 | The projects generated from our infrastructure template need read access to the 13 | Snowflake account in order to do two things from GitHub actions: 14 | 15 | 1. Verify that dbt models in branches compile and pass linter checks 16 | 1. Generate dbt docs upon merge to `main`. 17 | 18 | The terraform configurations deployed above create two service accounts 19 | for GitHub actions, a production one for docs and a dev one for CI checks. 20 | 21 | ### Add key pairs to the GitHub service accounts 22 | 23 | Set up key pairs for the two GitHub actions service accounts 24 | (`GITHUB_ACTIONS_SVC_USER_DEV` and `GITHUB_ACTIONS_SVC_USER_PRD`). 25 | This follows a similar procedure to what you did for your personal key pair, 26 | though the project template currently does not assume an encrypted key pair. 27 | [This bash script](https://gist.github.com/ian-r-rose/35d49bd253194f57b57e9e59a595bed8) 28 | is a helpful shortcut for generating the key pair: 29 | ```bash 30 | bash generate_key.sh 31 | ``` 32 | 33 | Once you have created and set the key pairs, add them to the DSE 1Password shared vault. 34 | Make sure to provide enough information to disambiguate the key pair from others stored in the vault, 35 | including: 36 | 37 | * The account locator (legacy account identifier) 38 | * The organization name 39 | * The account name (distinct from the account locator) 40 | * Note : The preferred account identifier is to use name of the account prefixed by its organization (e.g. myorg-account123) 41 | * The service account name 42 | * The public key 43 | * The private key 44 | 45 | ### Set up GitHub actions secrets 46 | 47 | You need to configure secrets in GitHub actions 48 | in order for the service accounts to be able to connect to your Snowflake account. 49 | From the repository page, go to "Settings", then to "Secrets and variables", then to "Actions". 50 | 51 | Add the following repository secrets: 52 | 53 | | Variable | Value | 54 | |----------|-------| 55 | | `SNOWFLAKE_ACCOUNT` | new account locator | 56 | | `SNOWFLAKE_USER_DEV` | `GITHUB_ACTIONS_SVC_USER_DEV` | 57 | | `SNOWFLAKE_USER_PRD` | `GITHUB_ACTIONS_SVC_USER_PRD` | 58 | | `SNOWFLAKE_PRIVATE_KEY_DEV` | dev service account private key | 59 | | `SNOWFLAKE_PRIVATE_KEY_PRD` | prd service account private key | 60 | 61 | ## Enable GitHub pages for the repository 62 | 63 | The repository must have GitHub pages enabled in order for it to deploy and be viewable. 64 | 65 | 1. From the repository page, go to "Settings", then to "Pages". 66 | 1. Under "GitHub Pages visibility" select "Private" (unless the project is public!). 67 | 1. Under "Build and deployment" select "Deploy from a branch" and choose "gh-pages" as your branch. 
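Putting the CI credential steps above together, here is a rough command-line sketch. It assumes the unencrypted key pair commands from Snowflake's key-pair docs (equivalent to the helper script linked above) and the GitHub CLI; the file names and account locator are placeholders:

```bash
# Generate an unencrypted key pair for the dev service account
openssl genrsa 2048 | openssl pkcs8 -topk8 -inform PEM -out github_actions_dev_rsa_key.p8 -nocrypt
openssl rsa -in github_actions_dev_rsa_key.p8 -pubout -out github_actions_dev_rsa_key.pub

# Register the repository secrets used by the GitHub Actions workflows
gh secret set SNOWFLAKE_ACCOUNT --body "<account locator>"
gh secret set SNOWFLAKE_USER_DEV --body "GITHUB_ACTIONS_SVC_USER_DEV"
gh secret set SNOWFLAKE_PRIVATE_KEY_DEV < github_actions_dev_rsa_key.p8
```

Repeat with the `*_PRD` names for the production service account.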
68 | -------------------------------------------------------------------------------- /docs/setup/sentinel-setup.md: -------------------------------------------------------------------------------- 1 | # Set up Sentinel logging 2 | 3 | ODI IT requires that systems log to our Microsoft Sentinel instance 4 | for compliance with security monitoring policies. 5 | The terraform configuration deployed above creates a service account for Sentinel 6 | which needs to be integrated. 7 | 8 | 1. Create a password for the Sentinel service account. 9 | In other contexts we prefer key pairs for service accounts, but the Sentinel 10 | integration requires password authentication. In a Snowflake worksheet run: 11 | ```sql 12 | use role securityadmin; 13 | alter user sentinel_svc_user_prd set password = '' 14 | ``` 15 | 1. Store the Sentinel service account authentication information in our shared 16 | 1Password vault. 17 | Make sure to provide enough information to disambiguate it from others stored in the vault, 18 | including: 19 | 20 | * The account locator (legacy account identifier) 21 | * The organization name 22 | * The account name (distinct from the account locator) 23 | * Note : The preferred account identifier is to use name of the account prefixed by its organization (e.g. myorg-account123) 24 | * The service account name 25 | * The public key 26 | * The private key 27 | 28 | 1. Create an IT Help Desk ticket to add the new account to our Sentinel instance. 29 | Share the 1Password item with the IT-Ops staff member who is implementing the ticket. 30 | If you've included all of the above information in the vault item, 31 | it should be all they need. 32 | 1. Within fifteen minutes or so of implementation it should be clear whether the integration is working. 33 | IT-Ops should be able to see logs ingesting, and Snowflake account admins should see queries 34 | from the Sentinel service user. 35 | -------------------------------------------------------------------------------- /docs/setup/snowflake-service-accounts.md: -------------------------------------------------------------------------------- 1 | ### Create service accounts using Terraform 2 | 3 | Service accounts aren't associated with a human user. 4 | Instead, they are created by an account administrator for 5 | the purposes of allowing another service to perform some action. 6 | 7 | We currently use service accounts for: 8 | 9 | * Fivetran loading raw data 10 | * Airflow loading raw data 11 | * dbt Cloud for transforming data 12 | * GitHub actions generating docs 13 | 14 | These service accounts are created using Terraform 15 | and assigned roles according to the principle of least-privilege. 16 | They use key pair authentication, which is more secure than password-based authentication as no sensitive data are exchanged. 17 | Private keys for service accounts should be stored in CalData's 1Password vault. 18 | 19 | The following are steps for creating a new service account with key pair authentication: 20 | 21 | 1. Create a new key pair in accordance with [these docs](https://docs.snowflake.com/en/user-guide/key-pair-auth#configuring-key-pair-authentication). 22 | Most of the time, you should create a key pair with encryption enabled for the private key. 23 | 1. Add the private key to the CalData 1Password vault, along with the intended service account user name and passphrase (if applicable) 24 | 1. Create a new user in the Snowflake Terraform configuration (`users.tf`) and assign it the appropriate functional role. 
25 | Once the user is created, add its public key in the Snowflake UI: 26 | ```sql 27 | ALTER USER SET RSA_PUBLIC_KEY='MII...' 28 | ``` 29 | Note that we need to remove the header and trailer (i.e. `-- BEGIN PUBLIC KEY --`) as well as any line breaks 30 | in order for Snowflake to accept the public key as valid. 31 | 1. Add the *private* key for the user to whatever system needs to access Snowflake. 32 | 33 | Service accounts should not be shared across different applications, 34 | so if one becomes compromised, the damage is more isolated. 35 | -------------------------------------------------------------------------------- /docs/setup/terraform-project-setup.md: -------------------------------------------------------------------------------- 1 | # Deploy project infrastructure using Terraform 2 | 3 | We will create two separate deployments of the project infrastructure, 4 | one for development, and one for production. 5 | In some places we will refer to project name and owner as `` and ``, respectively, 6 | following our [naming conventions](../learning/naming-conventions.md). 7 | You should substitute the appropriate names there. 8 | 9 | ## Create the dev configuration 10 | 11 | 1. Ensure that your environment has environment variables set for 12 | `SNOWFLAKE_ACCOUNT`, `SNOWFLAKE_USER`, `SNOWFLAKE_PRIVATE_KEY_PATH`, and `SNOWFLAKE_PRIVATE_KEY_PASSPHRASE`. 13 | Make sure you *don't* have any other `SNOWFLAKE_*` variables set, 14 | as they can interfere with authentication. 15 | 1. In the new git repository, create a directory to hold the development Terraform configuration: 16 | ```bash 17 | mkdir -p terraform/environments/dev/ 18 | ``` 19 | The location of this directory is by convention, and subject to change. 20 | 1. Copy the terraform configuration from 21 | [here](https://github.com/cagov/data-infrastructure/blob/main/terraform/snowflake/environments/dev/main.tf) 22 | to your `dev` directory. 23 | 1. In the "elt" module of `main.tf`, change the `source` parameter to point to 24 | `"github.com/cagov/data-infrastructure.git//terraform/snowflake/modules/elt?ref="` 25 | where `` is the short hash of the most recent commit in the `data-infrastructure` repository. 26 | 1. In the `dev` directory, create a new backend configuration file called `--dev.tfbackend`. 27 | The file will point to the S3 bucket in which we are storing terraform state. Populate the backend 28 | configuration file with the following (making sure to substitute values for `` and ``): 29 | ```hcl 30 | bucket = "dse-snowflake-dev-terraform-state" 31 | dynamodb_table = "dse-snowflake-dev-terraform-state-lock" 32 | key = "--dev.tfstate" 33 | region = "us-west-2" 34 | ``` 35 | 1. In the `dev` directory, create a terraform variables file called `terraform.tfvars`, 36 | and populate the "elt" module variables. These variables may expand in the future, 37 | but at the moment they are just the new Snowflake organization name, account name and the environment 38 | (in this case `"DEV"`): 39 | ```hcl 40 | organization_name = "" 41 | account_name = "" 42 | environment = "DEV" 43 | ``` 44 | 1. Initialize the configuration: 45 | ```bash 46 | terraform init -backend-config --dev.tfbackend 47 | ``` 48 | 1. Include both Mac and Linux provider binaries in your terraform lock file. 49 | This helps mitigate differences between CI environments and ODI Macs: 50 | ```bash 51 | terraform providers lock -platform=linux_amd64 -platform=darwin_amd64 52 | ``` 53 | 1. 
Add your new `main.tf`, `terraform.tfvars`, `--dev.tfbackend`, 54 | and terraform lock file to the git repository. Do not add the `.terraform/` directory. 55 | 56 | ## Deploy the dev configuration 57 | 58 | 1. Ensure that your local environment has environment variables set for `SNOWFLAKE_ACCOUNT`, 59 | `SNOWFLAKE_USER`, `SNOWFLAKE_PRIVATE_KEY_PATH`, and `SNOWFLAKE_PRIVATE_KEY_PASSPHRASE`, 60 | and that they are set to your new account, rather than any other accounts. 61 | 1. Run `terraform plan` to see the plan for the resources that will be created. 62 | Inspect the plan to see that everything looks correct. 63 | 1. Run `terraform apply` to deploy the configuration. This will actually create the infrastructure! 64 | 65 | ## Configure and deploy the production configuration 66 | 67 | Re-run all of the steps above, but in a new directory `terraform/environments/prd`. 68 | Everywhere where there is a `dev` (or `DEV`), replace it with a `prd` (or `PRD`). 69 | -------------------------------------------------------------------------------- /docs/static/AccessFootprintsArcPro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/docs/static/AccessFootprintsArcPro.pdf -------------------------------------------------------------------------------- /docs/static/Download.MS.Global.Footprints.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/docs/static/Download.MS.Global.Footprints.zip -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | :root, 2 | [data-md-color-scheme="slate"], 3 | [data-md-color-scheme="default"] { 4 | --md-primary-fg-color: #00315F; 5 | --md-primary-fg-color--light: #0D4F8C; 6 | --md-primary-fg-color--dark: #00315F; 7 | --md-primary-bg-color: #ECF1F6; 8 | --md-primary-bg-color--light: #FAECDA; 9 | 10 | --md-accent-fg-color: #E3881B; 11 | --md-accent-fg-color--transparent: #FAFAFA; 12 | --md-accent-bg-color: #FAECDA; 13 | --md-accent-bg-color--light: #FAFAFA; 14 | } 15 | -------------------------------------------------------------------------------- /images/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mambaorg/micromamba:1.2.0 2 | 3 | COPY --chown=$MAMBA_USER:$MAMBA_USER images/environment.yml /tmp/environment.yml 4 | 5 | RUN micromamba install --verbose -n base --file /tmp/environment.yml && \ 6 | micromamba clean --all --yes 7 | 8 | COPY --chown=$MAMBA_USER:$MAMBA_USER jobs /home/$MAMBA_USER/jobs 9 | WORKDIR /home/$MAMBA_USER 10 | -------------------------------------------------------------------------------- /images/environment.yml: -------------------------------------------------------------------------------- 1 | name: base 2 | channels: 3 | - conda-forge 4 | - nodefaults 5 | dependencies: 6 | - python=3.11 7 | - fsspec=2023.12.0 8 | - fiona=1.10.1 9 | - geopandas=1.0.1 # version that fixes the fiona path module issue 10 | - mercantile=1.2.1 11 | - pandas 12 | - pyarrow 13 | - s3fs=2023.12.0 14 | - snowflake-connector-python=3.5.0 15 | - pip 16 | - pip: 17 | - pygris 18 | -------------------------------------------------------------------------------- /jobs/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Jobs for task runners or orchestrators.""" 2 | -------------------------------------------------------------------------------- /jobs/geo/__init__.py: -------------------------------------------------------------------------------- 1 | """Geospatial jobs.""" 2 | -------------------------------------------------------------------------------- /jobs/geo/load_global_ml_building_footprints.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | from jobs.utils.snowflake import gdf_to_snowflake, snowflake_connection_from_environment 6 | 7 | HERE = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | 10 | def load_state_footprints(conn) -> None: 11 | """Load Microsoft Global ML building footprints dataset for California.""" 12 | import fsspec 13 | import geopandas 14 | import mercantile 15 | import pandas 16 | import shapely.geometry 17 | 18 | print("Identifying California quadkeys") 19 | df = pandas.read_csv( 20 | "https://minedbuildings.z5.web.core.windows.net/global-buildings/dataset-links.csv", 21 | dtype={"QuadKey": "str"}, # Don't use an int, since there are leading zeros! 22 | ) 23 | 24 | # Get the shape of California so that we can identify the quadkeys which intersect 25 | california = ( 26 | geopandas.read_file(os.path.join(HERE, "data", "california.geojson")) 27 | .iloc[0] 28 | .geometry 29 | ) 30 | 31 | # As a first pass, find all the tiles which intersect the bounding box, 32 | # since that's what mercantile knows how to do. 33 | features = [] 34 | for tile in mercantile.tiles(*california.bounds, zooms=9): 35 | features.append( 36 | { 37 | "quadkey": mercantile.quadkey(tile), 38 | "geometry": shapely.geometry.shape( 39 | mercantile.feature(tile)["geometry"] 40 | ), 41 | } 42 | ) 43 | 44 | # As a second pass, prune out the tiles which don't actually intersect California 45 | quadkeys = geopandas.GeoDataFrame.from_records(features).set_geometry("geometry") 46 | california_quadkeys = quadkeys[quadkeys.intersects(california)] 47 | 48 | # Now get a list of all the URLs which have a quadkey intersecting California 49 | california_data = df[ 50 | df.QuadKey.isin(california_quadkeys.quadkey) & (df.Location == "UnitedStates") 51 | ] 52 | 53 | overwrite = True # For the first subset, overwrite any existing table 54 | for _, row in california_data.iterrows(): 55 | print(f"Reading quadkey {row.QuadKey}") 56 | with fsspec.open(row.Url, compression="infer") as f: 57 | gdf = geopandas.read_file(f, driver="GeoJSONSeq") 58 | # If we include quadkeys here it could help with Snowflake partitioning 59 | gdf = gdf.assign(quadkey=row.QuadKey) 60 | gdf_to_snowflake( 61 | gdf, 62 | conn, 63 | table_name="GLOBAL_ML_BUILDING_FOOTPRINTS", 64 | overwrite=overwrite, 65 | strict_geometries=False, 66 | ) 67 | overwrite = False # For all subsequent gdfs, append 68 | 69 | 70 | if __name__ == "__main__": 71 | conn = snowflake_connection_from_environment( 72 | schema="BUILDING_FOOTPRINTS", 73 | client_session_keep_alive=True, # This can be a slow job! 
Keep the session alive 74 | ) 75 | load_state_footprints(conn) 76 | -------------------------------------------------------------------------------- /jobs/geo/load_us_building_footprints.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from jobs.utils.snowflake import gdf_to_snowflake, snowflake_connection_from_environment 4 | 5 | 6 | def load_state_footprints(conn) -> None: 7 | """Load Microsoft state building footprints dataset for California.""" 8 | import geopandas 9 | 10 | print("Downloading data") 11 | gdf = geopandas.read_file( 12 | "https://minedbuildings.z5.web.core.windows.net/legacy/usbuildings-v2/California.geojson.zip" 13 | ) 14 | 15 | print("Writing data to snowflake") 16 | gdf_to_snowflake( 17 | gdf, 18 | conn, 19 | table_name="US_BUILDING_FOOTPRINTS", 20 | cluster=False, 21 | ) 22 | 23 | 24 | if __name__ == "__main__": 25 | conn = snowflake_connection_from_environment( 26 | schema="BUILDING_FOOTPRINTS", 27 | client_session_keep_alive=True, # This can be a slow job! Keep the session alive 28 | ) 29 | load_state_footprints(conn) 30 | -------------------------------------------------------------------------------- /jobs/geo/tiger.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from jobs.utils.snowflake import gdf_to_snowflake, snowflake_connection_from_environment 4 | 5 | 6 | def load_geo_data(conn, year: int) -> None: 7 | """Load CA Census geo data into Snowflake.""" 8 | from pygris import ( 9 | block_groups, 10 | blocks, 11 | coastline, 12 | combined_statistical_areas, 13 | core_based_statistical_areas, 14 | counties, 15 | county_subdivisions, 16 | divisions, 17 | nation, 18 | native_areas, 19 | places, 20 | primary_roads, 21 | primary_secondary_roads, 22 | pumas, 23 | rails, 24 | regions, 25 | states, 26 | tracts, 27 | tribal_block_groups, 28 | tribal_subdivisions_national, 29 | urban_areas, 30 | ) 31 | 32 | print(f"Downloading data for CA in year {year}") 33 | 34 | ca_loaders = { 35 | "COUNTIES": counties, 36 | "TRACTS": tracts, 37 | "BLOCK_GROUPS": block_groups, 38 | "BLOCKS": blocks, 39 | "PLACES": places, 40 | "PUMAS": pumas, 41 | "COUNTY_SUBDIVISIONS": county_subdivisions, 42 | "PRIMARY_SECONDARY_ROADS": primary_secondary_roads, 43 | } 44 | 45 | us_loaders = { 46 | "COASTLINE": coastline, 47 | "DIVISIONS": divisions, 48 | "NATION": nation, 49 | "NATIVE_AREAS": native_areas, 50 | "PRIMARY_ROADS": primary_roads, 51 | "RAILS": rails, 52 | "REGIONS": regions, 53 | "STATES": states, 54 | "TRIBAL_BLOCK_GROUPS": tribal_block_groups, 55 | "TRIBAL_SUBDIVISIONS_NATIONAL": tribal_subdivisions_national, 56 | "URBAN_AREAS": urban_areas, 57 | "CORE_BASED_STATISTICAL_AREAS": core_based_statistical_areas, 58 | "COMBINED_STATISTICAL_AREAS": combined_statistical_areas, 59 | } 60 | 61 | state = "CA" 62 | 63 | for table_name, loader in ca_loaders.items(): 64 | try: 65 | gdf_to_snowflake( 66 | loader(state=state, year=year) 67 | .reset_index(drop=True) 68 | .to_crs( 69 | epsg=4326 70 | ), # using .reset_index(drop=True) to address the following UserWarning: 71 | # Pandas Dataframe has non-standard index of type which will not be written. 
Consider changing the 72 | # index to pd.RangeIndex(start=0,...,step=1) or call reset_index() to keep index as column(s) 73 | conn, 74 | table_name=table_name, 75 | cluster=False, 76 | ) 77 | except ValueError as value_error: 78 | print( 79 | f"This ValueError: {value_error} This pertains to this CA loader: {table_name}" 80 | ) 81 | 82 | for table_name, loader in us_loaders.items(): 83 | try: 84 | gdf_to_snowflake( 85 | loader(year=year) 86 | .reset_index(drop=True) 87 | .to_crs( 88 | epsg=4326 89 | ), # using .reset_index(drop=True) to address the following UserWarning: 90 | # Pandas Dataframe has non-standard index of type which will not be written. Consider changing the 91 | # index to pd.RangeIndex(start=0,...,step=1) or call reset_index() to keep index as column(s) 92 | conn, 93 | table_name=table_name, 94 | cluster=False, 95 | ) 96 | except ValueError as value_error: 97 | print( 98 | f"This ValueError: {value_error} This pertains to this US loader: {table_name}" 99 | ) 100 | 101 | 102 | if __name__ == "__main__": 103 | # TODO: perhaps make a real CLI here. 104 | import sys 105 | 106 | N_ARGS = 2 107 | assert len(sys.argv) == N_ARGS, "Expecting 1 argument: year (four digits)" 108 | 109 | year = int(sys.argv[1]) 110 | 111 | conn = snowflake_connection_from_environment( 112 | schema=f"TIGER_{year}", 113 | client_session_keep_alive=True, 114 | ) 115 | 116 | load_geo_data(conn, year) 117 | -------------------------------------------------------------------------------- /jobs/geo/write_building_footprints.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | from jobs.utils.snowflake import snowflake_connection_from_environment 6 | 7 | 8 | def write_building_footprints(conn, kind: str): 9 | """Grab Microsoft building footprints data enriched with Census TIGER Blocks data for California from Snowflake and write to an S3 bucket.""" 10 | import geopandas 11 | import s3fs 12 | import shapely.wkb 13 | 14 | sql_alter = """ 15 | alter session set GEOGRAPHY_OUTPUT_FORMAT='WKB'; 16 | """ 17 | conn.cursor().execute(sql_alter) 18 | 19 | ref = ( 20 | "ANALYTICS_PRD.BUILDING_FOOTPRINTS" 21 | f".GEO_REFERENCE__{kind.upper()}_BUILDING_FOOTPRINTS_WITH_TIGER" 22 | ) 23 | sql_counties = f""" 24 | SELECT DISTINCT "county_fips" 25 | FROM {ref} 26 | ORDER BY 1 ASC 27 | """ 28 | 29 | counties = conn.cursor().execute(sql_counties).fetchall() 30 | counties = [x[0] for x in counties if x[0] is not None] 31 | 32 | for index, county in enumerate(counties): 33 | sql_table = f""" 34 | SELECT * 35 | FROM {ref} 36 | WHERE "county_fips" = {county} 37 | """ 38 | df = conn.cursor().execute(sql_table).fetch_pandas_all() 39 | gdf = geopandas.GeoDataFrame( 40 | df.assign(geometry=df.geometry.apply(shapely.wkb.loads)), 41 | crs="EPSG:4326", 42 | ) 43 | 44 | gdf = gdf[gdf.geometry.geom_type != "GeometryCollection"] 45 | 46 | file_prefix = f"county_fips_{county}" 47 | gdf.to_parquet(f"{file_prefix}.parquet") 48 | # .shz suffix triggers GDAL to write zipped shapefile 49 | gdf.to_file(f"{file_prefix}.shz") 50 | 51 | print( 52 | f"Loading {file_prefix}. This is number {index+1} out of {len(counties)} counties." 53 | ) 54 | 55 | s3 = s3fs.S3FileSystem(anon=False) 56 | s3.put( 57 | f"{file_prefix}.parquet", 58 | f"s3://dof-demographics-dev-us-west-2-public/{kind}_building_footprints/parquet/{file_prefix}.parquet", 59 | ) 60 | # Esri doesn't like .shp.zip or .shz, so rename to just be .zip. 
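# The local file on disk is still the GDAL-produced zipped shapefile written above;
# only the S3 object key used below gets the .zip extension.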
61 | s3.put( 62 | f"{file_prefix}.shz", 63 | f"s3://dof-demographics-dev-us-west-2-public/{kind}_building_footprints/shp/{file_prefix}.zip", 64 | ) 65 | 66 | os.remove(f"{file_prefix}.parquet") 67 | os.remove(f"{file_prefix}.shz") 68 | 69 | 70 | if __name__ == "__main__": 71 | import sys 72 | 73 | N_ARGS = 1 74 | 75 | # This is a bit of a hack: our batch jobs are designed more around loading data 76 | # to a data warehouse than unloading it, and so the default connection parameters 77 | # specify a LOADER role. Here we replace that with a REPORTER role for grabbing 78 | # data from the marts db. 79 | os.environ["SNOWFLAKE_ROLE"] = os.environ["SNOWFLAKE_ROLE"].replace( 80 | "LOADER", "REPORTER" 81 | ) 82 | os.environ["SNOWFLAKE_WAREHOUSE"] = os.environ["SNOWFLAKE_WAREHOUSE"].replace( 83 | "LOADING", "REPORTING" 84 | ) 85 | 86 | conn = snowflake_connection_from_environment( 87 | client_session_keep_alive=True, # This can be a slow job! Keep the session alive 88 | ) 89 | 90 | if len(sys.argv) != N_ARGS + 1 or sys.argv[1] not in ("global_ml", "us"): 91 | raise ValueError( 92 | "Must provide specify one of 'global_ml' or 'us' for building footprint source" 93 | ) 94 | 95 | write_building_footprints(conn, kind=sys.argv[1]) 96 | -------------------------------------------------------------------------------- /jobs/test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import geopandas 6 | 7 | URL = "https://usbuildingdata.blob.core.windows.net/usbuildings-v2/Alaska.geojson.zip" 8 | 9 | if __name__ == "__main__": 10 | gdf = geopandas.read_file(URL) 11 | gdf.to_parquet(f"s3://{os.environ['BUCKET']}/alaska.parquet") 12 | -------------------------------------------------------------------------------- /jobs/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Common utilities.""" 2 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: CalData Data Services and Engineering Infrastructure 2 | theme: 3 | name: material 4 | logo: images/odi-square_logomark-blue.svg 5 | favicon: images/odi-circle_logomark-blue.png 6 | features: 7 | - content.code.copy 8 | palette: 9 | # Palette toggle for light mode 10 | - scheme: default 11 | toggle: 12 | icon: material/weather-night 13 | name: Switch to dark mode 14 | 15 | # Palette toggle for dark mode 16 | - scheme: slate 17 | toggle: 18 | icon: material/weather-sunny 19 | name: Switch to light mode 20 | extra_css: 21 | - stylesheets/extra.css 22 | 23 | repo_name: cagov/data-infrastructure 24 | repo_url: https://github.com/cagov/data-infrastructure 25 | edit_uri: "" 26 | 27 | markdown_extensions: 28 | - toc: 29 | permalink: true 30 | - admonition 31 | - pymdownx.highlight: 32 | auto_title: false 33 | - pymdownx.superfences: 34 | custom_fences: 35 | - name: mermaid 36 | class: mermaid 37 | format: !!python/name:pymdownx.superfences.fence_code_format 38 | 39 | nav: 40 | - Introduction: index.md 41 | - Code development: 42 | - Local environment setup: code/local-setup.md 43 | - Codespaces: code/codespaces.md 44 | - Code Review: code/code-review.md 45 | - Writing Documentation: code/writing-documentation.md 46 | - Terraform Setup: code/terraform-local-setup.md 47 | - GitHub Project Management: code/github-project-management.md 48 | - Azure DevOps Project Management: 
code/azdevops-project-management.md 49 | - Project infrastructure: 50 | - Cloud infrastructure: infra/cloud-infrastructure.md 51 | - Project architecture: infra/architecture.md 52 | - Snowflake overview: infra/snowflake.md 53 | - Project setup: 54 | - Snowflake setup: setup/snowflake-setup.md 55 | - git/Github setup: setup/repo-setup.md 56 | - Terraform setup: setup/terraform-project-setup.md 57 | - Sentinel setup: setup/sentinel-setup.md 58 | - dbt Cloud setup: setup/dbt-setup.md 59 | - Adding service accounts: setup/snowflake-service-accounts.md 60 | - Project teardown: setup/project-teardown.md 61 | - dbt: 62 | - dbt overview: dbt/dbt.md 63 | - dbt performance: dbt/dbt-performance.md 64 | - dbt Cloud Snowflake project: dbt_docs_snowflake/index.html 65 | - Data: 66 | - Building footprints: data/footprints.md 67 | - Learning: 68 | - MDSA glossary: learning/glossary.md 69 | - Security guidelines: learning/security.md 70 | - Naming conventions: learning/naming-conventions.md 71 | - git: learning/git.md 72 | - dbt: learning/dbt.md 73 | - Cloud data warehouses: learning/cloud-data-warehouses.md 74 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 88 3 | select = [ 4 | "B", # flake8-bugbear 5 | "BLE", # flake8-blind-except 6 | "C4", # comprehensions 7 | "D", # pydocstyle 8 | "E", # pycodestyle 9 | "F", # pyflakes 10 | "I", # isort 11 | "ISC", # flake8-implicit-str-concat 12 | "PGH", # pygrep-hooks 13 | "PLC", # pylint 14 | "PLE", # pylint 15 | "PLR", # pylint import style 16 | "PLW", # pylint 17 | "RET", # flake8-return 18 | "RUF", # ruff-specific rules 19 | "SIM", # flake8-simplify 20 | "T10", # flake8-debugger 21 | "TID", # flake8-tidy-imports 22 | "UP", # pyupgrade 23 | "W", # pycodestyle 24 | "YTT", # flake8-2020 25 | 26 | ] 27 | respect-gitignore = true 28 | ignore = [ 29 | "D100", # public module 30 | "D101", # public class 31 | "D102", # public method 32 | "D103", # public function 33 | "D104", # public package 34 | "D203", # blank line before docstring 35 | "D212", # Start multi-line docstring at the second line. 
36 | "E501", # line length handled by black 37 | "ISC001", # Handled by formatter 38 | "PGH003", # specific mypy ignore codes 39 | "PLR0913", # too many arguments 40 | "PLR0912", # too many branches 41 | "RET505", # no-else-return 42 | "RET506", # no-else-raise 43 | ] 44 | target-version = "py310" 45 | 46 | [tool.mypy] 47 | python_version = "3.10" 48 | allow_untyped_decorators = true # would love to enable this, but airflow decorators are untyped 49 | ignore_missing_imports = true 50 | no_implicit_optional = true 51 | show_error_codes = true 52 | warn_redundant_casts = true 53 | warn_unused_ignores = false 54 | warn_unreachable = true 55 | 56 | [tool.poetry] 57 | name = "data-infrastructure" 58 | version = "0.1.0" 59 | description = "\"CalData Data Services and Engineering Infrastructure\"" 60 | authors = ["Ian Rose "] 61 | license = "MIT" 62 | readme = "README.md" 63 | package-mode = false 64 | 65 | [tool.poetry.dependencies] 66 | python = "^3.10" 67 | mkdocs-material = "~9.1.3" 68 | dbt-core = "~1.8.0" 69 | dbt-snowflake = "~1.8.0" 70 | awscliv2 = "^2.2.0" 71 | 72 | [tool.poetry.group.dev.dependencies] 73 | pre-commit = "^3.3.1" 74 | sqlfluff = "3.0.7" 75 | sqlfluff-templater-dbt = "3.0.7" 76 | 77 | [build-system] 78 | requires = ["poetry-core"] 79 | build-backend = "poetry.core.masonry.api" 80 | -------------------------------------------------------------------------------- /terraform/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform 2 | *.tfstate 3 | *.tfstate.backup 4 | *.tfstate.*.backup 5 | # Created by pre-commit, but not intended to be a stand-alone module. 6 | s3-remote-state/.terraform.lock.hcl 7 | -------------------------------------------------------------------------------- /terraform/aws/environments/dev/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "4.56.0" 6 | constraints = "4.56.0" 7 | hashes = [ 8 | "h1:CnpBvf3mH16Kcez47OsjmIeGkY2PUVihKRwbkyOvo48=", 9 | "h1:koDunHl/LUmCAKy3VFie6MakXN7ng93v8HBRpKI8He8=", 10 | "h1:v6DE95Ll2mxE96IGUsT/h6WQTU1d2cfHydWah1FgT20=", 11 | "zh:1d2b7693a102da015a86b9235b554272b9280597011216c3ddd1a6dc95ad8dab", 12 | "zh:28c3e8ebaa077f65c4ac5fd051c95887070293fcff0386dfc2e4b7e248a0aefa", 13 | "zh:2a620bc4a87be06e7acac1bc15e966dba45df643bf6c3efb811e74e6c2122b03", 14 | "zh:30d3ac148fa0634e7ba1de66e1af1328481c92cd774adcfc0e27f828103b17e0", 15 | "zh:3d3eebf916f25e11b12dd3c692f8fe1e4c4e9a0c414af9d0d881ddebd28dcd39", 16 | "zh:3f4600f2881c02fcc69080df68747c9a0b9b11cb002117fd918b7800f2ac402b", 17 | "zh:7156fb12c3b4f2964f7e78cee97f31d95b43045467f90749d2ed545725c36baa", 18 | "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", 19 | "zh:a5bbc84fd37d468c7b016009776b6d2a287bbb746af81aba786cdf8eb5fce4a1", 20 | "zh:d5322bcd4e11caddbbfaa1198893824d4b4d28f504517a3a87902cf86d75bd87", 21 | "zh:d766eb9f86a40060d63e12ef674d7c9c47ec4e47ade487f1f49af8c89b441711", 22 | "zh:df23f592b99f6617f09e449009bbb49068a69fc926b15ca29e30b068c9c67365", 23 | "zh:e7b0acee2d98549731547259b539f598e18db07c0c202d3a34b922beff711054", 24 | "zh:ec317f79fdcce934c39458ea312862e7f7ec48cafb8bcc9b5a00d9b78b629d81", 25 | "zh:f78ec7a771867d96dfee96bf74523341ba42feeb64ce2f108b5bf2e7ebef0fef", 26 | ] 27 | } 28 | 29 | provider "registry.terraform.io/hashicorp/random" { 30 | version = "3.4.3" 31 | constraints = "3.4.3" 32 | hashes = [ 33 | "h1:saZR+mhthL0OZl4SyHXZraxyaBNVMxiZzks78nWcZ2o=", 34 | "h1:tL3katm68lX+4lAncjQA9AXL4GR/VM+RPwqYf4D2X8Q=", 35 | "h1:xZGZf18JjMS06pFa4NErzANI98qi59SEcBsOcS2P2yQ=", 36 | "zh:41c53ba47085d8261590990f8633c8906696fa0a3c4b384ff6a7ecbf84339752", 37 | "zh:59d98081c4475f2ad77d881c4412c5129c56214892f490adf11c7e7a5a47de9b", 38 | "zh:686ad1ee40b812b9e016317e7f34c0d63ef837e084dea4a1f578f64a6314ad53", 39 | "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", 40 | "zh:84103eae7251384c0d995f5a257c72b0096605048f757b749b7b62107a5dccb3", 41 | "zh:8ee974b110adb78c7cd18aae82b2729e5124d8f115d484215fd5199451053de5", 42 | "zh:9dd4561e3c847e45de603f17fa0c01ae14cae8c4b7b4e6423c9ef3904b308dda", 43 | "zh:bb07bb3c2c0296beba0beec629ebc6474c70732387477a65966483b5efabdbc6", 44 | "zh:e891339e96c9e5a888727b45b2e1bb3fcbdfe0fd7c5b4396e4695459b38c8cb1", 45 | "zh:ea4739860c24dfeaac6c100b2a2e357106a89d18751f7693f3c31ecf6a996f8d", 46 | "zh:f0c76ac303fd0ab59146c39bc121c5d7d86f878e9a69294e29444d4c653786f8", 47 | "zh:f143a9a5af42b38fed328a161279906759ff39ac428ebcfe55606e05e1518b93", 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /terraform/aws/environments/dev/dse-infra-dev.tfbackend: -------------------------------------------------------------------------------- 1 | bucket = "dse-infra-dev-terraform-state" 2 | dynamodb_table = "dse-infra-dev-terraform-state-lock" 3 | key = "dse-infra-dev.tfstate" 4 | region = "us-west-1" 5 | -------------------------------------------------------------------------------- /terraform/aws/environments/dev/main.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # Terraform Setup # 3 | ################################## 4 | 5 | terraform { 6 | required_version = ">= 1.0" 7 | 8 | required_providers { 9 | aws = { 10 | source = "hashicorp/aws" 11 | version = "4.56.0" 12 | } 13 | random = { 14 | source = 
"hashicorp/random" 15 | version = "3.4.3" 16 | } 17 | } 18 | 19 | backend "s3" { 20 | } 21 | } 22 | 23 | locals { 24 | owner = "dse" 25 | environment = "dev" 26 | project = "infra" 27 | region = "us-west-2" 28 | } 29 | 30 | provider "aws" { 31 | region = local.region 32 | 33 | default_tags { 34 | tags = { 35 | Owner = local.owner 36 | Project = local.project 37 | Environment = local.environment 38 | } 39 | } 40 | } 41 | 42 | ############################ 43 | # Infrastructure # 44 | ############################ 45 | 46 | module "infra" { 47 | source = "../../modules/infra" 48 | 49 | owner = local.owner 50 | environment = local.environment 51 | project = local.project 52 | snowflake_loader_secret = { 53 | test = "dse-snowflake-dev-us-west-2-loader" 54 | latest = "dse-snowflake-prd-us-west-2-loader" 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /terraform/aws/environments/dev/remote-state/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "4.56.0" 6 | constraints = "4.56.0" 7 | hashes = [ 8 | "h1:CnpBvf3mH16Kcez47OsjmIeGkY2PUVihKRwbkyOvo48=", 9 | "h1:koDunHl/LUmCAKy3VFie6MakXN7ng93v8HBRpKI8He8=", 10 | "h1:v6DE95Ll2mxE96IGUsT/h6WQTU1d2cfHydWah1FgT20=", 11 | "zh:1d2b7693a102da015a86b9235b554272b9280597011216c3ddd1a6dc95ad8dab", 12 | "zh:28c3e8ebaa077f65c4ac5fd051c95887070293fcff0386dfc2e4b7e248a0aefa", 13 | "zh:2a620bc4a87be06e7acac1bc15e966dba45df643bf6c3efb811e74e6c2122b03", 14 | "zh:30d3ac148fa0634e7ba1de66e1af1328481c92cd774adcfc0e27f828103b17e0", 15 | "zh:3d3eebf916f25e11b12dd3c692f8fe1e4c4e9a0c414af9d0d881ddebd28dcd39", 16 | "zh:3f4600f2881c02fcc69080df68747c9a0b9b11cb002117fd918b7800f2ac402b", 17 | "zh:7156fb12c3b4f2964f7e78cee97f31d95b43045467f90749d2ed545725c36baa", 18 | "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", 19 | "zh:a5bbc84fd37d468c7b016009776b6d2a287bbb746af81aba786cdf8eb5fce4a1", 20 | "zh:d5322bcd4e11caddbbfaa1198893824d4b4d28f504517a3a87902cf86d75bd87", 21 | "zh:d766eb9f86a40060d63e12ef674d7c9c47ec4e47ade487f1f49af8c89b441711", 22 | "zh:df23f592b99f6617f09e449009bbb49068a69fc926b15ca29e30b068c9c67365", 23 | "zh:e7b0acee2d98549731547259b539f598e18db07c0c202d3a34b922beff711054", 24 | "zh:ec317f79fdcce934c39458ea312862e7f7ec48cafb8bcc9b5a00d9b78b629d81", 25 | "zh:f78ec7a771867d96dfee96bf74523341ba42feeb64ce2f108b5bf2e7ebef0fef", 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /terraform/aws/environments/dev/remote-state/main.tf: -------------------------------------------------------------------------------- 1 | ../../../../s3-remote-state/main.tf -------------------------------------------------------------------------------- /terraform/aws/environments/dev/remote-state/terraform.tfvars: -------------------------------------------------------------------------------- 1 | owner = "dse" 2 | environment = "dev" 3 | project = "infra" 4 | region = "us-west-1" 5 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/airflow.tf: -------------------------------------------------------------------------------- 1 | resource "aws_iam_role" "mwaa" { 2 | name = "${local.prefix}-${var.region}-mwaa-execution-role" 3 | assume_role_policy = 
data.aws_iam_policy_document.assume.json 4 | } 5 | 6 | resource "aws_iam_policy" "mwaa" { 7 | name = "${local.prefix}-${var.region}-mwaa-execution-policy" 8 | policy = data.aws_iam_policy_document.mwaa.json 9 | } 10 | 11 | resource "aws_iam_role_policy_attachment" "mwaa_execution_role" { 12 | role = aws_iam_role.mwaa.name 13 | policy_arn = aws_iam_policy.mwaa.arn 14 | } 15 | 16 | resource "aws_iam_role_policy_attachment" "mwaa_batch_submit_role" { 17 | role = aws_iam_role.mwaa.name 18 | policy_arn = aws_iam_policy.batch_submit_policy.arn 19 | } 20 | 21 | locals { 22 | # Define the environment name as a `local` so we can refer to it in the 23 | # execution role policy without introducing a cycle. 24 | environment_name = "${local.prefix}-${var.region}-mwaa-environment" 25 | } 26 | 27 | data "aws_iam_policy_document" "assume" { 28 | version = "2012-10-17" 29 | statement { 30 | effect = "Allow" 31 | principals { 32 | identifiers = [ 33 | "airflow-env.amazonaws.com", 34 | "airflow.amazonaws.com" 35 | ] 36 | type = "Service" 37 | } 38 | actions = [ 39 | "sts:AssumeRole" 40 | ] 41 | } 42 | } 43 | 44 | data "aws_iam_policy_document" "mwaa" { 45 | version = "2012-10-17" 46 | statement { 47 | effect = "Allow" 48 | actions = [ 49 | "airflow:PublishMetrics" 50 | ] 51 | resources = [ 52 | "arn:aws:airflow:${var.region}:${data.aws_caller_identity.current.account_id}:environment/${local.environment_name}" 53 | ] 54 | } 55 | statement { 56 | effect = "Deny" 57 | actions = ["s3:ListAllMyBuckets"] 58 | resources = [ 59 | aws_s3_bucket.mwaa.arn, 60 | "${aws_s3_bucket.mwaa.arn}/*", 61 | ] 62 | } 63 | statement { 64 | effect = "Allow" 65 | actions = [ 66 | "s3:GetObject*", 67 | "s3:GetBucket*", 68 | "s3:List*" 69 | ] 70 | resources = [ 71 | aws_s3_bucket.mwaa.arn, 72 | "${aws_s3_bucket.mwaa.arn}/*", 73 | ] 74 | } 75 | statement { 76 | effect = "Allow" 77 | actions = [ 78 | "s3:GetAccountPublicAccessBlock" 79 | ] 80 | resources = ["*"] 81 | } 82 | statement { 83 | effect = "Allow" 84 | actions = [ 85 | "logs:CreateLogStream", 86 | "logs:CreateLogGroup", 87 | "logs:PutLogEvents", 88 | "logs:GetLogEvents", 89 | "logs:GetLogRecord", 90 | "logs:GetLogGroupFields", 91 | "logs:GetQueryResults" 92 | ] 93 | resources = [ 94 | "arn:aws:logs:${var.region}:${data.aws_caller_identity.current.account_id}:log-group:airflow-${local.environment_name}-*" 95 | ] 96 | } 97 | statement { 98 | effect = "Allow" 99 | actions = [ 100 | "logs:DescribeLogGroups" 101 | ] 102 | resources = [ 103 | "*" 104 | ] 105 | } 106 | statement { 107 | 108 | effect = "Allow" 109 | actions = [ 110 | "cloudwatch:PutMetricData" 111 | ] 112 | resources = [ 113 | "*" 114 | ] 115 | } 116 | statement { 117 | effect = "Allow" 118 | actions = [ 119 | "sqs:ChangeMessageVisibility", 120 | "sqs:DeleteMessage", 121 | "sqs:GetQueueAttributes", 122 | "sqs:GetQueueUrl", 123 | "sqs:ReceiveMessage", 124 | "sqs:SendMessage" 125 | ] 126 | resources = [ 127 | "arn:aws:sqs:${var.region}:*:airflow-celery-*" 128 | ] 129 | } 130 | statement { 131 | effect = "Allow" 132 | actions = [ 133 | "kms:Decrypt", 134 | "kms:DescribeKey", 135 | "kms:GenerateDataKey*", 136 | "kms:Encrypt" 137 | ] 138 | resources = [] 139 | not_resources = ["arn:aws:kms:*:${data.aws_caller_identity.current.account_id}:key/*"] 140 | condition { 141 | test = "StringLike" 142 | values = [ 143 | "sqs.${var.region}.amazonaws.com" 144 | ] 145 | variable = "kms:ViaService" 146 | } 147 | } 148 | } 149 | 150 | resource "aws_mwaa_environment" "this" { 151 | execution_role_arn = aws_iam_role.mwaa.arn 152 | name = 
local.environment_name 153 | schedulers = 2 154 | max_workers = 5 155 | min_workers = 1 156 | airflow_version = "2.7.2" 157 | 158 | airflow_configuration_options = { 159 | "custom.scratch_bucket" = aws_s3_bucket.scratch.id 160 | "custom.default_job_queue" = aws_batch_job_queue.default.name 161 | # Note: default job definition to the "latest", rather than the "test" environment. 162 | "custom.default_job_definition" = aws_batch_job_definition.default["latest"].name 163 | } 164 | 165 | logging_configuration { 166 | dag_processing_logs { 167 | enabled = true 168 | log_level = "INFO" 169 | } 170 | 171 | scheduler_logs { 172 | enabled = true 173 | log_level = "INFO" 174 | } 175 | 176 | task_logs { 177 | enabled = true 178 | log_level = "INFO" 179 | } 180 | 181 | webserver_logs { 182 | enabled = true 183 | log_level = "INFO" 184 | } 185 | 186 | worker_logs { 187 | enabled = true 188 | log_level = "INFO" 189 | } 190 | } 191 | 192 | 193 | source_bucket_arn = aws_s3_bucket.mwaa.arn 194 | dag_s3_path = "dags/" 195 | requirements_s3_path = "requirements.txt" 196 | 197 | network_configuration { 198 | security_group_ids = [aws_security_group.mwaa.id] 199 | subnet_ids = aws_subnet.private[*].id 200 | } 201 | webserver_access_mode = "PUBLIC_ONLY" 202 | depends_on = [aws_iam_role_policy_attachment.mwaa_execution_role] 203 | } 204 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/batch.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # AWS Batch # 3 | ################################## 4 | 5 | data "aws_iam_policy_document" "aws_batch_service_policy" { 6 | statement { 7 | actions = [ 8 | "sts:AssumeRole" 9 | ] 10 | effect = "Allow" 11 | principals { 12 | type = "Service" 13 | identifiers = ["batch.amazonaws.com"] 14 | } 15 | } 16 | } 17 | 18 | 19 | resource "aws_iam_role" "aws_batch_service_role" { 20 | name = "${local.prefix}-${var.region}-batch-service-role" 21 | assume_role_policy = data.aws_iam_policy_document.aws_batch_service_policy.json 22 | } 23 | 24 | resource "aws_iam_role_policy_attachment" "aws_batch_service_role" { 25 | role = aws_iam_role.aws_batch_service_role.name 26 | policy_arn = "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole" 27 | } 28 | 29 | resource "aws_batch_compute_environment" "default" { 30 | compute_environment_name = "${local.prefix}-${var.region}-default" 31 | 32 | compute_resources { 33 | max_vcpus = 16 34 | 35 | security_group_ids = [ 36 | aws_security_group.batch.id 37 | ] 38 | 39 | subnets = aws_subnet.public[*].id 40 | 41 | type = "FARGATE" 42 | } 43 | 44 | service_role = aws_iam_role.aws_batch_service_role.arn 45 | type = "MANAGED" 46 | depends_on = [aws_iam_role_policy_attachment.aws_batch_service_role] 47 | } 48 | 49 | resource "aws_iam_role" "ecs_task_execution_role" { 50 | name = "${local.prefix}-${var.region}-batch-exec-role" 51 | assume_role_policy = data.aws_iam_policy_document.assume_role_policy.json 52 | } 53 | 54 | data "aws_iam_policy_document" "assume_role_policy" { 55 | statement { 56 | actions = ["sts:AssumeRole"] 57 | 58 | principals { 59 | type = "Service" 60 | identifiers = ["ecs-tasks.amazonaws.com"] 61 | } 62 | } 63 | } 64 | 65 | resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_policy" { 66 | role = aws_iam_role.ecs_task_execution_role.name 67 | policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" 68 | } 69 | 70 | resource "aws_iam_role_policy_attachment" 
"ecs_task_execution_access_snowflake_loader" { 71 | for_each = toset(local.jobs) 72 | role = aws_iam_role.ecs_task_execution_role.name 73 | policy_arn = aws_iam_policy.access_snowflake_loader[each.key].arn 74 | } 75 | 76 | resource "aws_iam_role" "batch_job_role" { 77 | name = "${local.prefix}-${var.region}-batch-job-role" 78 | description = "Role for AWS batch jobs" 79 | assume_role_policy = data.aws_iam_policy_document.assume_role_policy.json 80 | } 81 | 82 | resource "aws_iam_role_policy_attachment" "s3_scratch_policy_role_attachment" { 83 | role = aws_iam_role.batch_job_role.name 84 | policy_arn = aws_iam_policy.s3_scratch_policy.arn 85 | } 86 | 87 | resource "aws_iam_role_policy_attachment" "dof_demographics_read_write_access" { 88 | role = aws_iam_role.batch_job_role.name 89 | policy_arn = aws_iam_policy.dof_demographics_read_write_access.arn 90 | } 91 | 92 | resource "aws_batch_job_queue" "default" { 93 | name = "${local.prefix}-${var.region}-default" 94 | state = "ENABLED" 95 | priority = 1 96 | compute_environments = [ 97 | aws_batch_compute_environment.default.arn, 98 | ] 99 | } 100 | 101 | resource "aws_batch_job_definition" "default" { 102 | for_each = toset(local.jobs) 103 | name = "${local.prefix}-${var.region}-${each.key}" 104 | type = "container" 105 | platform_capabilities = ["FARGATE"] 106 | 107 | container_properties = jsonencode({ 108 | command = ["echo", "$SNOWFLAKE_USER", "$SNOWFLAKE_ROLE"] 109 | image = "${aws_ecr_repository.default.repository_url}:${each.key}" 110 | fargatePlatformConfiguration = { 111 | platformVersion = "LATEST" 112 | } 113 | resourceRequirements = [ 114 | { type = "VCPU", value = "0.25" }, 115 | { type = "MEMORY", value = "512" } 116 | ] 117 | # TODO: Figure out how to properly pass in a private key rather than a password. 118 | # Ran into some issues with properly encoding it as an environment variable. 119 | secrets = [ 120 | for s in local.snowflake_data : { 121 | name = "SNOWFLAKE_${upper(s)}", 122 | valueFrom = data.aws_secretsmanager_secret.snowflake_loader_secret[each.key].arn != null ? 
"${data.aws_secretsmanager_secret.snowflake_loader_secret[each.key].arn}:${s}::" : "" 123 | } 124 | ] 125 | networkConfiguration = { 126 | assignPublicIp : "ENABLED" 127 | }, 128 | executionRoleArn = aws_iam_role.ecs_task_execution_role.arn 129 | jobRoleArn = aws_iam_role.batch_job_role.arn 130 | }) 131 | } 132 | 133 | data "aws_iam_policy_document" "batch_submit_policy_document" { 134 | statement { 135 | actions = [ 136 | "batch:SubmitJob", 137 | "batch:CancelJob", 138 | "batch:ListJobs", 139 | ] 140 | resources = [ 141 | "arn:aws:batch:${var.region}:${data.aws_caller_identity.current.account_id}:job-definition/${local.prefix}*", 142 | aws_batch_job_queue.default.arn, 143 | ] 144 | } 145 | statement { 146 | actions = [ 147 | "batch:DescribeJobs", 148 | ] 149 | resources = ["*"] 150 | } 151 | } 152 | 153 | resource "aws_iam_policy" "batch_submit_policy" { 154 | name = "${local.prefix}-${var.region}-batch-submit-policy" 155 | description = "Policy allowing to submit batch jobs for ${local.prefix}" 156 | policy = data.aws_iam_policy_document.batch_submit_policy_document.json 157 | } 158 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/ecr.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # Container registry # 3 | ################################## 4 | 5 | resource "aws_ecr_repository" "default" { 6 | name = "${local.prefix}-${var.region}-default" 7 | image_tag_mutability = "MUTABLE" 8 | 9 | image_scanning_configuration { 10 | scan_on_push = true 11 | } 12 | } 13 | 14 | data "aws_iam_policy_document" "default_ecr_policy_document" { 15 | # Policy from https://github.com/aws-actions/amazon-ecr-login#permissions 16 | statement { 17 | actions = [ 18 | "ecr:BatchGetImage", 19 | "ecr:BatchCheckLayerAvailability", 20 | "ecr:CompleteLayerUpload", 21 | "ecr:GetDownloadUrlForLayer", 22 | "ecr:InitiateLayerUpload", 23 | "ecr:PutImage", 24 | "ecr:UploadLayerPart", 25 | ] 26 | resources = [aws_ecr_repository.default.arn] 27 | } 28 | statement { 29 | actions = [ 30 | "ecr:GetAuthorizationToken" 31 | ] 32 | # Why does this need *? 
https://github.com/aws-actions/amazon-ecr-login#ecr-private 33 | resources = ["*"] 34 | } 35 | } 36 | 37 | resource "aws_iam_policy" "default_ecr_policy" { 38 | name = "${local.prefix}-${var.region}-default-ecr-push-policy" 39 | description = "Policy allowing pushing to the default ecr repository for ${local.prefix}" 40 | policy = data.aws_iam_policy_document.default_ecr_policy_document.json 41 | } 42 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/iam.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # IAM Policies # 3 | ################################## 4 | 5 | # Adapted from https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_examples_aws_my-sec-creds-self-manage.html 6 | data "aws_iam_policy_document" "self_manage_credentials" { 7 | statement { 8 | sid = "AllowViewAccountInfo" 9 | effect = "Allow" 10 | actions = [ 11 | "iam:GetAccountPasswordPolicy", 12 | "iam:GetAccountSummary", 13 | "iam:ListVirtualMFADevices", 14 | ] 15 | resources = ["*"] 16 | } 17 | statement { 18 | sid = "AllowManageOwnPasswords" 19 | effect = "Allow" 20 | actions = [ 21 | "iam:ChangePassword", 22 | "iam:GetUser", 23 | "iam:CreateLoginProfile", 24 | "iam:DeleteLoginProfile", 25 | "iam:GetLoginProfile", 26 | "iam:UpdateLoginProfile", 27 | ] 28 | resources = ["arn:aws:iam::*:user/$${aws:username}"] 29 | } 30 | statement { 31 | sid = "AllowManageOwnAccessKeys" 32 | effect = "Allow" 33 | actions = [ 34 | "iam:CreateAccessKey", 35 | "iam:DeleteAccessKey", 36 | "iam:ListAccessKeys", 37 | "iam:UpdateAccessKey", 38 | ] 39 | resources = ["arn:aws:iam::*:user/$${aws:username}"] 40 | } 41 | statement { 42 | sid = "AllowManageOwnVirtualMFADevice" 43 | effect = "Allow" 44 | actions = [ 45 | "iam:CreateVirtualMFADevice" 46 | ] 47 | resources = ["arn:aws:iam::*:mfa/*"] 48 | } 49 | statement { 50 | sid = "AllowManageOwnUserMFA" 51 | effect = "Allow" 52 | actions = [ 53 | "iam:DeactivateMFADevice", 54 | "iam:EnableMFADevice", 55 | "iam:ListMFADevices", 56 | "iam:ResyncMFADevice" 57 | ] 58 | resources = ["arn:aws:iam::*:user/$${aws:username}"] 59 | } 60 | } 61 | 62 | resource "aws_iam_policy" "self_manage_credentials" { 63 | name = "${local.prefix}-self-manage-credentials-policy" 64 | description = "Allow a user to manage their own credentials" 65 | policy = data.aws_iam_policy_document.self_manage_credentials.json 66 | } 67 | 68 | data "aws_iam_policy_document" "access_snowflake_loader" { 69 | for_each = toset(local.jobs) 70 | statement { 71 | actions = ["secretsmanager:GetSecretValue"] 72 | resources = [ 73 | data.aws_secretsmanager_secret.snowflake_loader_secret[each.key].arn 74 | ] 75 | } 76 | } 77 | 78 | resource "aws_iam_policy" "access_snowflake_loader" { 79 | for_each = toset(local.jobs) 80 | name = "${local.prefix}-access-snowflake-loader-${each.key}" 81 | description = "Allow a user/role to access Snowflake loader role in SecretsManager for the ${each.key} secret" 82 | policy = data.aws_iam_policy_document.access_snowflake_loader[each.key].json 83 | } 84 | 85 | ################################## 86 | # IAM Service Users # 87 | ################################## 88 | 89 | # NOTE: in general, policies and roles are defined close to the resources 90 | # they support. 
91 | 92 | # CD bot for GitHub actions 93 | resource "aws_iam_user" "cd_bot" { 94 | name = "${local.prefix}-cd-bot" 95 | } 96 | 97 | resource "aws_iam_user_policy_attachment" "ecr_cd_bot_policy_attachment" { 98 | user = aws_iam_user.cd_bot.name 99 | policy_arn = aws_iam_policy.default_ecr_policy.arn 100 | } 101 | 102 | resource "aws_iam_user_policy_attachment" "batch_cd_bot_policy_attachment" { 103 | user = aws_iam_user.cd_bot.name 104 | policy_arn = aws_iam_policy.batch_submit_policy.arn 105 | } 106 | 107 | ################################## 108 | # IAM Human Users # 109 | ################################## 110 | 111 | resource "aws_iam_user" "arman" { 112 | name = "ArmanMadani" 113 | } 114 | 115 | resource "aws_iam_user" "esa" { 116 | name = "EsaEslami" 117 | } 118 | 119 | resource "aws_iam_user" "kim" { 120 | name = "KimHicks" 121 | } 122 | 123 | resource "aws_iam_user" "monica" { 124 | name = "MonicaBobra" 125 | } 126 | 127 | resource "aws_iam_user" "rocio" { 128 | name = "RocioMora" 129 | } 130 | 131 | ################################## 132 | # IAM User Groups # 133 | ################################## 134 | 135 | resource "aws_iam_group" "aae" { 136 | name = "odi-advanced-analytics-${var.environment}" 137 | } 138 | 139 | resource "aws_iam_group_policy_attachment" "aae_dsa_project" { 140 | for_each = toset(local.dsa_projects) 141 | group = aws_iam_group.aae.name 142 | policy_arn = aws_iam_policy.s3_dsa_project_policy[each.key].arn 143 | } 144 | 145 | resource "aws_iam_group_policy_attachment" "aae_list_all_my_buckets" { 146 | group = aws_iam_group.aae.name 147 | policy_arn = aws_iam_policy.s3_list_all_my_buckets.arn 148 | } 149 | 150 | resource "aws_iam_group_policy_attachment" "aae_self_manage_creentials" { 151 | group = aws_iam_group.aae.name 152 | policy_arn = aws_iam_policy.self_manage_credentials.arn 153 | } 154 | 155 | resource "aws_iam_group_membership" "aae" { 156 | name = "${aws_iam_group.aae.name}-membership" 157 | group = aws_iam_group.aae.name 158 | users = [ 159 | aws_iam_user.arman.name, 160 | aws_iam_user.esa.name, 161 | aws_iam_user.kim.name, 162 | aws_iam_user.monica.name, 163 | aws_iam_user.rocio.name, 164 | ] 165 | } 166 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/main.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # Terraform Setup # 3 | ################################## 4 | 5 | terraform { 6 | # Note: when a package is added or updated, we have to update the lockfile in a 7 | # platform-independent way, cf. 
https://github.com/hashicorp/terraform/issues/28041 8 | # To update the lockfile run: 9 | # 10 | # terraform providers lock -platform=linux_amd64 -platform=darwin_amd64 11 | required_providers { 12 | aws = { 13 | source = "hashicorp/aws" 14 | version = "4.56.0" 15 | } 16 | random = { 17 | source = "hashicorp/random" 18 | version = "3.4.3" 19 | } 20 | } 21 | required_version = ">= 1.0" 22 | } 23 | 24 | data "aws_caller_identity" "current" {} 25 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/network.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # Networking # 3 | ################################## 4 | 5 | data "aws_availability_zones" "available" { 6 | state = "available" 7 | } 8 | 9 | resource "aws_vpc" "this" { 10 | cidr_block = "10.0.0.0/16" 11 | 12 | tags = { 13 | Name = "${local.prefix}-main" 14 | } 15 | } 16 | 17 | resource "aws_security_group" "batch" { 18 | name = "${local.prefix}-batch-sg" 19 | description = "Allow ECS tasks to reach out to internet" 20 | vpc_id = aws_vpc.this.id 21 | 22 | egress { 23 | description = "Allow ECS tasks to talk to the internet" 24 | from_port = 0 25 | to_port = 0 26 | protocol = "-1" 27 | cidr_blocks = ["0.0.0.0/0"] 28 | } 29 | 30 | tags = { 31 | Name = "${local.prefix}-batch-sg" 32 | } 33 | } 34 | 35 | resource "aws_security_group" "mwaa" { 36 | vpc_id = aws_vpc.this.id 37 | name = "${local.prefix}-mwaa-no-ingress-sg" 38 | ingress { 39 | from_port = 0 40 | to_port = 0 41 | protocol = "-1" 42 | self = true 43 | } 44 | egress { 45 | from_port = 0 46 | to_port = 0 47 | protocol = "-1" 48 | cidr_blocks = [ 49 | "0.0.0.0/0" 50 | ] 51 | } 52 | 53 | tags = { 54 | Name = "${local.prefix}-mwaa-no-ingress-sg" 55 | } 56 | } 57 | 58 | resource "aws_internet_gateway" "this" { 59 | vpc_id = aws_vpc.this.id 60 | 61 | tags = { 62 | Name = "${local.prefix}-main" 63 | } 64 | } 65 | 66 | resource "aws_route_table" "public" { 67 | vpc_id = aws_vpc.this.id 68 | route { 69 | cidr_block = "0.0.0.0/0" 70 | gateway_id = aws_internet_gateway.this.id 71 | } 72 | 73 | tags = { 74 | Name = "${local.prefix}-public" 75 | } 76 | } 77 | 78 | resource "random_id" "public_subnet" { 79 | count = 2 80 | byte_length = 3 81 | } 82 | 83 | resource "random_id" "private_subnet" { 84 | count = 2 85 | byte_length = 3 86 | } 87 | 88 | resource "aws_eip" "this" { 89 | count = 2 90 | vpc = true 91 | 92 | tags = { 93 | Name = "${local.prefix}-${data.aws_availability_zones.available.names[count.index]}-nat-${random_id.private_subnet[count.index].hex}" 94 | } 95 | } 96 | 97 | resource "aws_nat_gateway" "this" { 98 | count = length(aws_subnet.public) 99 | allocation_id = aws_eip.this[count.index].id 100 | subnet_id = aws_subnet.public[count.index].id 101 | tags = { 102 | Name = "${local.prefix}-${data.aws_availability_zones.available.names[count.index]}-nat-${random_id.private_subnet[count.index].hex}" 103 | } 104 | } 105 | 106 | resource "aws_route_table" "private" { 107 | count = length(aws_nat_gateway.this) 108 | vpc_id = aws_vpc.this.id 109 | route { 110 | cidr_block = "0.0.0.0/0" 111 | nat_gateway_id = aws_nat_gateway.this[count.index].id 112 | } 113 | tags = { 114 | Name = "${local.prefix}-${data.aws_availability_zones.available.names[count.index]}-nat-${random_id.private_subnet[count.index].hex}" 115 | } 116 | } 117 | 118 | resource "aws_subnet" "public" { 119 | count = 2 120 | vpc_id = aws_vpc.this.id 121 | cidr_block = "10.0.${count.index}.0/24" 
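# Each subnet carves one /24 out of the VPC's 10.0.0.0/16: the two public subnets
# get 10.0.0.0/24 and 10.0.1.0/24, and the private subnets defined below are offset
# by the number of public subnets, giving 10.0.2.0/24 and 10.0.3.0/24.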
122 | availability_zone = data.aws_availability_zones.available.names[count.index] 123 | map_public_ip_on_launch = true 124 | 125 | tags = { 126 | Name = "${local.prefix}-${data.aws_availability_zones.available.names[count.index]}-public-${random_id.public_subnet[count.index].hex}" 127 | } 128 | } 129 | 130 | resource "aws_route_table_association" "public" { 131 | count = length(aws_subnet.public) 132 | subnet_id = aws_subnet.public[count.index].id 133 | route_table_id = aws_route_table.public.id 134 | } 135 | 136 | resource "aws_subnet" "private" { 137 | count = 2 138 | vpc_id = aws_vpc.this.id 139 | cidr_block = "10.0.${count.index + length(aws_subnet.public)}.0/24" 140 | availability_zone = data.aws_availability_zones.available.names[count.index] 141 | map_public_ip_on_launch = false 142 | 143 | tags = { 144 | Name = "${local.prefix}-${data.aws_availability_zones.available.names[count.index]}-private-${random_id.private_subnet[count.index].hex}" 145 | } 146 | } 147 | 148 | resource "aws_route_table_association" "private" { 149 | count = length(aws_subnet.private) 150 | route_table_id = aws_route_table.private[count.index].id 151 | subnet_id = aws_subnet.private[count.index].id 152 | } 153 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/outputs.tf: -------------------------------------------------------------------------------- 1 | output "state" { 2 | description = "Resources from terraform-state" 3 | value = { 4 | repository_url = aws_ecr_repository.default.repository_url 5 | scratch_bucket = aws_s3_bucket.scratch.id 6 | mwaa_bucket = aws_s3_bucket.mwaa.id 7 | github_actions_bot = aws_iam_user.cd_bot.name 8 | batch_job_queue = aws_batch_job_queue.default.name 9 | batch_job_definitions = { 10 | test = aws_batch_job_definition.default["test"].name 11 | latest = aws_batch_job_definition.default["latest"].name 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/secrets.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # AWS Secrets Manager # 3 | ################################## 4 | 5 | locals { 6 | snowflake_data = ["account", "user", "database", "warehouse", "role", "password"] 7 | 8 | jobs = ["test", "latest"] 9 | } 10 | 11 | data "aws_secretsmanager_secret" "snowflake_loader_secret" { 12 | for_each = toset(local.jobs) 13 | name = var.snowflake_loader_secret[each.key] 14 | } 15 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/variables.tf: -------------------------------------------------------------------------------- 1 | variable "owner" { 2 | description = "Owner of the resource" 3 | type = string 4 | default = "dse" 5 | } 6 | 7 | variable "project" { 8 | description = "Name of the project the resource is serving" 9 | type = string 10 | default = "infra" 11 | } 12 | 13 | variable "environment" { 14 | description = "Deployment environment of the resource" 15 | type = string 16 | default = "dev" 17 | } 18 | 19 | variable "region" { 20 | description = "Region for AWS resources" 21 | type = string 22 | default = "us-west-2" 23 | } 24 | 25 | variable "snowflake_loader_secret" { 26 | description = "ARN for SecretsManager login info to Snowflake with loader role" 27 | type = object({ test = string, latest = string }) 28 | default = null 29 | } 30 | 31 | locals { 32 | prefix = 
"${var.owner}-${var.project}-${var.environment}" 33 | } 34 | -------------------------------------------------------------------------------- /terraform/s3-remote-state/README.md: -------------------------------------------------------------------------------- 1 | # Terraform S3 remote state 2 | 3 | This Terraform module is intended to bootstrap remote state for the main terraform 4 | project in the parent directory. 5 | 6 | It uses the [S3 backend](https://developer.hashicorp.com/terraform/language/settings/backends/s3), 7 | which stores the state in an S3 bucket and uses a DynamoDB table for locking. 8 | 9 | Ideally, this should only need to be set up once per project: 10 | 11 | ```bash 12 | terraform apply 13 | ``` 14 | 15 | 16 | ## Requirements 17 | 18 | | Name | Version | 19 | |------|---------| 20 | | [terraform](#requirement\_terraform) | >= 1.0 | 21 | | [aws](#requirement\_aws) | 4.56.0 | 22 | 23 | ## Providers 24 | 25 | | Name | Version | 26 | |------|---------| 27 | | [aws](#provider\_aws) | 4.56.0 | 28 | 29 | ## Modules 30 | 31 | No modules. 32 | 33 | ## Resources 34 | 35 | | Name | Type | 36 | |------|------| 37 | | [aws_dynamodb_table.terraform_state_lock](https://registry.terraform.io/providers/hashicorp/aws/4.56.0/docs/resources/dynamodb_table) | resource | 38 | | [aws_s3_bucket.terraform_state](https://registry.terraform.io/providers/hashicorp/aws/4.56.0/docs/resources/s3_bucket) | resource | 39 | | [aws_s3_bucket_versioning.terraform_state](https://registry.terraform.io/providers/hashicorp/aws/4.56.0/docs/resources/s3_bucket_versioning) | resource | 40 | 41 | ## Inputs 42 | 43 | | Name | Description | Type | Default | Required | 44 | |------|-------------|------|---------|:--------:| 45 | | [environment](#input\_environment) | Deployment environment of the resource | `string` | n/a | yes | 46 | | [owner](#input\_owner) | Owner of the resource | `string` | `"dse"` | no | 47 | | [project](#input\_project) | Name of the project the resource is serving | `string` | n/a | yes | 48 | | [region](#input\_region) | AWS Region | `string` | `"us-west-2"` | no | 49 | 50 | ## Outputs 51 | 52 | | Name | Description | 53 | |------|-------------| 54 | | [bucket](#output\_bucket) | State bucket | 55 | | [dynamodb\_table](#output\_dynamodb\_table) | State lock | 56 | | [key](#output\_key) | State object key | 57 | | [region](#output\_region) | AWS Region | 58 | 59 | -------------------------------------------------------------------------------- /terraform/s3-remote-state/main.tf: -------------------------------------------------------------------------------- 1 | ################################ 2 | # Variables # 3 | ################################ 4 | 5 | variable "owner" { 6 | description = "Owner of the resource" 7 | type = string 8 | default = "dse" 9 | } 10 | 11 | variable "project" { 12 | description = "Name of the project the resource is serving" 13 | type = string 14 | } 15 | 16 | variable "environment" { 17 | description = "Deployment environment of the resource" 18 | type = string 19 | } 20 | 21 | variable "region" { 22 | description = "AWS Region" 23 | type = string 24 | default = "us-west-2" 25 | } 26 | 27 | locals { 28 | prefix = "${var.owner}-${var.project}-${var.environment}" 29 | } 30 | 31 | ################################ 32 | # Terraform setup # 33 | ################################ 34 | 35 | terraform { 36 | required_providers { 37 | aws = { 38 | source = "hashicorp/aws" 39 | version = "4.56.0" 40 | } 41 | } 42 | required_version = ">= 1.0" 43 | } 44 | 45 | provider 
"aws" { 46 | region = var.region 47 | 48 | default_tags { 49 | tags = { 50 | Owner = var.owner 51 | Project = var.project 52 | Environment = var.environment 53 | } 54 | } 55 | } 56 | 57 | 58 | ################################ 59 | # State backend # 60 | ################################ 61 | 62 | resource "aws_s3_bucket" "terraform_state" { 63 | bucket = "${local.prefix}-terraform-state" 64 | 65 | lifecycle { 66 | prevent_destroy = true 67 | } 68 | } 69 | 70 | resource "aws_s3_bucket_versioning" "terraform_state" { 71 | bucket = aws_s3_bucket.terraform_state.id 72 | 73 | versioning_configuration { 74 | status = "Enabled" 75 | } 76 | } 77 | 78 | resource "aws_dynamodb_table" "terraform_state_lock" { 79 | name = "${local.prefix}-terraform-state-lock" 80 | read_capacity = 1 81 | write_capacity = 1 82 | hash_key = "LockID" 83 | 84 | attribute { 85 | name = "LockID" 86 | type = "S" 87 | } 88 | } 89 | 90 | ################################ 91 | # Outputs for a tfbackend file # 92 | ################################ 93 | 94 | output "bucket" { 95 | description = "State bucket" 96 | value = aws_s3_bucket.terraform_state.bucket 97 | } 98 | 99 | output "key" { 100 | description = "State object key" 101 | value = "${local.prefix}.tfstate" 102 | } 103 | 104 | output "region" { 105 | description = "AWS Region" 106 | value = var.region 107 | } 108 | 109 | output "dynamodb_table" { 110 | description = "State lock" 111 | value = aws_dynamodb_table.terraform_state_lock.name 112 | } 113 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/snowflake-labs/snowflake" { 5 | version = "1.0.1" 6 | constraints = "~> 1.0, 1.0.1" 7 | hashes = [ 8 | "h1:KbiPdzWifjw7jNSiQlIT4b8TyDTfMOhRdYdFcOvt9lA=", 9 | "h1:RW6Tbf/g9fmY/WOQY6WktxQ9TywBpJi9Lw5O1BqnCs4=", 10 | "h1:glVuLBCPg23s0K4Vzjwy+CBLXCMRhKZHuF/6yxIof+I=", 11 | "zh:1a8c1c8d7003943d0c8ab492ec2d352f3552ae1e5be6ae2ced16da95b9859769", 12 | "zh:2bc7c58adbc504f6aa61774a7bbf99bdfbf7bbf691182d01518146bb28c8e2fa", 13 | "zh:30482878d46ac18624daf6559b2ee294aa24c7bffff5bf2d2a2133072db4aa8a", 14 | "zh:3f1f1088375fde975993029be32955881ba71d84e24db20e69bb9d437305780f", 15 | "zh:42510e778b420295461179eb97f5c436edc157c8980c7b3c0db71eb08c063d49", 16 | "zh:475ee5e75e4b93e3e939cd5b2d803e1c3f31d22963bdc49a21d4536afa6eaf90", 17 | "zh:55918ef218513ea1e2b916893aa1272e327beeeb80b205efaffcdefbb2b52ba0", 18 | "zh:651c8526a9d4bd834fa623a74737bf485fc64e383a5e32d3531cf0fa146863a9", 19 | "zh:892f03d08fdff2746e1d2acd5bf520a764a07a00e177fe1fbb2521daccd62523", 20 | "zh:a8a999d555aae9d205b0c1c2432a94c37e8630bddb4357ccaf2e44911dede481", 21 | "zh:cba89d14632697d219e4f848ac206d16cc152c65b7740fb6c5c08ed98dd054ba", 22 | "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/dse-snowflake-dev.tfbackend: -------------------------------------------------------------------------------- 1 | bucket = "dse-snowflake-dev-terraform-state" 2 | dynamodb_table = "dse-snowflake-dev-terraform-state-lock" 3 | key = "dse-snowflake-dev.tfstate" 4 | region = "us-west-2" 5 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/main.tf: -------------------------------------------------------------------------------- 1 | ############################ 2 | # Variables # 3 | ############################ 4 | 5 | variable "environment" { 6 | description = "Environment suffix" 7 | type = string 8 | } 9 | 10 | variable "account_name" { 11 | description = "Snowflake account name" 12 | type = string 13 | } 14 | 15 | variable "organization_name" { 16 | description = "Snowflake account organization" 17 | type = string 18 | } 19 | 20 | ############################ 21 | # Providers # 22 | ############################ 23 | 24 | terraform { 25 | required_providers { 26 | snowflake = { 27 | source = "Snowflake-Labs/snowflake" 28 | version = "1.0.1" 29 | } 30 | } 31 | required_version = ">= 1.0" 32 | 33 | backend "s3" { 34 | } 35 | } 36 | 37 | # This provider is intentionally low-permission. In Snowflake, object creators are 38 | # the default owners of the object. To control the owner, we create different provider 39 | # blocks with different roles, and require that all snowflake resources explicitly 40 | # flag the role they want for the creator. 41 | provider "snowflake" { 42 | account_name = var.account_name 43 | organization_name = var.organization_name 44 | role = "PUBLIC" 45 | } 46 | 47 | # Snowflake provider for account administration (to be used only when necessary). 48 | provider "snowflake" { 49 | alias = "accountadmin" 50 | role = "ACCOUNTADMIN" 51 | account_name = var.account_name 52 | organization_name = var.organization_name 53 | } 54 | 55 | # Snowflake provider for creating databases, warehouses, etc. 
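# Illustrative sketch of how these aliases get used (the resource below is
# hypothetical and not defined in this configuration): a resource selects its
# owning role by referencing one of the aliased providers, e.g.
#
#   resource "snowflake_database" "example" {
#     provider = snowflake.sysadmin
#     name     = "EXAMPLE_DB"
#   }
#
# so the object is created by, and therefore owned by, SYSADMIN rather than PUBLIC.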
56 | provider "snowflake" { 57 | alias = "sysadmin" 58 | role = "SYSADMIN" 59 | account_name = var.account_name 60 | organization_name = var.organization_name 61 | } 62 | 63 | # Snowflake provider for managing grants to roles. 64 | provider "snowflake" { 65 | alias = "securityadmin" 66 | role = "SECURITYADMIN" 67 | account_name = var.account_name 68 | organization_name = var.organization_name 69 | } 70 | 71 | # Snowflake provider for managing user accounts and roles. 72 | provider "snowflake" { 73 | alias = "useradmin" 74 | role = "USERADMIN" 75 | account_name = var.account_name 76 | organization_name = var.organization_name 77 | } 78 | 79 | ############################ 80 | # Environment # 81 | ############################ 82 | 83 | module "elt" { 84 | source = "../../modules/elt" 85 | providers = { 86 | snowflake.accountadmin = snowflake.accountadmin, 87 | snowflake.securityadmin = snowflake.securityadmin, 88 | snowflake.sysadmin = snowflake.sysadmin, 89 | snowflake.useradmin = snowflake.useradmin, 90 | } 91 | 92 | environment = var.environment 93 | } 94 | 95 | ############################################################## 96 | # Assign LOGGER role to TRANSFORMER role 97 | # This is only needed for the ODI default Snowflake instance 98 | # More background information related to this is found 99 | # here - https://github.com/cagov/data-infrastructure/issues/428 100 | ############################################################## 101 | 102 | resource "snowflake_grant_account_role" "logger_to_transformer" { 103 | provider = snowflake.useradmin 104 | role_name = "LOGGER_${var.environment}" 105 | parent_role_name = "TRANSFORMER_${var.environment}" 106 | } 107 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/remote-state/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates.
3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "4.56.0" 6 | constraints = "4.56.0" 7 | hashes = [ 8 | "h1:CnpBvf3mH16Kcez47OsjmIeGkY2PUVihKRwbkyOvo48=", 9 | "h1:koDunHl/LUmCAKy3VFie6MakXN7ng93v8HBRpKI8He8=", 10 | "h1:v6DE95Ll2mxE96IGUsT/h6WQTU1d2cfHydWah1FgT20=", 11 | "zh:1d2b7693a102da015a86b9235b554272b9280597011216c3ddd1a6dc95ad8dab", 12 | "zh:28c3e8ebaa077f65c4ac5fd051c95887070293fcff0386dfc2e4b7e248a0aefa", 13 | "zh:2a620bc4a87be06e7acac1bc15e966dba45df643bf6c3efb811e74e6c2122b03", 14 | "zh:30d3ac148fa0634e7ba1de66e1af1328481c92cd774adcfc0e27f828103b17e0", 15 | "zh:3d3eebf916f25e11b12dd3c692f8fe1e4c4e9a0c414af9d0d881ddebd28dcd39", 16 | "zh:3f4600f2881c02fcc69080df68747c9a0b9b11cb002117fd918b7800f2ac402b", 17 | "zh:7156fb12c3b4f2964f7e78cee97f31d95b43045467f90749d2ed545725c36baa", 18 | "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", 19 | "zh:a5bbc84fd37d468c7b016009776b6d2a287bbb746af81aba786cdf8eb5fce4a1", 20 | "zh:d5322bcd4e11caddbbfaa1198893824d4b4d28f504517a3a87902cf86d75bd87", 21 | "zh:d766eb9f86a40060d63e12ef674d7c9c47ec4e47ade487f1f49af8c89b441711", 22 | "zh:df23f592b99f6617f09e449009bbb49068a69fc926b15ca29e30b068c9c67365", 23 | "zh:e7b0acee2d98549731547259b539f598e18db07c0c202d3a34b922beff711054", 24 | "zh:ec317f79fdcce934c39458ea312862e7f7ec48cafb8bcc9b5a00d9b78b629d81", 25 | "zh:f78ec7a771867d96dfee96bf74523341ba42feeb64ce2f108b5bf2e7ebef0fef", 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/remote-state/main.tf: -------------------------------------------------------------------------------- 1 | ../../../../s3-remote-state/main.tf -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/remote-state/terraform.tfvars: -------------------------------------------------------------------------------- 1 | owner = "dse" 2 | environment = "dev" 3 | project = "snowflake" 4 | region = "us-west-2" 5 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/terraform.tfvars: -------------------------------------------------------------------------------- 1 | account_name = "HJB86910" 2 | environment = "DEV" 3 | organization_name= "VSB79059" 4 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/snowflake-labs/snowflake" { 5 | version = "1.0.1" 6 | constraints = "~> 1.0, 1.0.1" 7 | hashes = [ 8 | "h1:KbiPdzWifjw7jNSiQlIT4b8TyDTfMOhRdYdFcOvt9lA=", 9 | "h1:RW6Tbf/g9fmY/WOQY6WktxQ9TywBpJi9Lw5O1BqnCs4=", 10 | "h1:glVuLBCPg23s0K4Vzjwy+CBLXCMRhKZHuF/6yxIof+I=", 11 | "zh:1a8c1c8d7003943d0c8ab492ec2d352f3552ae1e5be6ae2ced16da95b9859769", 12 | "zh:2bc7c58adbc504f6aa61774a7bbf99bdfbf7bbf691182d01518146bb28c8e2fa", 13 | "zh:30482878d46ac18624daf6559b2ee294aa24c7bffff5bf2d2a2133072db4aa8a", 14 | "zh:3f1f1088375fde975993029be32955881ba71d84e24db20e69bb9d437305780f", 15 | "zh:42510e778b420295461179eb97f5c436edc157c8980c7b3c0db71eb08c063d49", 16 | "zh:475ee5e75e4b93e3e939cd5b2d803e1c3f31d22963bdc49a21d4536afa6eaf90", 17 | "zh:55918ef218513ea1e2b916893aa1272e327beeeb80b205efaffcdefbb2b52ba0", 18 | "zh:651c8526a9d4bd834fa623a74737bf485fc64e383a5e32d3531cf0fa146863a9", 19 | "zh:892f03d08fdff2746e1d2acd5bf520a764a07a00e177fe1fbb2521daccd62523", 20 | "zh:a8a999d555aae9d205b0c1c2432a94c37e8630bddb4357ccaf2e44911dede481", 21 | "zh:cba89d14632697d219e4f848ac206d16cc152c65b7740fb6c5c08ed98dd054ba", 22 | "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/dse-snowflake-prd.tfbackend: -------------------------------------------------------------------------------- 1 | bucket = "dse-snowflake-prd-terraform-state" 2 | dynamodb_table = "dse-snowflake-prd-terraform-state-lock" 3 | key = "dse-snowflake-prd.tfstate" 4 | region = "us-west-2" 5 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/remote-state/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "4.56.0" 6 | constraints = "4.56.0" 7 | hashes = [ 8 | "h1:CnpBvf3mH16Kcez47OsjmIeGkY2PUVihKRwbkyOvo48=", 9 | "h1:koDunHl/LUmCAKy3VFie6MakXN7ng93v8HBRpKI8He8=", 10 | "h1:v6DE95Ll2mxE96IGUsT/h6WQTU1d2cfHydWah1FgT20=", 11 | "zh:1d2b7693a102da015a86b9235b554272b9280597011216c3ddd1a6dc95ad8dab", 12 | "zh:28c3e8ebaa077f65c4ac5fd051c95887070293fcff0386dfc2e4b7e248a0aefa", 13 | "zh:2a620bc4a87be06e7acac1bc15e966dba45df643bf6c3efb811e74e6c2122b03", 14 | "zh:30d3ac148fa0634e7ba1de66e1af1328481c92cd774adcfc0e27f828103b17e0", 15 | "zh:3d3eebf916f25e11b12dd3c692f8fe1e4c4e9a0c414af9d0d881ddebd28dcd39", 16 | "zh:3f4600f2881c02fcc69080df68747c9a0b9b11cb002117fd918b7800f2ac402b", 17 | "zh:7156fb12c3b4f2964f7e78cee97f31d95b43045467f90749d2ed545725c36baa", 18 | "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", 19 | "zh:a5bbc84fd37d468c7b016009776b6d2a287bbb746af81aba786cdf8eb5fce4a1", 20 | "zh:d5322bcd4e11caddbbfaa1198893824d4b4d28f504517a3a87902cf86d75bd87", 21 | "zh:d766eb9f86a40060d63e12ef674d7c9c47ec4e47ade487f1f49af8c89b441711", 22 | "zh:df23f592b99f6617f09e449009bbb49068a69fc926b15ca29e30b068c9c67365", 23 | "zh:e7b0acee2d98549731547259b539f598e18db07c0c202d3a34b922beff711054", 24 | "zh:ec317f79fdcce934c39458ea312862e7f7ec48cafb8bcc9b5a00d9b78b629d81", 25 | "zh:f78ec7a771867d96dfee96bf74523341ba42feeb64ce2f108b5bf2e7ebef0fef", 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/remote-state/main.tf: -------------------------------------------------------------------------------- 1 | ../../../../s3-remote-state/main.tf -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/remote-state/terraform.tfvars: -------------------------------------------------------------------------------- 1 | owner = "dse" 2 | environment = "prd" 3 | project = "snowflake" 4 | region = "us-west-2" 5 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/terraform.tfvars: -------------------------------------------------------------------------------- 1 | account_name = "HJB86910" 2 | environment = "PRD" 3 | organization_name= "VSB79059" 4 | okta_integration_name= "OKTAINTEGRATION" 5 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/database/outputs.tf: -------------------------------------------------------------------------------- 1 | output "name" { 2 | description = "Database name" 3 | value = snowflake_database.this.name 4 | } 5 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/database/variables.tf: -------------------------------------------------------------------------------- 1 | variable "name" { 2 | description = "Database name" 3 | type = string 4 | } 5 | 6 | variable "comment" { 7 | description = "Comment to apply to warehouse" 8 | type = string 9 | default = null 10 | } 11 | 12 | variable "data_retention_time_in_days" { 13 | description = "Data retention time in days" 14 | type = number 15 | default = 7 16 | } 17 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/elt/databases.tf: -------------------------------------------------------------------------------- 1 | ####################################### 2 | # Databases # 3 | 
####################################### 4 | 5 | # The primary database where transformation tools like dbt operate. 6 | module "transform" { 7 | source = "../database" 8 | providers = { 9 | snowflake.securityadmin = snowflake.securityadmin, 10 | snowflake.sysadmin = snowflake.sysadmin, 11 | snowflake.useradmin = snowflake.useradmin, 12 | } 13 | name = "TRANSFORM_${var.environment}" 14 | comment = "Transformation database" 15 | data_retention_time_in_days = 7 16 | } 17 | 18 | # The primary raw database, where ELT tools land data. 19 | module "raw" { 20 | source = "../database" 21 | providers = { 22 | snowflake.securityadmin = snowflake.securityadmin, 23 | snowflake.sysadmin = snowflake.sysadmin, 24 | snowflake.useradmin = snowflake.useradmin, 25 | } 26 | name = "RAW_${var.environment}" 27 | comment = "Raw database, intended for ingest of raw data from source systems prior to any modeling or transformation" 28 | data_retention_time_in_days = 7 29 | } 30 | 31 | # The primary reporting database. 32 | module "analytics" { 33 | source = "../database" 34 | providers = { 35 | snowflake.securityadmin = snowflake.securityadmin, 36 | snowflake.sysadmin = snowflake.sysadmin, 37 | snowflake.useradmin = snowflake.useradmin, 38 | } 39 | name = "ANALYTICS_${var.environment}" 40 | comment = "Analytics database for data consumers, holding analysis-ready data marts/models" 41 | data_retention_time_in_days = 7 42 | } 43 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/elt/main.tf: -------------------------------------------------------------------------------- 1 | ###################################### 2 | # Terraform # 3 | ###################################### 4 | 5 | terraform { 6 | required_providers { 7 | snowflake = { 8 | source = "Snowflake-Labs/snowflake" 9 | version = "~> 1.0" 10 | configuration_aliases = [ 11 | snowflake.accountadmin, 12 | snowflake.securityadmin, 13 | snowflake.sysadmin, 14 | snowflake.useradmin, 15 | ] 16 | } 17 | } 18 | required_version = ">= 1.0" 19 | } 20 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/elt/users.tf: -------------------------------------------------------------------------------- 1 | ###################################### 2 | # Service Accounts/Users # 3 | ###################################### 4 | 5 | resource "snowflake_service_user" "dbt" { 6 | provider = snowflake.useradmin 7 | name = "DBT_CLOUD_SVC_USER_${var.environment}" 8 | comment = "Service user for dbt Cloud" 9 | lifecycle { 10 | ignore_changes = [rsa_public_key] 11 | } 12 | 13 | default_warehouse = module.transforming["XS"].name 14 | default_role = snowflake_account_role.transformer.name 15 | } 16 | 17 | resource "snowflake_service_user" "airflow" { 18 | provider = snowflake.useradmin 19 | name = "MWAA_SVC_USER_${var.environment}" 20 | comment = "Service user for Airflow" 21 | lifecycle { 22 | ignore_changes = [rsa_public_key] 23 | } 24 | 25 | default_warehouse = module.loading["XS"].name 26 | default_role = snowflake_account_role.loader.name 27 | } 28 | 29 | resource "snowflake_service_user" "fivetran" { 30 | provider = snowflake.useradmin 31 | name = "FIVETRAN_SVC_USER_${var.environment}" 32 | comment = "Service user for Fivetran" 33 | lifecycle { 34 | ignore_changes = [rsa_public_key] 35 | } 36 | 37 | default_warehouse = module.loading["XS"].name 38 | default_role = snowflake_account_role.loader.name 39 | } 40 | 41 | resource "snowflake_service_user" "github_ci" { 42 | 
provider = snowflake.useradmin 43 | name = "GITHUB_ACTIONS_SVC_USER_${var.environment}" 44 | comment = "Service user for GitHub CI" 45 | lifecycle { 46 | ignore_changes = [rsa_public_key] 47 | } 48 | 49 | default_warehouse = module.reporting["XS"].name 50 | default_role = snowflake_account_role.reader.name 51 | } 52 | 53 | resource "snowflake_legacy_service_user" "sentinel" { 54 | provider = snowflake.useradmin 55 | name = "SENTINEL_SVC_USER_${var.environment}" 56 | comment = "Service user for Sentinel" 57 | lifecycle { 58 | ignore_changes = [rsa_public_key] 59 | } 60 | 61 | default_warehouse = module.logging.name 62 | default_role = snowflake_account_role.logger.name 63 | } 64 | 65 | ###################################### 66 | # Role Grants # 67 | ###################################### 68 | 69 | resource "snowflake_grant_account_role" "transformer_to_dbt" { 70 | provider = snowflake.useradmin 71 | role_name = snowflake_account_role.transformer.name 72 | user_name = snowflake_service_user.dbt.name 73 | } 74 | 75 | resource "snowflake_grant_account_role" "loader_to_airflow" { 76 | provider = snowflake.useradmin 77 | role_name = snowflake_account_role.loader.name 78 | user_name = snowflake_service_user.airflow.name 79 | } 80 | 81 | resource "snowflake_grant_account_role" "loader_to_fivetran" { 82 | provider = snowflake.useradmin 83 | role_name = snowflake_account_role.loader.name 84 | user_name = snowflake_service_user.fivetran.name 85 | } 86 | 87 | resource "snowflake_grant_account_role" "reader_to_github_ci" { 88 | provider = snowflake.useradmin 89 | role_name = snowflake_account_role.reader.name 90 | user_name = snowflake_service_user.github_ci.name 91 | } 92 | 93 | resource "snowflake_grant_account_role" "logger_to_sentinel" { 94 | provider = snowflake.useradmin 95 | role_name = snowflake_account_role.logger.name 96 | user_name = snowflake_legacy_service_user.sentinel.name 97 | } 98 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/elt/variables.tf: -------------------------------------------------------------------------------- 1 | variable "environment" { 2 | description = "Environment suffix" 3 | type = string 4 | } 5 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/elt/warehouses.tf: -------------------------------------------------------------------------------- 1 | ################################# 2 | # Warehouses # 3 | ################################# 4 | 5 | #X-Small: Good for small tasks and experimenting. 6 | #Small: Suitable for single-user workloads and development. 7 | #Medium: Handles moderate concurrency and data volumes. 8 | #Large: Manages larger queries and higher concurrency. 9 | #X-Large: Powerful for demanding workloads and data-intensive operations. 10 | #2X-Large: Double the capacity of X-Large. 11 | #3X-Large: Triple the capacity of X-Large. 12 | #4X-Large: Quadruple the capacity of X-Large. 
13 | 14 | locals { 15 | sizes = { 16 | "XS" = "X-SMALL", 17 | "S" = "SMALL", 18 | "M" = "MEDIUM", 19 | "L" = "LARGE", 20 | "XL" = "X-LARGE", 21 | "2XL" = "2X-LARGE", 22 | "3XL" = "3X-LARGE", 23 | "4XL" = "4X-LARGE", 24 | } 25 | } 26 | 27 | # Primary warehouse for loading data to Snowflake from ELT/ETL tools 28 | module "loading" { 29 | source = "../warehouse" 30 | for_each = local.sizes 31 | providers = { 32 | snowflake.securityadmin = snowflake.securityadmin, 33 | snowflake.sysadmin = snowflake.sysadmin, 34 | snowflake.useradmin = snowflake.useradmin, 35 | } 36 | 37 | name = "LOADING_${each.key}_${var.environment}" 38 | comment = "Primary warehouse for loading data to Snowflake from ELT/ETL tools" 39 | size = each.value 40 | } 41 | 42 | # Primary warehouse for transforming data. Analytics engineers and automated 43 | # transformation tools should use this warehouse. 44 | module "transforming" { 45 | source = "../warehouse" 46 | for_each = local.sizes 47 | providers = { 48 | snowflake.securityadmin = snowflake.securityadmin, 49 | snowflake.sysadmin = snowflake.sysadmin, 50 | snowflake.useradmin = snowflake.useradmin, 51 | } 52 | 53 | name = "TRANSFORMING_${each.key}_${var.environment}" 54 | comment = "Primary warehouse for transforming data. Analytics engineers and automated transformation tools should use this warehouse" 55 | size = each.value 56 | } 57 | 58 | # Primary warehouse for reporting. End-users and BI tools should use this warehouse. 59 | module "reporting" { 60 | source = "../warehouse" 61 | for_each = local.sizes 62 | providers = { 63 | snowflake.securityadmin = snowflake.securityadmin, 64 | snowflake.sysadmin = snowflake.sysadmin, 65 | snowflake.useradmin = snowflake.useradmin, 66 | } 67 | 68 | name = "REPORTING_${each.key}_${var.environment}" 69 | comment = "Primary warehouse for reporting. End-users and BI tools should use this warehouse" 70 | size = each.value 71 | } 72 | 73 | # Primary warehouse for logging. Logging tools like Sentinel should use this warehouse. 74 | module "logging" { 75 | source = "../warehouse" 76 | providers = { 77 | snowflake.securityadmin = snowflake.securityadmin, 78 | snowflake.sysadmin = snowflake.sysadmin, 79 | snowflake.useradmin = snowflake.useradmin, 80 | } 81 | 82 | name = "LOGGING_XS_${var.environment}" 83 | comment = "Primary warehouse for logging. Logging tools like Sentinel should use this warehouse." 84 | size = "X-SMALL" 85 | auto_suspend = 1 86 | } 87 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/warehouse/main.tf: -------------------------------------------------------------------------------- 1 | ###################################### 2 | # Terraform # 3 | ###################################### 4 | 5 | terraform { 6 | required_providers { 7 | snowflake = { 8 | source = "Snowflake-Labs/snowflake" 9 | version = "~> 1.0" 10 | configuration_aliases = [ 11 | snowflake.securityadmin, 12 | snowflake.sysadmin, 13 | snowflake.useradmin, 14 | ] 15 | } 16 | } 17 | required_version = ">= 1.0" 18 | } 19 | 20 | ###################################### 21 | # Permissions # 22 | ###################################### 23 | 24 | locals { 25 | # Permissions to use a data warehouse. No role is given the MODIFY permission, 26 | # instead warehouses should be treated as stateless, and if we need a larger 27 | # one it should be created individually. 
28 | warehouse = { 29 | MOU = ["MONITOR", "OPERATE", "USAGE"] 30 | } 31 | } 32 | 33 | ################################# 34 | # Warehouses # 35 | ################################# 36 | 37 | 38 | resource "snowflake_warehouse" "this" { 39 | name = var.name 40 | provider = snowflake.sysadmin 41 | auto_suspend = var.auto_suspend 42 | auto_resume = true 43 | initially_suspended = true 44 | comment = var.comment 45 | warehouse_size = var.size 46 | } 47 | 48 | ################################# 49 | # Warehouse Access Roles # 50 | ################################# 51 | 52 | # Monitoring, usage, and operating permissions for the warehouse. 53 | resource "snowflake_account_role" "this" { 54 | name = "${var.name}_WH_MOU" 55 | provider = snowflake.useradmin 56 | comment = "Monitoring, usage, and operating permissions for the ${var.name} warehouse" 57 | } 58 | 59 | ################################# 60 | # Role Grants # 61 | ################################# 62 | 63 | resource "snowflake_grant_account_role" "this_to_sysadmin" { 64 | provider = snowflake.useradmin 65 | role_name = snowflake_account_role.this.name 66 | parent_role_name = "SYSADMIN" 67 | } 68 | 69 | ################################# 70 | # Warehouse Grants # 71 | ################################# 72 | 73 | resource "snowflake_grant_privileges_to_account_role" "this" { 74 | provider = snowflake.securityadmin 75 | privileges = local.warehouse.MOU 76 | account_role_name = snowflake_account_role.this.name 77 | on_account_object { 78 | object_type = "WAREHOUSE" 79 | object_name = snowflake_warehouse.this.name 80 | } 81 | with_grant_option = false 82 | } 83 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/warehouse/outputs.tf: -------------------------------------------------------------------------------- 1 | output "access_role_name" { 2 | description = "Warehouse access_role" 3 | value = snowflake_account_role.this.name 4 | } 5 | 6 | output "name" { 7 | description = "Warehouse name" 8 | value = snowflake_warehouse.this.name 9 | } 10 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/warehouse/variables.tf: -------------------------------------------------------------------------------- 1 | variable "name" { 2 | description = "Warehouse name" 3 | type = string 4 | } 5 | 6 | variable "comment" { 7 | description = "Comment to apply to warehouse" 8 | type = string 9 | default = null 10 | } 11 | 12 | variable "auto_suspend" { 13 | description = "Auto-suspend time for warehouse" 14 | type = number 15 | default = 300 16 | } 17 | 18 | variable "size" { 19 | description = "Size of warehouse" 20 | type = string 21 | default = "x-small" 22 | } 23 | -------------------------------------------------------------------------------- /transform/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | dbt_packages/ 3 | logs/ 4 | .vscode/ 5 | -------------------------------------------------------------------------------- /transform/.sqlfluff: -------------------------------------------------------------------------------- 1 | [sqlfluff] 2 | dialect = snowflake 3 | templater = dbt 4 | max_line_length = 120 5 | 6 | # I decide the column ordering! 7 | exclude_rules = structure.column_order 8 | 9 | # Probably a controversial exclusion, so adding some justification here: 10 | # This disables the rule that prevents unreserved keywords from being 11 | # used as column aliases.
The rule is intended to prevent accidental shadowing 12 | # of SQL keywords, which, honestly, sounds like a good idea! **However**, 13 | # this can result in some awkward contortions in final dataset column names, 14 | # which are intended for end users who shouldn't care about our query language 15 | # limitations. Since SQL tends to have quite a long list of keywords, the 16 | # restriction actually prevents some quite natural column names, reducing 17 | # legibility for end users (e.g., date, timestamp, name). 18 | [sqlfluff:rules:references.keywords] 19 | quoted_identifiers_policy = none 20 | unquoted_identifiers_policy = none 21 | -------------------------------------------------------------------------------- /transform/.sqlfluffignore: -------------------------------------------------------------------------------- 1 | target/ 2 | dbt_packages/ 3 | -------------------------------------------------------------------------------- /transform/README.md: -------------------------------------------------------------------------------- 1 | # CalData dbt project 2 | 3 | This is the primary dbt project for the CalData Data Services and Engineering (DSE) team. 4 | It targets Snowflake as its data warehouse. 5 | Linting and testing are driven through GitHub Actions. 6 | 7 | ## Building the docs 8 | 9 | To build and view the docs locally, run 10 | 11 | ```bash 12 | (dbt docs generate && cd target/ && python -m http.server) 13 | ``` 14 | 15 | in a terminal, then navigate to `http://localhost:8000` in your web browser. 16 | 17 | ## Resources: 18 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 19 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 20 | - Join the [dbt community](http://community.getdbt.com/) to learn from other analytics engineers 21 | - Find [dbt events](https://events.getdbt.com) near you 22 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 23 | -------------------------------------------------------------------------------- /transform/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/transform/analyses/.gitkeep -------------------------------------------------------------------------------- /transform/dbt_project.yml: -------------------------------------------------------------------------------- 1 | # Project name 2 | name: "dse_analytics" 3 | version: "1.0.0" 4 | config-version: 2 5 | 6 | flags: 7 | send_anonymous_usage_stats: false 8 | use_colors: true 9 | warn_error: false 10 | state_modified_compare_more_unrendered_values: true 11 | skip_nodes_if_on_run_start_fails: true 12 | require_explicit_package_overrides_for_builtin_materializations: true 13 | source_freshness_run_project_hooks: false 14 | 15 | # This setting configures which "profile" dbt uses for this project. 16 | profile: "dse_snowflake" 17 | 18 | # These configurations specify where dbt should look for different types of files. 19 | # The `model-paths` config, for example, states that models in this project can be 20 | # found in the "models/" directory. You probably won't need to change these!
21 | model-paths: ["models"] 22 | analysis-paths: ["analyses"] 23 | test-paths: ["tests"] 24 | seed-paths: ["seeds"] 25 | macro-paths: ["macros"] 26 | snapshot-paths: ["snapshots"] 27 | 28 | target-path: "target" # directory which will store compiled SQL files 29 | clean-targets: # directories to be removed by `dbt clean` 30 | - "target" 31 | - "dbt_packages" 32 | 33 | models: 34 | dse_analytics: 35 | staging: 36 | +database: "{{ env_var('DBT_TRANSFORM_DB', 'TRANSFORM_DEV') }}" 37 | department_of_finance: 38 | +schema: department_of_finance 39 | snowflake_cost_tracking: 40 | +schema: snowflake_cost_tracking 41 | 42 | # These staging models are a little unusual for two reasons: 43 | # 44 | # 1. They are incremental 45 | # 2. They do some very light aggregation 46 | # 47 | # We do this because the source views in the SNOWFLAKE meta-database 48 | # have a retention time of one year, and don't have very strong 49 | # uniqueness constraints for their data grain. By making the models 50 | # incremental we ensure that we retain data that is older than the retention 51 | # time. By aggregating to the usage date (and table/warehouse, if applicable), 52 | # we ensure that we can correctly merge in the incremental updates 53 | # without resulting in duplicated rows. 54 | +materialized: incremental 55 | 56 | # Never do a full refresh so that we avoid overwriting any old data. 57 | # Otherwise we risk losing data beyond the 1 year retention window 58 | +full_refresh: false 59 | 60 | intermediate: 61 | +database: "{{ env_var('DBT_TRANSFORM_DB', 'TRANSFORM_DEV') }}" 62 | state_entities: 63 | +schema: state_entities 64 | snowflake_cost_tracking: 65 | +schema: snowflake_cost_tracking 66 | +materialized: view 67 | 68 | marts: 69 | # All marts models as tables to avoid needing write access to TRANSFORM 70 | # https://community.snowflake.com/s/article/SQL-compilation-error-Failure-during-expansion-of-view-mySecureView 71 | +materialized: table 72 | +database: "{{ env_var('DBT_ANALYTICS_DB', 'ANALYTICS_DEV') }}" 73 | state_entities: 74 | +schema: state_entities 75 | snowflake_cost_tracking: 76 | +schema: snowflake_cost_tracking 77 | +materialized: table 78 | -------------------------------------------------------------------------------- /transform/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/transform/macros/.gitkeep -------------------------------------------------------------------------------- /transform/macros/_macros.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | macros: 4 | - name: spatial_join_with_deduplication 5 | description: | 6 | Macro to perform a spatial join between two relations with deduplication of the 7 | geometries in the left table. For all left geometries that satisfy the predicate 8 | for more than one geometry in the right table, we compute their intersection and 9 | then choose the left geometry with the greatest intersection. 10 | arguments: 11 | - name: left_model 12 | type: string 13 | description: The left model to join. Can be a relation or CTE. 14 | - name: right_model 15 | type: string 16 | description: The right model to join. Can be a relation or CTE. 
17 | - name: left_cols 18 | type: list of strings 19 | description: | 20 | List columns to keep from the left table 21 | (excluding the geometry column, which is always retained) 22 | - name: right_cols 23 | type: list of strings 24 | description: | 25 | List of columns to keep from the right table 26 | (excluding the geometry column, which is never retained). 27 | Cannot share any names with left_cols 28 | - name: left_geom 29 | type: string 30 | description: The name of the left geometry column, defaults to "geometry" 31 | - name: right_geom 32 | type: string 33 | description: The name of the right geometry column, defaults to "geometry" 34 | - name: op 35 | description: | 36 | The spatial predicate function to choose, 37 | defaults to "st_intersects" 38 | - name: kind 39 | type: string 40 | description: The kind of join, either "left" or "inner". Defaults to "left" 41 | - name: prefix 42 | type: string 43 | description: | 44 | An optional prefix to give to temporary CTEs to improve legibility and 45 | avoid name collisions. 46 | -------------------------------------------------------------------------------- /transform/macros/get_custom_schema.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_schema_name(custom_schema_name, node) -%} 2 | 3 | {# 4 | Definitions: 5 | - custom_schema_name: schema provided via dbt_project.yml or model config 6 | - target.name: name of the target (dev for local development, prod for production, etc.) 7 | - target.schema: schema provided by the target defined in profiles.yml 8 | 9 | Rather than write to a schema prefixed with target.schema, we instead just write 10 | to the actual schema name, and get safety by separating dev and prod databases. 11 | If we start to experience analytics engineers stepping on each others toes in 12 | dev, we may want to restore prefixes there (while maintaining a prefix-free 13 | lifestyle in prod). 14 | #} 15 | {%- if custom_schema_name is none -%} {{ target.schema.lower() | trim }} 16 | 17 | {%- elif target.name == 'prd' -%} {{ custom_schema_name.lower() | trim }} 18 | 19 | {%- else -%} {{ target.schema.lower() | trim }}_{{ custom_schema_name | trim }} 20 | 21 | {%- endif -%} 22 | 23 | {%- endmacro %} 24 | -------------------------------------------------------------------------------- /transform/macros/map_class_fp.sql: -------------------------------------------------------------------------------- 1 | {% macro map_class_fips(class_fips, k, v) -%} 2 | 3 | {# 4 | Class Codes source: https://www.census.gov/library/reference/code-lists/class-codes.html 5 | #} 6 | 7 | {% set class_fips_dict = { 8 | "M2" : "A military or other defense installation entirely within a place", 9 | "C1" : "An active incorporated place that does not serve as a county subdivision equivalent", 10 | "U1" : "A census designated place with an official federally recognized name", 11 | "U2" : "A census designated place without an official federally recognized name" 12 | } -%} 13 | 14 | case 15 | {% for k, v in class_fips_dict.items() -%} 16 | when "{{ class_fips }}" = '{{ k }}' 17 | then '{{ v }}' 18 | {% endfor -%} 19 | end 20 | 21 | {%- endmacro %} 22 | -------------------------------------------------------------------------------- /transform/macros/spatial_join_with_deduplication.sql: -------------------------------------------------------------------------------- 1 | {# Macro to perform a spatial join between two relations with deduplication of the 2 | geometries in the left table. 
For all left geometries that satisfy the predicate for 3 | more than one geometry in the right table, we compute their intersection and then 4 | choose the left geometry with the greatest intersection. 5 | #} 6 | 7 | {% macro spatial_join_with_deduplication(left_model, right_model, left_cols, right_cols, left_geom="geometry", right_geom="geometry", op="st_intersects", kind="left", prefix="") %} 8 | 9 | with {{ prefix }}_left_model_with_id as ( 10 | select 11 | /* Generate a temporary ID for footprints. We will need this to group/partition 12 | by unique footprints further down. We could use a UUID, but integers are 13 | cheaper to generate and compare. */ 14 | *, seq4() as _tmp_sjoin_id 15 | from {{ left_model }} 16 | ), 17 | 18 | {{ prefix }}_joined as ( 19 | select 20 | {% for lcol in left_cols -%} 21 | {{ prefix }}_left_model_with_id.{{ lcol }}, 22 | {% endfor -%} 23 | {% for rcol in right_cols -%} 24 | {{ right_model }}.{{ rcol }}, 25 | {% endfor -%} 26 | {{ prefix }}_left_model_with_id.{{ left_geom }}, 27 | /* We don't actually need the intersection for every geometry, only for the 28 | ones that intersect more than one. However, in order to establish which 29 | ones intersect more than one, we need a windowed COUNT partitioned by 30 | _tmp_sjoin_id. This is an expensive operation, as it likely triggers a shuffle 31 | (even though it should already be sorted by _tmp_id). In testing we've found 32 | that it's cheaper to just do the intersection for all the geometries. */ 33 | st_area( 34 | st_intersection({{ prefix }}_left_model_with_id.{{ left_geom }}, {{ right_model }}.{{ right_geom }}) 35 | ) as _tmp_sjoin_intersection, 36 | {{ prefix }}_left_model_with_id._tmp_sjoin_id 37 | from {{ prefix }}_left_model_with_id 38 | {{ kind }} join {{ right_model }} 39 | on {{ op }}({{ prefix }}_left_model_with_id.{{ left_geom }}, {{ right_model }}.{{ right_geom }}) 40 | ), 41 | 42 | {{ prefix }}_deduplicated as ( 43 | select 44 | -- Snowflake doesn't support geometries in max_by. It should, but it doesn't. 45 | -- Fortunately, we know that the geometries are identical when partitioned 46 | -- by _tmp_sjoin_id, so we can just choose any_value. 47 | any_value({{ left_geom }}) as {{ left_geom }}, 48 | {% for lcol in left_cols -%} 49 | -- max_by returns null if all the values in a group are null. So if we have a left 50 | -- join, we need to guard against nulls with a coalesce to return the single value 51 | max_by({{ lcol }}, coalesce(_tmp_sjoin_intersection, 1.0)) as {{ lcol }}, 52 | {% endfor -%} 53 | {% for rcol in right_cols -%} 54 | -- max_by returns null if all the values in a group are null. So if we have a left 55 | -- join, we need to guard against nulls with a coalesce to return the single value 56 | max_by({{ rcol }}, coalesce(_tmp_sjoin_intersection, 1.0)) as {{ rcol }}{{ "," if not loop.last }} 57 | {% endfor -%} 58 | from {{ prefix }}_joined 59 | group by _tmp_sjoin_id 60 | ) 61 | 62 | select * from {{ prefix }}_deduplicated 63 | {%- endmacro -%} 64 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/_snowflake_cost_tracking.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: int_automatic_clustering_history 5 | description: | 6 | Credits used by automatic clustering, aggregated to account and usage date. 
7 | columns: 8 | - name: organization_name 9 | description: Organization name 10 | - name: account_name 11 | description: Account name 12 | - name: usage_date 13 | description: The date on which the usage occurred. 14 | - name: credits_used 15 | description: The total credits used for automatic clustering 16 | 17 | - name: int_materialized_view_refresh_history 18 | description: | 19 | Credits used by materialized view refreshes, aggregated to account and usage date. 20 | columns: 21 | - name: organization_name 22 | description: Organization name 23 | - name: account_name 24 | description: Account name 25 | - name: usage_date 26 | description: The date on which the usage occurred. 27 | - name: credits_used 28 | description: The total credits used for materialized view refreshes 29 | 30 | - name: int_pipe_usage_history 31 | description: | 32 | Credits used by pipes, aggregated to account and usage date. 33 | columns: 34 | - name: organization_name 35 | description: Organization name 36 | - name: account_name 37 | description: Account name 38 | - name: usage_date 39 | description: The date on which the usage occurred. 40 | - name: credits_used 41 | description: The total credits used by pipes 42 | 43 | - name: int_storage_daily_history 44 | description: | 45 | Credits used by storage, aggregated to account and usage date. 46 | columns: 47 | - name: organization_name 48 | description: Organization name 49 | - name: account_name 50 | description: Account name 51 | - name: usage_date 52 | description: The date on which the usage occurred. 53 | - name: credits_used 54 | description: The total credits used by storage 55 | 56 | - name: int_warehouse_metering_history 57 | description: | 58 | Credits used by compute warehouses, aggregated to account and usage date. 59 | columns: 60 | - name: organization_name 61 | description: Organization name 62 | - name: account_name 63 | description: Account name 64 | - name: usage_date 65 | description: The date on which the usage occurred. 66 | - name: credits_used 67 | description: The total credits used by warehouses 68 | 69 | - name: int_cortex_usage_daily_history 70 | description: | 71 | Credits used by Cortex AI services, aggregated to account and usage date. 72 | columns: 73 | - name: organization_name 74 | description: Organization name 75 | - name: account_name 76 | description: Account name 77 | - name: usage_date 78 | description: The date on which the usage occurred. 
79 | - name: credits_used 80 | description: The total credits used by Cortex 81 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_automatic_clustering_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_automatic_clustering_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by all 13 | ) 14 | 15 | select * from usage_history 16 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_cortex_usage_daily_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_cortex_usage_daily_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by organization_name, account_name, usage_date 13 | ) 14 | 15 | select * from usage_history 16 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_materialized_view_refresh_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_materialized_view_refresh_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by all 13 | ) 14 | 15 | select * from usage_history 16 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_pipe_usage_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_pipe_usage_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by all 13 | ) 14 | 15 | select * from usage_history 16 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_storage_daily_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_storage_daily_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by all 13 | ) 14 | 15 | select * from usage_history 16 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_warehouse_metering_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_warehouse_metering_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by all 13 | ) 14 | 15 | select * from usage_history 16 | 
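
The intermediate cost-tracking models above all reduce their staging inputs to one row per organization, account, and usage date. As the comments in `dbt_project.yml` explain, the corresponding staging models are materialized incrementally (with full refreshes disabled) so that usage history is preserved beyond the one-year retention window of the `SNOWFLAKE` meta-database views. As a rough sketch of that pattern (not a copy of this repository's staging SQL; the `source()` reference, column names, and lookback window below are assumptions), an incremental staging model might look like:

```sql
-- Hypothetical illustration of the incremental staging pattern described in
-- dbt_project.yml; the source() reference and column names are assumptions.
{{
    config(
        materialized="incremental",
        unique_key=["organization_name", "account_name", "usage_date"],
    )
}}

with source as (
    select * from {{ source("snowflake_usage", "warehouse_metering_history") }}
),

aggregated as (
    -- Aggregate to one row per account and usage date so that incremental
    -- merges have a stable grain and cannot introduce duplicate rows.
    select
        organization_name,
        account_name,
        cast(start_time as date) as usage_date,
        sum(credits_used) as credits_used
    from source
    {% if is_incremental() %}
        -- On incremental runs, only rebuild the most recent days; older rows
        -- are already preserved in the target table.
        where cast(start_time as date) >= dateadd(day, -7, current_date)
    {% endif %}
    group by organization_name, account_name, cast(start_time as date)
)

select * from aggregated
```

Because the grain is one row per account and usage date, each incremental run merges cleanly on the unique key instead of appending duplicates, which is exactly the property the project comments call out.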
-------------------------------------------------------------------------------- /transform/models/intermediate/state_entities/_int_state_entities__models.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: int_state_entities__active 5 | description: | 6 | Active state entities from the Department of Finance list. 7 | Entities which are flagged as "DO NOT USE", "abolished", or 8 | are technical entities (e.g. "DOF USE ONLY") are filtered out. 9 | columns: 10 | - name: name 11 | description: Name of the entity 12 | - name: primary_code 13 | description: The most specific non-null entity code 14 | data_tests: 15 | - not_null 16 | - unique 17 | - name: agency_code 18 | description: Agency code for entity 19 | - name: subagency_code 20 | description: Subagency code for entity 21 | - name: L1 22 | description: Level beneath subagency 23 | - name: L2 24 | description: Level beneath L1 25 | - name: L3 26 | description: Level beneath L2 27 | - name: parenthetical 28 | description: | 29 | Any text extracted from a parenthetical statement in the original text 30 | - name: do_not_use 31 | description: Whether the entity features "DO NOT USE" in the description 32 | - name: abolished 33 | description: Whether the entity features "abolished" in the description 34 | - name: restricted_use 35 | description: | 36 | Whether the entity contains a directive like "DOF USE ONLY" or "SCO USE ONLY" 37 | in the description. 38 | - name: name_raw 39 | description: | 40 | The original name, as well as any parentheticals or directives for the entity. 41 | - name: name_alpha 42 | description: | 43 | The name with things like "Office of" moved to the end, 44 | suitable for alphabetization. 45 | - name: ucm_level 46 | description: | 47 | The level in the hierarchy of the Uniform Control Manual 48 | (agency, subagency, L1, L2, or L3) 49 | - name: int_state_entities__technical 50 | description: | 51 | Active technical entities from the Department of Finance list. 52 | columns: 53 | - name: name 54 | description: Name of the entity 55 | - name: primary_code 56 | description: The most specific non-null entity code 57 | data_tests: 58 | - not_null 59 | - name: agency_code 60 | description: Agency code for entity 61 | - name: subagency_code 62 | description: Subagency code for entity 63 | - name: L1 64 | description: Level beneath subagency 65 | - name: L2 66 | description: Level beneath L1 67 | - name: L3 68 | description: Level beneath L2 69 | - name: parenthetical 70 | description: | 71 | Any text extracted from a parenthetical statement in the original text 72 | - name: do_not_use 73 | description: Whether the entity features "DO NOT USE" in the description 74 | - name: abolished 75 | description: Whether the entity features "abolished" in the description 76 | - name: restricted_use 77 | description: | 78 | Whether the entity contains a directive like "DOF USE ONLY" or "SCO USE ONLY" 79 | in the description. 80 | - name: name_raw 81 | description: | 82 | The original name, as well as any parentheticals or directives for the entity. 83 | - name: name_alpha 84 | description: | 85 | The name with things like "Office of" moved to the end, 86 | suitable for alphabetization.
87 | - name: ucm_level 88 | description: | 89 | The level in the hierarchy of the Uniform Control Manual 90 | (agency, subagency, L1, L2, or L3) 91 | 92 | - name: int_state_entities__budgets 93 | description: Fiscal year budgets for state entities 94 | columns: 95 | - name: primary_code 96 | description: Four digit business unit code for entity. 97 | data_tests: 98 | # There are duplicates! 99 | # - unique 100 | - not_null 101 | - name: name 102 | description: Entity name 103 | - name: ucm_level 104 | description: | 105 | The level in the hierarchy of the Uniform Control Manual 106 | (agency, subagency, L1, L2, or L3) 107 | - name: name_alpha 108 | description: Variant of name for easier alphabetization 109 | - name: budget_year_dollars 110 | description: Budget for current fiscal year. 111 | -------------------------------------------------------------------------------- /transform/models/intermediate/state_entities/int_state_entities__active.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="view") }} 2 | 3 | with 4 | active_entities as ( 5 | select * 6 | from {{ ref("stg_department_of_finance__entities") }} 7 | where 8 | do_not_use = false 9 | and abolished = false 10 | and restricted_use is null 11 | and cast(primary_code as int) < 9000 12 | and not regexp_like(lower(name_raw), 'moved to|renum\.? to') 13 | ) 14 | 15 | select * 16 | from active_entities 17 | -------------------------------------------------------------------------------- /transform/models/intermediate/state_entities/int_state_entities__budgets.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="view") }} 2 | 3 | with 4 | active_entities as (select * from {{ ref("int_state_entities__active") }}), 5 | 6 | budgets as (select * from {{ ref("stg_ebudget__budgets") }}), 7 | 8 | active_agencies_and_departments as ( 9 | -- only select at department level or higher 10 | select * from active_entities where coalesce(l2, l3) is null 11 | ), 12 | 13 | active_entity_budgets as ( 14 | select 15 | active_agencies_and_departments.primary_code, 16 | active_agencies_and_departments.ucm_level, 17 | active_agencies_and_departments.name, 18 | active_agencies_and_departments.name_alpha, 19 | budgets.name as budget_name, 20 | budgets.budget_year_dollars 21 | from active_agencies_and_departments 22 | left join 23 | budgets 24 | on active_agencies_and_departments.primary_code = budgets.primary_code 25 | ) 26 | 27 | select * 28 | from active_entity_budgets 29 | order by primary_code asc 30 | -------------------------------------------------------------------------------- /transform/models/intermediate/state_entities/int_state_entities__technical.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="view") }} 2 | 3 | with 4 | technical_entities as ( 5 | select * 6 | from {{ ref("stg_department_of_finance__entities") }} 7 | where 8 | (do_not_use = false and abolished = false) 9 | and (restricted_use is not null or cast(primary_code as int) >= 9000) 10 | ) 11 | 12 | select * 13 | from technical_entities 14 | -------------------------------------------------------------------------------- /transform/models/marts/geo_reference/_geo_reference__models.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: building_footprints 5 | database: "{{ env_var('DBT_RAW_DB', 'RAW_DEV') }}" 6 | config: 7 |
schema: building_footprints 8 | tables: 9 | - name: us_building_footprints 10 | description: "[Microsoft US Building Footprints]\ 11 | (https://github.com/Microsoft/USBuildingFootprints) \ 12 | dataset for California." 13 | - name: global_ml_building_footprints 14 | description: "[Microsoft Global ML Building Footprints]\ 15 | (https://github.com/microsoft/GlobalMLBuildingFootprints) \ 16 | dataset for California. This contains some null geometries,\ 17 | as well as geometries that fall somewhat outside of California" 18 | 19 | - name: tiger_2022 20 | database: "{{ env_var('DBT_RAW_DB', 'RAW_DEV') }}" 21 | schema: tiger_2022 22 | tables: 23 | - name: blocks 24 | - name: places 25 | 26 | models: 27 | - name: geo_reference__us_building_footprints_with_tiger 28 | config: 29 | schema: building_footprints 30 | tags: building_footprints 31 | description: | 32 | This data table is a join of the TIGER data for blocks, tracts, counties, and 33 | places with the Microsoft US Building Footprints data for the state of CA. 34 | columns: 35 | - name: release 36 | description: The version of the data 37 | - name: capture_dates_range 38 | description: Each building footprint has a capture date tag from 2019-2020 39 | - name: county_fips 40 | description: 2020 Census county FIPS code 41 | - name: tract 42 | description: 2020 Census tract code 43 | - name: block 44 | description: 2020 Census tabulation block number 45 | - name: block_geoid 46 | description: > 47 | Census block identifier; a concatenation of 2020 Census state FIPS code, 2020 48 | Census county FIPS code, 2020 Census tract code, and 2020 Census block number 49 | - name: place_fips 50 | description: Current place FIPS code 51 | - name: place_ns 52 | description: Current place GNIS code 53 | - name: place_geoid 54 | description: > 55 | Place identifier; a concatenation of the current state 56 | FIPS code and place FIPS code 57 | - name: place_name 58 | description: > 59 | Current name and the translated legal/statistical 60 | area description for place 61 | - name: class_fips_code 62 | description: Current FIPS class code 63 | - name: class_fips 64 | description: Current FIPS class definition 65 | - name: geometry 66 | description: The footprint geometry 67 | - name: area_sqm 68 | description: The area of the footprint in square meters 69 | - name: geo_reference__global_ml_building_footprints_with_tiger 70 | config: 71 | schema: building_footprints 72 | tags: building_footprints 73 | description: | 74 | This data table is a join of the TIGER data for blocks, tracts, counties, and 75 | places with the Microsoft Global ML Building Footprints data for the state of CA. 
76 | columns: 77 | - name: height 78 | description: The height of the building (negative indicates unknown height) 79 | - name: county_fips 80 | description: 2020 Census county FIPS code 81 | - name: tract 82 | description: 2020 Census tract code 83 | - name: block 84 | description: 2020 Census tabulation block number 85 | - name: block_geoid 86 | description: > 87 | Census block identifier; a concatenation of 2020 Census state FIPS code, 2020 88 | Census county FIPS code, 2020 Census tract code, and 2020 Census block number 89 | - name: place_fips 90 | description: Current place FIPS code 91 | - name: place_ns 92 | description: Current place GNIS code 93 | - name: place_geoid 94 | description: > 95 | Place identifier; a concatenation of the current state 96 | FIPS code and place FIPS code 97 | - name: place_name 98 | description: > 99 | Current name and the translated legal/statistical 100 | area description for place 101 | - name: class_fips_code 102 | description: Current FIPS class code 103 | - name: class_fips 104 | description: Current FIPS class definition 105 | - name: geometry 106 | description: The footprint geometry 107 | - name: area_sqm 108 | description: The area of the footprint in square meters 109 | -------------------------------------------------------------------------------- /transform/models/marts/geo_reference/geo_reference__global_ml_building_footprints_with_tiger.sql: -------------------------------------------------------------------------------- 1 | with footprints as ( -- noqa: ST03 2 | select 3 | "height", 4 | "geometry" 5 | from {{ source('building_footprints', 'global_ml_building_footprints') }} 6 | ), 7 | 8 | blocks_source as ( 9 | select * 10 | from {{ source('tiger_2022', 'blocks') }} 11 | ), 12 | 13 | places_source as ( 14 | select * from {{ source('tiger_2022', 'places') }} 15 | ), 16 | 17 | blocks as ( -- noqa: ST03 18 | select 19 | "COUNTYFP20" as "county_fips", 20 | "TRACTCE20" as "tract", 21 | "BLOCKCE20" as "block", 22 | "GEOID20" as "block_geoid", 23 | "geometry" 24 | from blocks_source 25 | ), 26 | 27 | places as ( -- noqa: ST03 28 | select 29 | "PLACEFP" as "place_fips", 30 | "PLACENS" as "place_ns", 31 | "GEOID" as "place_geoid", 32 | "NAME" as "place_name", 33 | "CLASSFP" as "class_fips_code", 34 | {{ map_class_fips("CLASSFP") }} as "class_fips", 35 | "geometry" 36 | from places_source 37 | ), 38 | 39 | footprints_with_blocks as ( -- noqa: ST03 40 | {{ spatial_join_with_deduplication( 41 | "footprints", 42 | "blocks", 43 | ['"height"'], 44 | ['"county_fips"', '"tract"', '"block"', '"block_geoid"'], 45 | left_geom='"geometry"', 46 | right_geom='"geometry"', 47 | kind="inner", 48 | prefix="b", 49 | ) }} 50 | ), 51 | 52 | footprints_with_blocks_and_places as ( 53 | {{ spatial_join_with_deduplication( 54 | "footprints_with_blocks", 55 | "places", 56 | ['"height"', '"county_fips"', '"tract"', '"block"', '"block_geoid"'], 57 | ['"place_fips"', '"place_ns"', '"place_geoid"', '"place_name"', '"class_fips_code"', '"class_fips"'], 58 | left_geom='"geometry"', 59 | right_geom='"geometry"', 60 | kind="left", 61 | prefix="p", 62 | ) }} 63 | ), 64 | 65 | footprints_with_blocks_and_places_final as ( 66 | select 67 | *, 68 | st_area("geometry") as "area_sqm" 69 | from footprints_with_blocks_and_places 70 | ) 71 | 72 | select * from footprints_with_blocks_and_places_final 73 | -------------------------------------------------------------------------------- /transform/models/marts/geo_reference/geo_reference__us_building_footprints_with_tiger.sql: 
-------------------------------------------------------------------------------- 1 | with footprints as ( -- noqa: ST03 2 | select 3 | "release", 4 | "capture_dates_range", 5 | "geometry" 6 | from {{ source('building_footprints', 'us_building_footprints') }} 7 | ), 8 | 9 | blocks_source as ( 10 | select * 11 | from {{ source('tiger_2022', 'blocks') }} 12 | ), 13 | 14 | places_source as ( 15 | select * from {{ source('tiger_2022', 'places') }} 16 | ), 17 | 18 | blocks as ( -- noqa: ST03 19 | select 20 | "COUNTYFP20" as "county_fips", 21 | "TRACTCE20" as "tract", 22 | "BLOCKCE20" as "block", 23 | "GEOID20" as "block_geoid", 24 | "geometry" 25 | from blocks_source 26 | ), 27 | 28 | places as ( -- noqa: ST03 29 | select 30 | "PLACEFP" as "place_fips", 31 | "PLACENS" as "place_ns", 32 | "GEOID" as "place_geoid", 33 | "NAME" as "place_name", 34 | "CLASSFP" as "class_fips_code", 35 | {{ map_class_fips("CLASSFP") }} as "class_fips", 36 | "geometry" 37 | from places_source 38 | ), 39 | 40 | footprints_with_blocks as ( -- noqa: ST03 41 | {{ spatial_join_with_deduplication( 42 | "footprints", 43 | "blocks", 44 | ['"release"', '"capture_dates_range"'], 45 | ['"county_fips"', '"tract"', '"block"', '"block_geoid"'], 46 | left_geom='"geometry"', 47 | right_geom='"geometry"', 48 | kind="inner", 49 | prefix="b", 50 | ) }} 51 | ), 52 | 53 | footprints_with_blocks_and_places as ( 54 | {{ spatial_join_with_deduplication( 55 | "footprints_with_blocks", 56 | "places", 57 | ['"release"', '"capture_dates_range"', '"county_fips"', '"tract"', '"block"', '"block_geoid"'], 58 | ['"place_fips"', '"place_ns"', '"place_geoid"', '"place_name"', '"class_fips_code"', '"class_fips"'], 59 | left_geom='"geometry"', 60 | right_geom='"geometry"', 61 | kind="left", 62 | prefix="p", 63 | ) }} 64 | ), 65 | 66 | footprints_with_blocks_and_places_final as ( 67 | select 68 | *, 69 | st_area("geometry") as "area_sqm" 70 | from footprints_with_blocks_and_places 71 | ) 72 | 73 | select * from footprints_with_blocks_and_places_final 74 | -------------------------------------------------------------------------------- /transform/models/marts/snowflake_cost_tracking/_snowflake_cost_tracking.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: snowflake_costs_by_date 5 | description: | 6 | Snowflake costs by date for the following: 7 | 8 | * Automatic clustering 9 | * Materialized view refreshes 10 | * Pipe usage 11 | * Storage 12 | * Warehouse usage 13 | * Cortex (GenAI) usage 14 | 15 | Data are in long form, where `usage_type` indicates which 16 | type of usage is measured in credits. 17 | columns: 18 | - name: account_name 19 | description: Account name 20 | - name: usage_date 21 | description: Date on which the usage occurred 22 | - name: usage_type 23 | description: | 24 | One of the following usage types: 25 | 26 | * `'automatic clustering'` 27 | * `'materialized view'` 28 | * `'pipe'` 29 | * `'storage'` 30 | * `'warehouse'` 31 | * `'cortex'` 32 | - name: credits_used 33 | description: The credits used for the usage type and date. 
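
Because `snowflake_costs_by_date` is long-form, downstream consumers can aggregate or pivot it without any schema changes. As an illustrative ad-hoc query (not part of the project; the fully qualified relation name below is an assumption that depends on the deployment target), monthly credits by usage type could be computed as:

```sql
-- Illustrative ad-hoc rollup of the long-form cost mart; the database and
-- schema qualifiers below are assumed and vary by environment.
select
    account_name,
    usage_type,
    date_trunc('month', usage_date) as usage_month,
    sum(credits_used) as credits_used
from analytics_prd.snowflake_cost_tracking.snowflake_costs_by_date
group by account_name, usage_type, usage_month
order by usage_month, account_name, usage_type
```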
34 | -------------------------------------------------------------------------------- /transform/models/marts/snowflake_cost_tracking/snowflake_costs_by_date.sql: -------------------------------------------------------------------------------- 1 | /* 2 | TODO: this does not yet account for credits consumed by: 3 | 4 | * Query acceleration 5 | * Search optimization 6 | * Replication/failover groups 7 | */ 8 | 9 | with automatic_clustering_history as ( 10 | select 11 | account_name, 12 | usage_date, 13 | 'automatic clustering' as usage_type, 14 | credits_used 15 | from {{ ref('int_automatic_clustering_history') }} 16 | ), 17 | 18 | materialized_view_refresh_history as ( 19 | select 20 | account_name, 21 | usage_date, 22 | 'materialized view' as usage_type, 23 | credits_used 24 | from {{ ref('int_materialized_view_refresh_history') }} 25 | ), 26 | 27 | pipe_usage_history as ( 28 | select 29 | account_name, 30 | usage_date, 31 | 'pipe' as usage_type, 32 | credits_used 33 | from {{ ref('int_pipe_usage_history') }} 34 | ), 35 | 36 | storage_daily_history as ( 37 | select 38 | account_name, 39 | usage_date, 40 | 'storage' as usage_type, 41 | credits_used 42 | from {{ ref('int_storage_daily_history') }} 43 | ), 44 | 45 | warehouse_metering_history as ( 46 | select 47 | account_name, 48 | usage_date, 49 | 'warehouse' as usage_type, 50 | credits_used 51 | from {{ ref('int_warehouse_metering_history') }} 52 | ), 53 | 54 | cortex_usage_daily_history as ( 55 | select 56 | account_name, 57 | usage_date, 58 | 'cortex' as usage_type, 59 | credits_used 60 | from {{ ref('int_cortex_usage_daily_history') }} 61 | ), 62 | 63 | -- Combine the data in long form to allow for easy 64 | -- aggregations and visualizations. 65 | combined as ( 66 | select * from automatic_clustering_history 67 | union all 68 | select * from materialized_view_refresh_history 69 | union all 70 | select * from pipe_usage_history 71 | union all 72 | select * from storage_daily_history 73 | union all 74 | select * from warehouse_metering_history 75 | union all 76 | select * from cortex_usage_daily_history 77 | ) 78 | 79 | select * from combined 80 | -------------------------------------------------------------------------------- /transform/models/marts/state_entities/_state_entities__models.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dim_state_entities__agencies 5 | description: Agency-level state entities. 
6 | columns: 7 | - name: name 8 | description: The name of the state agency 9 | data_tests: 10 | - unique 11 | - not_null 12 | - name: agency_code 13 | description: The numeric code of the state agency 14 | data_tests: 15 | - unique 16 | - not_null 17 | -------------------------------------------------------------------------------- /transform/models/marts/state_entities/dim_state_entities__agencies.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="table") }} 2 | 3 | with 4 | agencies as ( 5 | select 6 | name, 7 | agency_code 8 | from {{ ref("int_state_entities__active") }} 9 | where subagency_code is null and l1 is null 10 | ) 11 | 12 | select * 13 | from agencies 14 | -------------------------------------------------------------------------------- /transform/models/overview.md: -------------------------------------------------------------------------------- 1 | {% docs __overview__ %} 2 | 3 | # CalData dbt Documentation 4 | 5 | Welcome to the CalData Data Services and Engineering `dbt` Snowflake docs. 6 | To go back to the top-level docs, follow [this link](../) 7 | 8 | ## Navigation 9 | 10 | You can use the `Project` and `Database` navigation tabs on the left side of the window to explore the models in your project. 11 | 12 | ### Project Tab 13 | 14 | The Project tab mirrors the directory structure of your dbt project. 15 | In this tab, you can see all of the models defined in your dbt project, as well as models imported from dbt packages. 16 | 17 | ### Database Tab 18 | 19 | The Database tab also exposes your models, but in a format that looks more like a database explorer. 20 | This view shows relations (tables and views) grouped into database schemas. 21 | Note that ephemeral models are not shown in this interface, as they do not exist in the database. 22 | 23 | ## Graph Exploration 24 | 25 | You can click the blue icon on the bottom-right corner of the page to view the lineage graph of your models. 26 | 27 | On model pages, you'll see the immediate parents and children of the model you're exploring. 28 | By clicking the Expand button at the top-right of this lineage pane, 29 | you'll be able to see all of the models that are used to build, or are built from, 30 | the model you're exploring. 31 | 32 | Once expanded, you'll be able to use the `--select` and `--exclude` model selection syntax to filter the models in the graph. 33 | For more information on model selection, check out the [dbt docs](https://docs.getdbt.com/reference/node-selection/syntax). 34 | 35 | Note that you can also right-click on models to interactively filter and explore the graph. 36 | 37 | {% enddocs %} 38 | -------------------------------------------------------------------------------- /transform/models/staging/department_of_finance/stg_department_of_finance__entities.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="table") }} 2 | 3 | {% set udf_schema = "PUBLIC" %} 4 | 5 | {% call set_sql_header(config) %} 6 | 7 | -- Warning! The SQL header is rendered separately from the rest of the template, 8 | -- so we redefine the udf_schema in this block: 9 | -- https://github.com/dbt-labs/dbt-core/issues/2793 10 | {% set udf_schema = "PUBLIC" %} 11 | 12 | create or replace temp function 13 | {{ udf_schema }}.reorder_name_for_alphabetization(name string) 14 | returns string 15 | language javascript 16 | as 17 | $$ 18 | // Replace fancy quotes with normal ones. 
19 | const name = NAME.replace("’", "'"); 20 | 21 | // Skip some exceptions 22 | const skip = ["Governor's Office"]; 23 | if (skip.includes(name)) { 24 | return name; 25 | } 26 | 27 | // Annoying exceptions 28 | if (name.includes("Milton Marks") && name.includes("Little Hoover")) { 29 | return "Little Hoover Commission"; 30 | } 31 | 32 | // Basic organizational types by which we don't want to organize. 33 | const patterns = [ 34 | "Office of the Secretary (?:for|of)?", 35 | "Commission (?:on|for)?", 36 | "Board of Governors (?:for|of)?", 37 | "Board (?:of|on|for)?", 38 | "Agency (?:on|for)?", 39 | "(?:Department|Dept\\.) of", 40 | "Commission (?:on|for)?", 41 | "Committee (?:on|for)?", 42 | "Bureau of", 43 | "Council on", 44 | "Policy Council on", 45 | "Institute of", 46 | "Office (?:for|of)?", 47 | "Secretary (?:for|of)?", 48 | "", // Empty pattern to catch the prefixes below. 49 | ].map( 50 | // Lots of entities also start with throat clearing like "California this" 51 | // or "State that", which we also want to skip. Some also include a definite 52 | // article after the organizational unit. 53 | (p) => 54 | "(?:California\\s+)?(?:Governor's\\s+)?(?:State\\s+|St\\.\\s+)?(?:Intergovernmental\\s+)?" + 55 | p + 56 | "(?:\\s+the)?" 57 | ); 58 | 59 | const all_patterns = `(${patterns.join("|")})`; 60 | const re = RegExp(`^${all_patterns}\\s*(.+)$`); // \s* because some of the above eat spaces. 61 | const match = name.match(re); 62 | // Empty prefixes are matched, so skip if we don't get a full match. 63 | if (match && match[1] && match[2]) { 64 | return `${match[2].trim()}, ${match[1].trim()}`; 65 | } else { 66 | return name; 67 | } 68 | $$ 69 | ; 70 | 71 | create or replace temp function {{ udf_schema }}.extract_name(name string) 72 | returns string 73 | language javascript 74 | as $$ 75 | const match = NAME.match(/^(.+?)(?:(?:\s*\(.*\)\s*|\s*[-–]+\s*[A-Z/ ]+)*)$/); 76 | if (match && match[1]) { 77 | return match[1]; 78 | } 79 | return NAME; 80 | $$ 81 | ; 82 | {%- endcall %} 83 | 84 | with 85 | base_entities as (select * from {{ source("state_entities", "base_entities") }}), 86 | 87 | invalid_subagencies as ( 88 | select * 89 | from base_entities 90 | where contains("name", 'no subagency') and contains("name", 'do not use') 91 | ), 92 | 93 | entities as ( 94 | select 95 | -- Extract the first portion of the entity as the name. 
The other 96 | -- two (optional) groups match parentheticals and things like 97 | -- "-- DO NOT USE" or " -- DOF USE ONLY" 98 | {{ udf_schema }}.extract_name("name") as name, 99 | coalesce(l3, l2, l1, b, a) as primary_code, 100 | a as agency_code, 101 | case 102 | when b in (select b from invalid_subagencies) then null else b 103 | end as subagency_code, 104 | l1, 105 | l2, 106 | l3, 107 | regexp_substr("name", '\\((.+?)\\)') as parenthetical, 108 | contains(lower("name"), 'do not use') as do_not_use, 109 | contains(lower("name"), 'abolished') as abolished, 110 | regexp_substr("name", '[A-Z/]+ USE ONLY') as restricted_use, 111 | "name" as name_raw 112 | from base_entities 113 | ), 114 | 115 | entities_with_extras as ( 116 | select 117 | *, 118 | {{ udf_schema }}.reorder_name_for_alphabetization(name) as name_alpha, 119 | case 120 | when coalesce(l3, l2, l1, subagency_code) is null 121 | then 'agency' 122 | when coalesce(l3, l2, l1) is null 123 | then 'subagency' 124 | when coalesce(l3, l2) is null 125 | then 'L1' 126 | when l3 is null 127 | then 'L2' 128 | else 'L3' 129 | end as ucm_level 130 | from entities 131 | ) 132 | 133 | select * 134 | from entities_with_extras 135 | -------------------------------------------------------------------------------- /transform/models/staging/department_of_finance/stg_ebudget__budgets.sql: -------------------------------------------------------------------------------- 1 | with 2 | agencies_and_departments as ( 3 | select * 4 | from {{ source('state_entities', 'ebudget_agency_and_department_budgets') }} 5 | ), 6 | 7 | ebudget_budgets as ( 8 | select 9 | "web_agency_cd" as primary_code, 10 | "legal_titl" as name, 11 | "all_budget_year_dols" as budget_year_dollars 12 | from agencies_and_departments 13 | ) 14 | 15 | select * 16 | from ebudget_budgets 17 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_automatic_clustering_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "DATABASE_NAME", 7 | "SCHEMA_NAME", 8 | "TABLE_NAME", 9 | "USAGE_DATE", 10 | ], 11 | ) 12 | }} 13 | 14 | WITH source AS ( 15 | SELECT 16 | num_rows_reclustered, 17 | account_locator, 18 | database_id, 19 | schema_name, 20 | database_name, 21 | table_id, 22 | schema_id, 23 | account_name, 24 | region, 25 | credits_used, 26 | organization_name, 27 | table_name, 28 | usage_date, 29 | num_bytes_reclustered 30 | FROM {{ source('organization_usage', 'automatic_clustering_history') }} 31 | ), 32 | 33 | automatic_clustering_history AS ( 34 | SELECT 35 | organization_name, 36 | account_name, 37 | database_name, 38 | schema_name, 39 | table_name, 40 | usage_date, 41 | sum(credits_used) AS credits_used, 42 | sum(num_rows_reclustered) AS num_rows_reclustered, 43 | sum(num_bytes_reclustered) AS num_bytes_reclustered 44 | FROM source 45 | GROUP BY ALL 46 | ) 47 | 48 | SELECT * 49 | FROM automatic_clustering_history 50 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_cortex_usage_daily_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "USAGE_DATE", 7 | ], 8 | ) 9 | }} 10 | 11 | -- The ORGANIZATION_USAGE schema does not 
provide a specific 12 | -- view on Cortex usage, so we need to get it from the overall 13 | -- metering daily history table. 14 | -- https://docs.snowflake.com/en/user-guide/snowflake-cortex/aisql#track-costs-for-ai-services 15 | WITH source AS ( 16 | SELECT 17 | credits_adjustment_cloud_services, 18 | region, 19 | credits_used, 20 | service_type, 21 | account_locator, 22 | usage_date, 23 | account_name, 24 | credits_billed, 25 | credits_used_cloud_services, 26 | organization_name, 27 | credits_used_compute 28 | FROM {{ source('organization_usage', 'metering_daily_history') }} 29 | WHERE service_type = 'AI_SERVICES' 30 | ), 31 | 32 | metering_daily_history AS ( 33 | SELECT 34 | organization_name, 35 | account_name, 36 | usage_date, 37 | sum(credits_used_compute) AS credits_used_compute, 38 | sum(credits_used_cloud_services) AS credits_used_cloud_services, 39 | sum(credits_adjustment_cloud_services) AS credits_adjustment_cloud_services, 40 | sum(credits_used) AS credits_used, 41 | sum(credits_billed) AS credits_billed 42 | FROM source 43 | GROUP BY organization_name, account_name, usage_date 44 | ) 45 | 46 | SELECT * 47 | FROM metering_daily_history 48 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_data_transfer_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "USAGE_DATE", 7 | "SOURCE_CLOUD", 8 | "SOURCE_REGION", 9 | "TARGET_CLOUD", 10 | "TARGET_REGION", 11 | ], 12 | ) 13 | }} 14 | 15 | WITH source AS ( 16 | SELECT 17 | organization_name, 18 | account_name, 19 | account_locator, 20 | region, 21 | usage_date, 22 | source_cloud, 23 | source_region, 24 | target_cloud, 25 | target_region, 26 | bytes_transferred, 27 | transfer_type 28 | FROM {{ source('organization_usage', 'data_transfer_history') }} 29 | ), 30 | 31 | data_transfer_history AS ( 32 | SELECT 33 | organization_name, 34 | account_name, 35 | usage_date, 36 | source_cloud, 37 | source_region, 38 | target_cloud, 39 | target_region, 40 | sum(bytes_transferred) AS bytes_transferred 41 | FROM source 42 | GROUP BY ALL 43 | ) 44 | 45 | SELECT * 46 | FROM data_transfer_history 47 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_database_storage_usage_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "DATABASE_NAME", 7 | "USAGE_DATE", 8 | ], 9 | ) 10 | }} 11 | 12 | WITH source AS ( 13 | SELECT 14 | region, 15 | average_hybrid_table_storage_bytes, 16 | organization_name, 17 | usage_date, 18 | database_id, 19 | database_name, 20 | account_name, 21 | average_database_bytes, 22 | account_locator, 23 | average_failsafe_bytes 24 | FROM {{ source('organization_usage', 'database_storage_usage_history') }} 25 | ), 26 | 27 | database_storage_usage_history AS ( 28 | SELECT 29 | organization_name, 30 | account_name, 31 | database_name, 32 | usage_date, 33 | AVG(average_hybrid_table_storage_bytes) AS average_hybrid_table_storage_bytes, 34 | AVG(average_database_bytes) AS average_database_bytes, 35 | AVG(average_failsafe_bytes) AS average_failsafe_bytes 36 | FROM source 37 | GROUP BY ALL 38 | ) 39 | 40 | SELECT * 41 | FROM database_storage_usage_history 42 | 
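A note on the pattern shared by these cost-tracking staging models: each is materialized as `incremental` with a composite `unique_key`, so dbt merges the re-aggregated rows into the existing table on every run, and as written each run reads the full `ORGANIZATION_USAGE` view. The sketch below is not code from this repository and the seven-day lookback is an assumed value; it only illustrates how an `is_incremental()` filter could be combined with the same model shape to limit how much history is re-read, relying on the merge key to overwrite any restated recent days.

```sql
-- Minimal sketch, assuming a seven-day lookback window; not part of this
-- project. The shape mirrors stg_database_storage_usage_history above.
{{ config(
    materialized="incremental",
    unique_key=[
        "ORGANIZATION_NAME",
        "ACCOUNT_NAME",
        "DATABASE_NAME",
        "USAGE_DATE",
    ],
) }}

select
    organization_name,
    account_name,
    database_name,
    usage_date,
    avg(average_database_bytes) as average_database_bytes
from {{ source('organization_usage', 'database_storage_usage_history') }}
{% if is_incremental() %}
    -- Re-read a window of recent days; the merge on unique_key deduplicates
    -- the overlap and picks up late-arriving restatements.
    where usage_date >= (select dateadd(day, -7, max(usage_date)) from {{ this }})
{% endif %}
group by all
```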
-------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_materialized_view_refresh_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "DATABASE_NAME", 7 | "SCHEMA_NAME", 8 | "TABLE_NAME", 9 | "USAGE_DATE", 10 | ], 11 | ) 12 | }} 13 | 14 | WITH source AS ( 15 | SELECT 16 | schema_name, 17 | credits_used, 18 | organization_name, 19 | database_id, 20 | schema_id, 21 | table_id, 22 | account_locator, 23 | account_name, 24 | region, 25 | database_name, 26 | table_name, 27 | usage_date 28 | FROM {{ source('organization_usage', 'materialized_view_refresh_history') }} 29 | ), 30 | 31 | materialized_view_refresh_history AS ( 32 | SELECT 33 | organization_name, 34 | account_name, 35 | database_name, 36 | schema_name, 37 | table_name, 38 | usage_date, 39 | sum(credits_used) AS credits_used 40 | FROM source 41 | GROUP BY ALL 42 | ) 43 | 44 | SELECT * 45 | FROM materialized_view_refresh_history 46 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_metering_daily_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "USAGE_DATE", 7 | ], 8 | ) 9 | }} 10 | 11 | WITH source AS ( 12 | SELECT 13 | credits_adjustment_cloud_services, 14 | region, 15 | credits_used, 16 | service_type, 17 | account_locator, 18 | usage_date, 19 | account_name, 20 | credits_billed, 21 | credits_used_cloud_services, 22 | organization_name, 23 | credits_used_compute 24 | FROM {{ source('organization_usage', 'metering_daily_history') }} 25 | ), 26 | 27 | metering_daily_history AS ( 28 | SELECT 29 | organization_name, 30 | account_name, 31 | usage_date, 32 | sum(credits_used_compute) AS credits_used_compute, 33 | sum(credits_used_cloud_services) AS credits_used_cloud_services, 34 | sum(credits_adjustment_cloud_services) AS credits_adjustment_cloud_services, 35 | sum(credits_used) AS credits_used, 36 | sum(credits_billed) AS credits_billed 37 | FROM source 38 | GROUP BY ALL 39 | ) 40 | 41 | SELECT * 42 | FROM metering_daily_history 43 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_pipe_usage_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "PIPE_NAME", 7 | "USAGE_DATE", 8 | ], 9 | ) 10 | }} 11 | 12 | WITH source AS ( 13 | SELECT 14 | region, 15 | organization_name, 16 | bytes_inserted, 17 | files_inserted, 18 | usage_date, 19 | account_locator, 20 | credits_used, 21 | account_name, 22 | pipe_id, 23 | pipe_name 24 | FROM {{ source('organization_usage', 'pipe_usage_history') }} 25 | ), 26 | 27 | pipe_usage_history AS ( 28 | SELECT 29 | organization_name, 30 | account_name, 31 | pipe_name, 32 | usage_date, 33 | sum(bytes_inserted) AS bytes_inserted, 34 | sum(files_inserted) AS files_inserted, 35 | sum(credits_used) AS credits_used 36 | FROM source 37 | GROUP BY ALL 38 | ) 39 | 40 | SELECT * 41 | FROM pipe_usage_history 42 | -------------------------------------------------------------------------------- 
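A brief aside on `group by all`, which appears throughout these staging models: it is Snowflake syntax that groups by every non-aggregated expression in the select list. For the pipe usage model above, the final aggregation is equivalent to spelling out the grouping columns explicitly:

```sql
-- Explicit equivalent of the `group by all` in stg_pipe_usage_history;
-- `source` refers to the CTE defined in that model.
select
    organization_name,
    account_name,
    pipe_name,
    usage_date,
    sum(bytes_inserted) as bytes_inserted,
    sum(files_inserted) as files_inserted,
    sum(credits_used) as credits_used
from source
group by organization_name, account_name, pipe_name, usage_date
```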
/transform/models/staging/snowflake_cost_tracking/stg_stage_storage_usage_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "USAGE_DATE", 7 | ], 8 | ) 9 | }} 10 | 11 | WITH source AS ( 12 | SELECT 13 | organization_name, 14 | account_locator, 15 | account_name, 16 | region, 17 | usage_date, 18 | average_stage_bytes 19 | FROM {{ source('organization_usage', 'stage_storage_usage_history') }} 20 | ), 21 | 22 | stage_storage_usage_history AS ( 23 | SELECT 24 | organization_name, 25 | account_name, 26 | usage_date, 27 | avg(average_stage_bytes) AS average_stage_bytes 28 | FROM source 29 | GROUP BY ALL 30 | ) 31 | 32 | SELECT * 33 | FROM stage_storage_usage_history 34 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_storage_daily_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "USAGE_DATE", 7 | ], 8 | ) 9 | }} 10 | 11 | WITH source AS ( 12 | SELECT 13 | organization_name, 14 | account_name, 15 | account_locator, 16 | region, 17 | usage_date, 18 | service_type, 19 | average_bytes, 20 | credits 21 | FROM {{ source('organization_usage', 'storage_daily_history') }} 22 | ), 23 | 24 | storage_daily_history AS ( 25 | SELECT 26 | organization_name, 27 | account_name, 28 | usage_date, 29 | avg(average_bytes) AS average_bytes, 30 | sum(credits) AS credits_used 31 | FROM source 32 | GROUP BY ALL 33 | ) 34 | 35 | SELECT * 36 | FROM storage_daily_history 37 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_warehouse_metering_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "WAREHOUSE_NAME", 7 | "USAGE_DATE", 8 | ], 9 | ) 10 | }} 11 | 12 | WITH source AS ( 13 | SELECT 14 | account_name, 15 | warehouse_id, 16 | credits_used, 17 | credits_used_compute, 18 | region, 19 | start_time, 20 | credits_used_cloud_services, 21 | warehouse_name, 22 | organization_name, 23 | service_type, 24 | account_locator, 25 | end_time 26 | FROM {{ source('organization_usage', 'warehouse_metering_history') }} 27 | ), 28 | 29 | warehouse_metering_history AS ( 30 | SELECT 31 | organization_name, 32 | account_name, 33 | warehouse_name, 34 | to_date(start_time) AS usage_date, 35 | sum(credits_used) AS credits_used, 36 | sum(credits_used_compute) AS credits_used_compute, 37 | sum(credits_used_cloud_services) AS credits_used_cloud_services 38 | FROM source 39 | GROUP BY ALL 40 | ) 41 | 42 | SELECT * 43 | FROM warehouse_metering_history 44 | -------------------------------------------------------------------------------- /transform/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.3.0 4 | sha1_hash: 226ae69cdfbc9367e2aa2c472b01f99dbce11de0 5 | -------------------------------------------------------------------------------- /transform/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.3.0 4 
| -------------------------------------------------------------------------------- /transform/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/transform/seeds/.gitkeep -------------------------------------------------------------------------------- /transform/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/transform/snapshots/.gitkeep -------------------------------------------------------------------------------- /transform/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/transform/tests/.gitkeep --------------------------------------------------------------------------------
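`packages.yml` pins `dbt-labs/dbt_utils` 1.3.0, and `package-lock.yml` records the resolved version and hash for reproducible installs. As a purely illustrative sketch (this model, its name, and the surrogate-key column are not part of the repository), a downstream model could call one of the package's macros against the long-form cost mart defined earlier:

```sql
-- Illustration only: add a surrogate key to the long-form cost data using a
-- macro from the pinned dbt_utils package. Nothing here exists in the repo.
select
    {{ dbt_utils.generate_surrogate_key(["account_name", "usage_date", "usage_type"]) }}
        as cost_row_key,
    account_name,
    usage_date,
    usage_type,
    credits_used
from {{ ref("snowflake_costs_by_date") }}
```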