├── .devcontainer ├── devcontainer.json └── profiles.yml ├── .github └── workflows │ ├── build-image.yml │ ├── docs.yml │ ├── pre-commit.yml │ ├── submit-job.yml │ └── terraform-validation.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .prettierrc.yml ├── .sqlfluff ├── .terraform-docs.yml ├── .tflint.hcl ├── .yamllint ├── LICENSE ├── README.md ├── airflow ├── .gitignore ├── aws_cli.py ├── dags │ ├── common │ │ ├── __init__.py │ │ ├── defaults.py │ │ ├── requests.py │ │ └── slack.py │ ├── geo_reference │ │ └── load_building_footprints.py │ └── state_entities │ │ ├── base_entities.py │ │ └── budgets.py ├── deploy.sh ├── plugins │ └── .gitkeep └── requirements │ └── requirements.txt ├── ci └── profiles.yml ├── docs ├── code │ ├── azdevops-project-management.md │ ├── code-review.md │ ├── codespaces.md │ ├── github-project-management.md │ ├── local-setup.md │ ├── terraform-local-setup.md │ └── writing-documentation.md ├── data │ └── footprints.md ├── dbt │ ├── dbt-performance.md │ └── dbt.md ├── images │ ├── codespace-secrets.png │ ├── column-pruning.png │ ├── columnar-storage.png │ ├── consumers.png │ ├── create-new-codespace.png │ ├── databases.png │ ├── dbt_model_timing.png │ ├── dbt_run_summary.png │ ├── developer.png │ ├── devops │ │ ├── workitemdetails.png │ │ ├── workitemexamples.png │ │ └── workitemtypes.png │ ├── environment_variables.png │ ├── github │ │ ├── commit-changes.png │ │ ├── comparing-changes.png │ │ ├── conflict-markers.png │ │ ├── conflict-sections.png │ │ ├── create-a-pull-request-github.png │ │ ├── issues-pr-actions.png │ │ ├── merging-in-vs-code.png │ │ ├── merging-in-vs-code2.png │ │ ├── open-a-pr.png │ │ ├── page4.png │ │ ├── pm-eg-metaissue.png │ │ ├── pm-subtasks.png │ │ ├── pr-description.png │ │ ├── request-for-review.png │ │ ├── suggest-a-change.png │ │ └── version-control.png │ ├── initial-query.png │ ├── launch-codespace.png │ ├── nightly.png │ ├── odi-circle_logomark-blue.png │ ├── odi-square_logomark-blue.svg │ └── partition-pruning.png ├── index.md ├── infra │ ├── architecture.md │ ├── cloud-infrastructure.md │ └── snowflake.md ├── learning │ ├── cloud-data-warehouses.md │ ├── dbt.md │ ├── git.md │ ├── glossary.md │ ├── naming-conventions.md │ └── security.md ├── setup │ ├── dbt-setup.md │ ├── fivetran-setup.md │ ├── project-teardown.md │ ├── repo-setup.md │ ├── sentinel-setup.md │ ├── snowflake-service-accounts.md │ ├── snowflake-setup.md │ └── terraform-project-setup.md ├── static │ ├── AccessFootprintsArcPro.pdf │ └── Download.MS.Global.Footprints.zip └── stylesheets │ └── extra.css ├── images ├── Dockerfile └── environment.yml ├── jobs ├── __init__.py ├── geo │ ├── __init__.py │ ├── data │ │ └── california.geojson │ ├── load_global_ml_building_footprints.py │ ├── load_us_building_footprints.py │ ├── tiger.py │ └── write_building_footprints.py ├── test.py └── utils │ ├── __init__.py │ └── snowflake.py ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml ├── terraform ├── .gitignore ├── aws │ ├── README.md │ ├── environments │ │ └── dev │ │ │ ├── .terraform.lock.hcl │ │ │ ├── dse-infra-dev.tfbackend │ │ │ ├── main.tf │ │ │ └── remote-state │ │ │ ├── .terraform.lock.hcl │ │ │ ├── main.tf │ │ │ └── terraform.tfvars │ └── modules │ │ └── infra │ │ ├── airflow.tf │ │ ├── batch.tf │ │ ├── ecr.tf │ │ ├── iam.tf │ │ ├── main.tf │ │ ├── network.tf │ │ ├── outputs.tf │ │ ├── s3.tf │ │ ├── secrets.tf │ │ └── variables.tf ├── s3-remote-state │ ├── README.md │ └── main.tf └── snowflake │ ├── environments │ ├── dev │ │ ├── .terraform.lock.hcl │ │ ├── 
dse-snowflake-dev.tfbackend │ │ ├── main.tf │ │ ├── remote-state │ │ │ ├── .terraform.lock.hcl │ │ │ ├── main.tf │ │ │ └── terraform.tfvars │ │ └── terraform.tfvars │ └── prd │ │ ├── .terraform.lock.hcl │ │ ├── dse-snowflake-prd.tfbackend │ │ ├── main.tf │ │ ├── remote-state │ │ ├── .terraform.lock.hcl │ │ ├── main.tf │ │ └── terraform.tfvars │ │ └── terraform.tfvars │ └── modules │ ├── database │ ├── main.tf │ ├── outputs.tf │ └── variables.tf │ ├── elt │ ├── databases.tf │ ├── main.tf │ ├── roles.tf │ ├── users.tf │ ├── variables.tf │ └── warehouses.tf │ └── warehouse │ ├── main.tf │ ├── outputs.tf │ └── variables.tf └── transform ├── .gitignore ├── .sqlfluff ├── .sqlfluffignore ├── README.md ├── analyses └── .gitkeep ├── dbt_project.yml ├── macros ├── .gitkeep ├── _macros.yml ├── get_custom_schema.sql ├── map_class_fp.sql └── spatial_join_with_deduplication.sql ├── models ├── intermediate │ ├── snowflake_cost_tracking │ │ ├── _snowflake_cost_tracking.yml │ │ ├── int_automatic_clustering_history.sql │ │ ├── int_cortex_usage_daily_history.sql │ │ ├── int_materialized_view_refresh_history.sql │ │ ├── int_pipe_usage_history.sql │ │ ├── int_storage_daily_history.sql │ │ └── int_warehouse_metering_history.sql │ └── state_entities │ │ ├── _int_state_entities__models.yml │ │ ├── int_state_entities__active.sql │ │ ├── int_state_entities__budgets.sql │ │ └── int_state_entities__technical.sql ├── marts │ ├── geo_reference │ │ ├── _geo_reference__models.yml │ │ ├── geo_reference__global_ml_building_footprints_with_tiger.sql │ │ └── geo_reference__us_building_footprints_with_tiger.sql │ ├── snowflake_cost_tracking │ │ ├── _snowflake_cost_tracking.yml │ │ └── snowflake_costs_by_date.sql │ └── state_entities │ │ ├── _state_entities__models.yml │ │ └── dim_state_entities__agencies.sql ├── overview.md └── staging │ ├── department_of_finance │ ├── _department_of_finance__models.yml │ ├── stg_department_of_finance__entities.sql │ └── stg_ebudget__budgets.sql │ └── snowflake_cost_tracking │ ├── _snowflake_cost_tracking__models.yml │ ├── stg_automatic_clustering_history.sql │ ├── stg_cortex_usage_daily_history.sql │ ├── stg_data_transfer_history.sql │ ├── stg_database_storage_usage_history.sql │ ├── stg_materialized_view_refresh_history.sql │ ├── stg_metering_daily_history.sql │ ├── stg_pipe_usage_history.sql │ ├── stg_stage_storage_usage_history.sql │ ├── stg_storage_daily_history.sql │ └── stg_warehouse_metering_history.sql ├── package-lock.yml ├── packages.yml ├── seeds └── .gitkeep ├── snapshots └── .gitkeep └── tests └── .gitkeep /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dbt", 3 | "image": "mcr.microsoft.com/devcontainers/python:1-3.10-bookworm", 4 | "features": { 5 | "ghcr.io/devcontainers-contrib/features/poetry:2": {}, 6 | "ghcr.io/devcontainers/features/terraform:1": {} 7 | }, 8 | "customizations": { 9 | "vscode": { 10 | "settings": { 11 | "dbt.queryLimit": 50, 12 | "git.allowNoVerifyCommit": true, 13 | "python.defaultInterpreterPath": ".venv/bin/python" 14 | }, 15 | "extensions": [ 16 | "innoverio.vscode-dbt-power-user" 17 | ] 18 | } 19 | }, 20 | "secrets": { 21 | "SNOWFLAKE_USER": { 22 | "description": "Your Snowflake Username" 23 | }, 24 | "SNOWFLAKE_PASSWORD": { 25 | "description": "Your Snowflake Password" 26 | }, 27 | "DBT_SCHEMA": { 28 | "description": "The dev schema into which to build your dbt models (e.g. 
DBT_)" 29 | } 30 | }, 31 | "postCreateCommand": "poetry config virtualenvs.in-project true && poetry install && poetry run pre-commit install && mkdir -p ~/.dbt && cp .devcontainer/profiles.yml ~/.dbt" 32 | } 33 | -------------------------------------------------------------------------------- /.devcontainer/profiles.yml: -------------------------------------------------------------------------------- 1 | dse_snowflake: 2 | target: snowflake_dev 3 | outputs: 4 | snowflake_dev: 5 | type: snowflake 6 | account: heb41095 7 | authenticator: username_password_mfa 8 | user: "{{ env_var('SNOWFLAKE_USER') }}" 9 | password: "{{ env_var('SNOWFLAKE_PASSWORD') }}" 10 | role: TRANSFORMER_DEV 11 | warehouse: TRANSFORMING_XS_DEV 12 | database: TRANSFORM_DEV 13 | schema: "{{ env_var('DBT_SCHEMA') }}" 14 | threads: 4 15 | -------------------------------------------------------------------------------- /.github/workflows/build-image.yml: -------------------------------------------------------------------------------- 1 | name: Build Image 2 | 3 | on: push 4 | 5 | jobs: 6 | build-image: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Check out repository 10 | uses: actions/checkout@v3 11 | 12 | - name: Set up Docker Buildx 13 | uses: docker/setup-buildx-action@v2 14 | 15 | - name: Configure AWS Credentials 16 | uses: aws-actions/configure-aws-credentials@v2 17 | with: 18 | # TODO: use OIDC for auth: 19 | # https://github.com/aws-actions/configure-aws-credentials#assuming-a-role 20 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 21 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 22 | aws-region: us-west-2 23 | 24 | - name: Login to Amazon ECR 25 | id: login-ecr 26 | uses: aws-actions/amazon-ecr-login@v1 27 | 28 | - name: Build, tag, and push docker image to Amazon ECR 29 | uses: docker/build-push-action@v4 30 | env: 31 | REGISTRY: ${{ steps.login-ecr.outputs.registry }} 32 | REPOSITORY: dse-infra-dev-us-west-2-default 33 | IMAGE_TAG: ${{ github.ref == 'refs/heads/main' && 'latest' || 'test' }} 34 | with: 35 | push: true 36 | context: "." 
37 | file: "./images/Dockerfile" 38 | tags: ${{ env.REGISTRY }}/${{ env.REPOSITORY }}:${{ env.IMAGE_TAG }} 39 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | pull_request: 4 | push: 5 | branches: [main] 6 | 7 | permissions: 8 | contents: write 9 | 10 | env: 11 | DBT_PROFILES_DIR: ci 12 | SNOWFLAKE_PRIVATE_KEY: ${{ SECRETS.SNOWFLAKE_PRIVATE_KEY_DEV }} 13 | SNOWFLAKE_USER: GITHUB_ACTIONS_SVC_USER_DEV 14 | 15 | jobs: 16 | build-docs: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | - uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.10" 23 | - uses: snok/install-poetry@v1 24 | with: 25 | virtualenvs-path: .venv 26 | - name: Load cached venv 27 | id: cached-poetry-dependencies 28 | uses: actions/cache@v4 29 | with: 30 | path: .venv 31 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 32 | - name: Install dependencies 33 | run: poetry install --no-interaction --no-root 34 | - name: Build dbt docs 35 | run: | 36 | # Generate snowflake dbt docs 37 | poetry run dbt deps --project-dir=transform 38 | poetry run dbt docs generate --project-dir=transform 39 | cp -r transform/target docs/dbt_docs_snowflake 40 | - name: Deploy docs to GitHub Pages 41 | if: github.ref == 'refs/heads/main' 42 | run: poetry run mkdocs gh-deploy --force 43 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | env: 10 | DBT_PROFILES_DIR: ci 11 | SNOWFLAKE_PRIVATE_KEY: ${{ SECRETS.SNOWFLAKE_PRIVATE_KEY_DEV }} 12 | SNOWFLAKE_USER: GITHUB_ACTIONS_SVC_USER_DEV 13 | 14 | jobs: 15 | pre-commit: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.10" 22 | - uses: snok/install-poetry@v1 23 | with: 24 | virtualenvs-path: .venv 25 | - name: Load cached venv 26 | id: cached-poetry-dependencies 27 | uses: actions/cache@v4 28 | with: 29 | path: .venv 30 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 31 | - name: Install dependencies 32 | run: poetry install --no-interaction --no-root 33 | - name: Install dbt deps 34 | run: poetry run dbt deps --project-dir transform 35 | - uses: pre-commit/action@v3.0.0 36 | -------------------------------------------------------------------------------- /.github/workflows/submit-job.yml: -------------------------------------------------------------------------------- 1 | name: submit 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "23 12 * * *" 7 | 8 | jobs: 9 | submit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out repository 13 | uses: actions/checkout@v3 14 | 15 | - name: Configure AWS Credentials 16 | uses: aws-actions/configure-aws-credentials@v2 17 | with: 18 | # TODO: use OIDC for auth: 19 | # https://github.com/aws-actions/configure-aws-credentials#assuming-a-role 20 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 21 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 22 | aws-region: us-west-2 23 | - name: Submit batch job 24 | run: | 25 | aws batch submit-job \ 26 | --job-name test \ 
27 | --job-queue dse-infra-dev-us-west-2-default \ 28 | --job-definition dse-infra-dev-us-west-2-latest \ 29 | --container-overrides '{ 30 | "resourceRequirements": 31 | [{"value": "2", "type": "VCPU"}, {"value": "4096", "type": "MEMORY"}] 32 | }' 33 | -------------------------------------------------------------------------------- /.github/workflows/terraform-validation.yml: -------------------------------------------------------------------------------- 1 | name: terraform-validation 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | terraform-validation: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - name: Setup terraform 15 | uses: hashicorp/setup-terraform@v2 16 | with: 17 | terraform_version: v1.4.0 18 | - name: Install tflint 19 | run: | 20 | curl -s https://raw.githubusercontent.com/terraform-linters/\ 21 | tflint/master/install_linux.sh | bash 22 | 23 | - name: Run terraform fmt 24 | run: | 25 | terraform fmt 26 | - name: Run terraform validate 27 | run: | 28 | terraform validate 29 | - name: Run terraform tflint 30 | run: | 31 | tflint --chdir=terraform/ --recursive 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | docs/dbt_docs* 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | .DS_Store 165 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: check-yaml 6 | args: 7 | - --unsafe 8 | - id: check-json 9 | - id: end-of-file-fixer 10 | - id: trailing-whitespace 11 | - id: check-merge-conflict 12 | - id: detect-aws-credentials 13 | args: [--allow-missing-credentials] 14 | - repo: https://github.com/charliermarsh/ruff-pre-commit 15 | rev: v0.1.6 16 | hooks: 17 | - id: ruff 18 | args: [--show-source, --fix] 19 | - id: ruff-format 20 | - repo: https://github.com/codespell-project/codespell 21 | rev: v2.2.4 22 | hooks: 23 | - id: codespell 24 | types_or: [rst, markdown] 25 | files: docs 26 | - repo: https://github.com/pre-commit/mirrors-mypy 27 | rev: v1.1.1 28 | hooks: 29 | - id: mypy 30 | args: [--warn-unused-configs] 31 | additional_dependencies: 32 | # Type stubs 33 | - pandas-stubs==v1.5.3.230321 34 | - types-requests 35 | - numpy 36 | - repo: https://github.com/pre-commit/mirrors-prettier 37 | rev: v2.7.1 38 | hooks: 39 | - id: prettier 40 | types: [yaml] 41 | - repo: https://github.com/adrienverge/yamllint.git 42 | rev: v1.28.0 43 | hooks: 44 | - id: yamllint 45 | args: [] 46 | # Note: for SQLFluff we don't use the default pre-commit hook because 47 | # the pre-commit managed python environment can be difficult to install, 48 | # especially due to issues with pyarrow being brought in by Snowflake. 49 | # This keep things more predictable by using the poetry.lock environment. 
50 | - repo: local 51 | hooks: 52 | - id: sqlfluff 53 | name: sqlfluff 54 | language: system 55 | description: "Lints sql files with `SQLFluff`" 56 | types: [sql] 57 | require_serial: true 58 | entry: poetry run sqlfluff fix --show-lint-violations --nocolor --disable-progress-bar 59 | pass_filenames: true 60 | -------------------------------------------------------------------------------- /.prettierrc.yml: -------------------------------------------------------------------------------- 1 | endOfLine: auto 2 | proseWrap: "preserve" 3 | -------------------------------------------------------------------------------- /.sqlfluff: -------------------------------------------------------------------------------- 1 | [sqlfluff] 2 | # For some reason this can only be set in the root directory, cf 3 | # https://docs.sqlfluff.com/en/stable/configuration.html#nesting. 4 | # Other config parameters are set in the dbt project directory, as 5 | # that's where dbt cloud looks for them. 6 | templater = dbt 7 | 8 | [sqlfluff:templater:dbt] 9 | project_dir = ./transform 10 | -------------------------------------------------------------------------------- /.terraform-docs.yml: -------------------------------------------------------------------------------- 1 | formatter: markdown table 2 | recursive: 3 | enabled: false 4 | output: 5 | file: README.md 6 | mode: inject 7 | sort: 8 | enabled: true 9 | by: name 10 | -------------------------------------------------------------------------------- /.tflint.hcl: -------------------------------------------------------------------------------- 1 | plugin "aws" { 2 | enabled = true 3 | version = "0.22.1" 4 | source = "github.com/terraform-linters/tflint-ruleset-aws" 5 | } 6 | -------------------------------------------------------------------------------- /.yamllint: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | rules: 4 | document-start: disable 5 | line-length: {max: 120} 6 | quoted-strings: 7 | quote-type: double 8 | required: false 9 | truthy: 10 | allowed-values: ["true", "false"] 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Office of Data & Innovation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CalData Data Services and Engineering Infrastructure 2 | 3 | ![deploy](https://github.com/cagov/data-infrastructure/actions/workflows/deploy.yml/badge.svg?branch=main) 4 | ![docs](https://github.com/cagov/data-infrastructure/actions/workflows/docs.yml/badge.svg?branch=main) 5 | 6 | Documentation for this project can be found [here](https://cagov.github.io/data-infrastructure/). 7 | -------------------------------------------------------------------------------- /airflow/.gitignore: -------------------------------------------------------------------------------- 1 | aws-mwaa-local-runner 2 | -------------------------------------------------------------------------------- /airflow/aws_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python script to run airflow CLI commands in a MWAA environment. 3 | 4 | Adapted from sample code here: 5 | https://docs.aws.amazon.com/mwaa/latest/userguide/airflow-cli-command-reference.html#airflow-cli-command-examples 6 | """ 7 | import base64 8 | import json 9 | import sys 10 | 11 | import boto3 12 | import requests 13 | 14 | mwaa_env_name = "dse-infra-dev-us-west-2-mwaa-environment" 15 | 16 | client = boto3.client("mwaa") 17 | 18 | mwaa_cli_token = client.create_cli_token(Name=mwaa_env_name) 19 | 20 | mwaa_auth_token = "Bearer " + mwaa_cli_token["CliToken"] 21 | mwaa_webserver_hostname = f"https://{mwaa_cli_token['WebServerHostname']}/aws_mwaa/cli" 22 | raw_data = " ".join(sys.argv[1:]) 23 | 24 | mwaa_response = requests.post( 25 | mwaa_webserver_hostname, 26 | headers={"Authorization": mwaa_auth_token, "Content-Type": "text/plain"}, 27 | data=raw_data, 28 | ) 29 | 30 | print(mwaa_response.status_code) 31 | try: 32 | mwaa_std_err_message = base64.b64decode(mwaa_response.json()["stderr"]).decode( 33 | "utf8" 34 | ) 35 | mwaa_std_out_message = base64.b64decode(mwaa_response.json()["stdout"]).decode( 36 | "utf8" 37 | ) 38 | print(mwaa_std_err_message) 39 | print(mwaa_std_out_message) 40 | except json.decoder.JSONDecodeError: 41 | print(mwaa_response.text) 42 | -------------------------------------------------------------------------------- /airflow/dags/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Package for shared utility functions and classes 2 | -------------------------------------------------------------------------------- /airflow/dags/common/defaults.py: -------------------------------------------------------------------------------- 1 | """Shared default arguments for DAGs.""" 2 | from __future__ import annotations 3 | 4 | from datetime import timedelta 5 | from typing import Any 6 | 7 | from common.slack import post_to_slack_on_failure 8 | 9 | DEFAULT_ARGS: dict[str, Any] = { 10 | "owner": "CalData", 11 | "depends_on_past": False, 12 | "email": ["odi-caldata-dse@innovation.ca.gov"], 13 | "email_on_failure": False, 14 | "email_on_retry": False, 15 | "retries": 2, 16 | "retry_delay": timedelta(minutes=5), 17 | "on_failure_callback": post_to_slack_on_failure, 18 | } 19 | -------------------------------------------------------------------------------- /airflow/dags/common/requests.py: -------------------------------------------------------------------------------- 1 | """Utilities for making HTTP requests.""" 2 | 3 | import backoff 4 | import requests 5 | 6 | 7 
| @backoff.on_exception( 8 | backoff.expo, 9 | requests.exceptions.RequestException, 10 | max_time=30, 11 | max_tries=4, 12 | ) 13 | def get(url): 14 | return requests.get(url) 15 | -------------------------------------------------------------------------------- /airflow/dags/common/slack.py: -------------------------------------------------------------------------------- 1 | from airflow.providers.slack.hooks.slack_webhook import SlackWebhookHook 2 | 3 | 4 | def post_to_slack_on_failure(context): 5 | hook = SlackWebhookHook( 6 | slack_webhook_conn_id="caldata-dataservices-bot-notifications" 7 | ) 8 | msg = f""" 9 | :x: Task Failed. 10 | *Task*: {context.get('task_instance').task_id} 11 | *Dag*: {context.get('task_instance').dag_id} 12 | *Execution Time*: {context.get('execution_date')} 13 | <{context.get('task_instance').log_url}|*Logs*> 14 | """ 15 | hook.send_text(msg) 16 | -------------------------------------------------------------------------------- /airflow/dags/geo_reference/load_building_footprints.py: -------------------------------------------------------------------------------- 1 | """Load building footprints to Snowflake.""" 2 | from __future__ import annotations 3 | 4 | import os 5 | from datetime import datetime 6 | 7 | from common.defaults import DEFAULT_ARGS 8 | 9 | from airflow.decorators import dag 10 | from airflow.providers.amazon.aws.operators.batch import BatchOperator 11 | from airflow.providers.amazon.aws.sensors.batch import BatchSensor 12 | from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator 13 | 14 | 15 | def _construct_batch_args(name: str, command: list[str]) -> dict: 16 | return { 17 | "task_id": name, 18 | "job_name": name, 19 | "job_queue": os.environ["AIRFLOW__CUSTOM__DEFAULT_JOB_QUEUE"], 20 | "job_definition": os.environ["AIRFLOW__CUSTOM__DEFAULT_JOB_DEFINITION"], 21 | "overrides": { 22 | "command": command, 23 | "resourceRequirements": [ 24 | {"type": "VCPU", "value": "8"}, 25 | {"type": "MEMORY", "value": "32768"}, 26 | ], 27 | }, 28 | "region_name": "us-west-2", # TODO: can we make this unnecessary? 29 | } 30 | 31 | 32 | @dag( 33 | description="Test DAG", 34 | start_date=datetime(2023, 5, 23), 35 | schedule_interval="@monthly", 36 | default_args=DEFAULT_ARGS, 37 | catchup=False, 38 | ) 39 | def building_footprints_dag(): 40 | """DAG for loading MS Building footprints dataset.""" 41 | load_us_footprints = BatchOperator( 42 | **_construct_batch_args( 43 | name="load_us_building_footprints", 44 | command=["python", "-m", "jobs.geo.load_us_building_footprints"], 45 | ) 46 | ) 47 | wait_for_us_footprints_load = BatchSensor( 48 | task_id="wait_for_us_footprints_load", 49 | job_id=load_us_footprints.output, 50 | region_name="us-west-2", # TODO: can we make this unnecessary? 51 | ) 52 | 53 | load_global_ml_footprints = BatchOperator( 54 | **_construct_batch_args( 55 | name="load_global_ml_building_footprints", 56 | command=["python", "-m", "jobs.geo.load_global_ml_building_footprints"], 57 | ) 58 | ) 59 | wait_for_global_ml_footprints_load = BatchSensor( 60 | task_id="wait_for_global_ml_footprints_load", 61 | job_id=load_global_ml_footprints.output, 62 | region_name="us-west-2", # TODO: can we make this unnecessary? 
63 | ) 64 | 65 | run_dbt_cloud_job = DbtCloudRunJobOperator( 66 | job_id=14, 67 | task_id="run_dbt_cloud_job", 68 | dbt_cloud_conn_id="dbt_cloud_default", 69 | wait_for_termination=True, 70 | timeout=1800, 71 | ) 72 | 73 | run_dbt_cloud_job.set_upstream(wait_for_us_footprints_load) 74 | run_dbt_cloud_job.set_upstream(wait_for_global_ml_footprints_load) 75 | 76 | unload_us_footprints = BatchOperator( 77 | **_construct_batch_args( 78 | name="unload_us_building_footprints", 79 | command=["python", "-m", "jobs.geo.write_building_footprints", "us"], 80 | ) 81 | ) 82 | _ = BatchSensor( 83 | task_id="wait_for_us_footprints_unload", 84 | job_id=unload_us_footprints.output, 85 | region_name="us-west-2", # TODO: can we make this unnecessary? 86 | ) 87 | 88 | unload_us_footprints.set_upstream(run_dbt_cloud_job) 89 | 90 | unload_global_ml_footprints = BatchOperator( 91 | **_construct_batch_args( 92 | name="unload_global_ml_building_footprints", 93 | command=["python", "-m", "jobs.geo.write_building_footprints", "global_ml"], 94 | ) 95 | ) 96 | _ = BatchSensor( 97 | task_id="wait_for_global_ml_footprints_unload", 98 | job_id=unload_global_ml_footprints.output, 99 | region_name="us-west-2", # TODO: can we make this unnecessary? 100 | ) 101 | 102 | unload_global_ml_footprints.set_upstream(run_dbt_cloud_job) 103 | 104 | 105 | run = building_footprints_dag() 106 | -------------------------------------------------------------------------------- /airflow/dags/state_entities/base_entities.py: -------------------------------------------------------------------------------- 1 | """Load state entities list from department of finance.""" 2 | from __future__ import annotations 3 | 4 | import io 5 | import re 6 | from datetime import datetime 7 | 8 | import pandas 9 | import requests 10 | from common.defaults import DEFAULT_ARGS 11 | from snowflake.connector.pandas_tools import write_pandas 12 | 13 | from airflow.decorators import dag, task 14 | from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook 15 | 16 | GBQ_DATASET = "state_entities" 17 | LEVEL_LABELS = ["A", "B", "1", "2", "3"] 18 | DATA_URL = ( 19 | "https://dof.ca.gov/wp-content/uploads/sites/352/Accounting/" 20 | "Policies_and_Procedures/Uniform_Codes_Manual/3orgstruc.pdf" 21 | ) 22 | 23 | 24 | def clean_name(name: str) -> str: 25 | """Strip leading/trailing whitespace and replace repeated spaces with single spaces.""" 26 | return re.sub(" {2,}", " ", name.strip()) 27 | 28 | 29 | @task 30 | def load_data() -> None: 31 | """### Load Department of Finance State Entities data.""" 32 | import pdfplumber 33 | 34 | hook = SnowflakeHook(snowflake_conn_id="raw") 35 | conn = hook.get_conn() 36 | 37 | # Regexes matching frontmatter and other lines we should skip 38 | skip = [ 39 | # Just white space 40 | r"^\s*$", 41 | # Header material 42 | r"REVISED(\s+)(\w+)(\s+)(\d+)", 43 | r"(\s*)DEPARTMENT(\s+)OF(\s+)FINANCE(\s*)", 44 | r"(\s*)UNIFORM(\s+)CODES(\s+)MANUAL(\s*)", 45 | r"(\s*)ORGANIZATION(\s+)CODES(\s*)", 46 | r"(\s*)BY(\s+)STRUCTURE(\s*)", 47 | # Column headers 48 | r"(\s*)A(_+)(\s+)B(_+)(\s+)1(_+)(\s*)", 49 | # Page number 50 | r"^(\s*)(\d+)(\s*)$", 51 | ] 52 | 53 | skip_re = re.compile("|".join(skip), flags=re.IGNORECASE) 54 | entity_re = re.compile(r"^( *)(\d+)\s+(.+)$") 55 | 56 | r = requests.get(DATA_URL) 57 | f = io.BytesIO(r.content) 58 | pdf = pdfplumber.open(f) # type: ignore 59 | 60 | levels: list[str | None] = [ 61 | None, 62 | ] * len(LEVEL_LABELS) 63 | indent = None 64 | ts = 5 65 | entities: list[tuple[str | None, ...]] = [] 66 | 
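    # Walk the PDF page by page. Entity rows look roughly like
    # "   1110  Department of Something": `entity_re` captures the leading
    # whitespace, the numeric org code, and the name, while `skip_re` filters
    # out headers, column labels, and page numbers. The amount of leading
    # whitespace encodes each entity's depth in the hierarchy (about `ts`
    # extra spaces per level), which is used to index into `levels`.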
67 | for page in pdf.pages: 68 | lines = page.extract_text(layout=True).split("\n") 69 | print(page) 70 | for line in lines: 71 | if skip_re.search(line): 72 | continue 73 | 74 | match = entity_re.match(line) 75 | if match is None: 76 | print( 77 | f'Unable to parse line "{clean_name(line)}", assigning to previous name' 78 | ) 79 | revised = list(entities[-1]) 80 | revised[-1] = revised[5] + " " + clean_name(line) # type: ignore 81 | entities[-1] = tuple(revised) 82 | continue 83 | 84 | # Get the raw matches 85 | spaces, code, name = match.groups() 86 | 87 | # Set the top-level indentation 88 | if indent is None: 89 | indent = len(spaces) 90 | 91 | # Strip excess whitespace from the name 92 | name = clean_name(name) 93 | 94 | # Get the level number from the whitespace 😬 95 | level_n = (len(spaces) - indent) // ts 96 | assert level_n <= len(LEVEL_LABELS) - 1 97 | 98 | # Fill the levels, null out everything after the current level 99 | levels[level_n] = code 100 | levels[level_n + 1 :] = [None] * (len(LEVEL_LABELS) - level_n - 1) 101 | 102 | entities.append((*levels, name)) 103 | 104 | df = ( 105 | pandas.DataFrame.from_records(entities, columns=[*LEVEL_LABELS, "name"]) 106 | .astype("string[python]") # type: ignore 107 | .rename(columns={"1": "L1", "2": "L2", "3": "L3"}) 108 | ) 109 | 110 | DB = conn.database 111 | SCHEMA = "STATE_ENTITIES" 112 | conn.cursor().execute(f"CREATE SCHEMA IF NOT EXISTS {DB}.{SCHEMA}") 113 | 114 | write_pandas( 115 | conn, 116 | df, 117 | database=DB, 118 | schema=SCHEMA, 119 | table_name="BASE_ENTITIES", 120 | auto_create_table=True, 121 | overwrite=True, 122 | ) 123 | 124 | 125 | @dag( 126 | description="Load department of finance state entities list", 127 | start_date=datetime(2022, 12, 19), 128 | schedule_interval="@monthly", 129 | default_args=DEFAULT_ARGS, 130 | catchup=False, 131 | ) 132 | def load_department_of_finance_state_entities(): 133 | load_data() 134 | 135 | 136 | run = load_department_of_finance_state_entities() 137 | -------------------------------------------------------------------------------- /airflow/dags/state_entities/budgets.py: -------------------------------------------------------------------------------- 1 | """Load state entity budgets from ebudget site.""" 2 | from __future__ import annotations 3 | 4 | import re 5 | from datetime import datetime 6 | 7 | import pandas 8 | from common.defaults import DEFAULT_ARGS 9 | from common.requests import get 10 | from snowflake.connector.pandas_tools import write_pandas 11 | 12 | from airflow.decorators import dag, task 13 | from airflow.providers.snowflake.hooks.snowflake import SnowflakeHook 14 | 15 | PREFIX = "https://ebudget.ca.gov/budget/publication/admin" 16 | 17 | 18 | def camel_to_snake(s: str) -> str: 19 | """ 20 | Convert a camel-cased name to a snake-cased one. 21 | 22 | Snake-cased names are more appropriate for case-insensitive systems like 23 | data warehouse backends. 24 | """ 25 | return re.sub(r"(? 
2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /docs/images/partition-pruning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/docs/images/partition-pruning.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # CalData Data Services and Engineering Infrastructure 2 | 3 | This is the technical documentation for CalData's 4 | Data Services and Engineering (DSE) projects. 5 | It consists of processes, conventions, instructions, and architecture diagrams. 6 | -------------------------------------------------------------------------------- /docs/infra/cloud-infrastructure.md: -------------------------------------------------------------------------------- 1 | # Cloud infrastructure 2 | 3 | The DSE team [uses Terraform](../code/terraform-local-setup.md) to manage cloud infrastructure. 4 | Our stack includes: 5 | 6 | * An [AWS Batch](https://aws.amazon.com/batch/) environment for running arbitrary containerized jobs 7 | * A [Managed Workflows on Apache Airflow](https://aws.amazon.com/managed-workflows-for-apache-airflow/) environment for orchestrating jobs 8 | * A VPC and subnets for the above 9 | * An ECR repository for hosting Docker images storing code and libraries for jobs 10 | * A bot user for running AWS operations in GitHub Actions 11 | * An S3 scratch bucket 12 | 13 | ## Architecture 14 | 15 | ```mermaid 16 | flowchart TD 17 | subgraph AWS 18 | J[GitHub CD\nbot user] 19 | G[Artifact in S3] 20 | subgraph VPC 21 | subgraph Managed Airflow 22 | K1[Scheduler] 23 | K2[Worker] 24 | K3[Webserver] 25 | end 26 | F[AWS Batch Job\n on Fargate] 27 | end 28 | E[AWS ECR Docker\nRepository] 29 | end 30 | subgraph GitHub 31 | A[Code Repository] 32 | end 33 | E --> F 34 | A -- Code quality check\n GitHub action --> A 35 | A -- Job submission\nvia GitHub Action --> F 36 | A -- Docker build \nGitHub Action --> E 37 | A --> H[CalData\nadministrative\nuser] 38 | H -- Terraform -----> AWS 39 | K2 -- Job submission\nvia Airflow --> F 40 | K1 <--> K2 41 | K3 <--> K1 42 | K3 <--> K2 43 | F --> G 44 | J -- Bot Credentials --> A 45 | ``` 46 | -------------------------------------------------------------------------------- /docs/learning/dbt.md: -------------------------------------------------------------------------------- 1 | # dbt 2 | 3 | Many CalData projects use [dbt](https://www.getdbt.com/) 4 | for transforming and modeling data within our cloud data warehouses. 5 | dbt has become extremely popular over the last several years, 6 | popularizing the practice and position of "analytics engineering". 
7 | It has a number of features that makes it valuable for data stacks: 8 | 9 | * It works well with version control 10 | * It encourages modular, reusable SQL code 11 | * It makes it easier to track data lineage as it flows through your data warehouse 12 | * It has a large, active community with which you can share tips and techniques 13 | 14 | ## Learning dbt 15 | 16 | dbt provides a series of [free courses](https://courses.getdbt.com/collections) 17 | for learning how to use the project: 18 | 19 | * [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) 20 | * [Jinja, Macros, and Packages](https://courses.getdbt.com/courses/jinja-macros-packages) 21 | * [Advanced Materializations](https://courses.getdbt.com/courses/advanced-materializations) 22 | * [Refactoring SQL for Modularity](https://courses.getdbt.com/courses/refactoring-sql-for-modularity) 23 | -------------------------------------------------------------------------------- /docs/setup/dbt-setup.md: -------------------------------------------------------------------------------- 1 | # dbt project setup 2 | 3 | To set up a new project on dbt Cloud follow these steps: 4 | 5 | 1. Give your new project a name. 6 | 1. Click _Advanced settings_ and in the _Project subdirectory_ field, enter "transform" 7 | 1. Select a data warehouse connection. (e.g. Snowflake, BigQuery, Redshift) 8 | 1. For the _Development credentials_ section you'll want to choose between Snowflake OAuth or Key pair. In general, Snowflake OAuth is preferred for human users (which is what the development environment is for). It is also an enterprise dbt Cloud feature, so if working with a standard account, you'll need to use key pair. 9 | 10 | 1. For Snowflake OAuth: 11 | 12 | 1. Follow dbt's instructions for set up [here](https://docs.getdbt.com/docs/cloud/manage-access/set-up-snowflake-oauth) 13 | 14 | 1. For Key pair: 15 | 1. Under _Auth method_ select _Key pair_ 16 | 1. Enter your data warehouse username 17 | 1. Enter the private key and private key passphrase 18 | 1. For more guidance, read [dbt's docs on connecting to Snowflake via key pair](https://docs.getdbt.com/docs/cloud/connect-data-platform/connect-snowflake#key-pair) 19 | 20 | 1. Finally click the _Test Connection_ button. 21 | 1. Connect the appropriate repository. Read [dbt's docs on connecting to GitHub](https://docs.getdbt.com/docs/cloud/git/connect-github) or [dbt's docs on connecting to Azure DevOps](https://docs.getdbt.com/docs/cloud/git/setup-azure#register-an-azure-ad-app) and [Microsoft's docs on creating branch policies in DevOps](https://learn.microsoft.com/en-us/azure/devops/repos/git/pr-status-policy?view=azure-devops). To integrate dbtCloud with Azure DevOps, the service user (legacy) option must be used. Complete the steps found in the [documentation](https://docs.getdbt.com/docs/cloud/git/setup-azure#register-an-azure-ad-app). 22 | 23 | Once you're through the first five steps you can return to the dbt homepage and click the Settings button in the upper right corner. From there you can follow the steps to configure three environments for Continuous integration - CI, development, and production. Read [dbt's docs on CI in dbt Cloud](https://docs.getdbt.com/docs/deploy/continuous-integration). Read [dbt's docs on creating production (deployment) environments](https://docs.getdbt.com/docs/deploy/deploy-environments) and [dbt's docs on creating and scheduling deploy jobs](https://docs.getdbt.com/docs/deploy/deploy-jobs#create-and-schedule-jobs). 
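For reference, the same key pair credentials map directly onto a local `profiles.yml` target for dbt Core development. Below is a minimal sketch, assuming key pair auth and reusing the role/warehouse/database names from the `.devcontainer/profiles.yml` earlier in this repository (treat them and the account locator as placeholders for your own project):

```yaml
my_project:
  target: snowflake_dev
  outputs:
    snowflake_dev:
      type: snowflake
      account: "<account locator>" # placeholder
      user: "{{ env_var('SNOWFLAKE_USER') }}"
      role: TRANSFORMER_DEV
      warehouse: TRANSFORMING_XS_DEV
      database: TRANSFORM_DEV
      schema: "{{ env_var('DBT_SCHEMA') }}"
      private_key_path: "{{ env_var('SNOWFLAKE_PRIVATE_KEY_PATH') }}"
      private_key_passphrase: "{{ env_var('SNOWFLAKE_PRIVATE_KEY_PASSPHRASE') }}"
      threads: 4
```

dbt Cloud manages its own connections, so a file like this is only needed for local development or CI.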
24 | 25 | You'll also want to [configure notifications for job failures](https://docs.getdbt.com/docs/deploy/job-notifications). 26 | 27 | Pictured below is an example of environment variables you can set for each environment. For more guidance, read [dbt's docs on environment variables](https://docs.getdbt.com/docs/build/environment-variables). 28 | 29 | ![environment variables](../images/environment_variables.png) 30 | -------------------------------------------------------------------------------- /docs/setup/fivetran-setup.md: -------------------------------------------------------------------------------- 1 | # Fivetran project setup 2 | 3 | To set up a new project in Fivetran follow these steps: 4 | 5 | 1. First, ensure you have met the following pre-requisites: 6 | - You have set up a Snowflake Account for the project (follow all instructions from [here](./snowflake-setup.md)) 7 | - Ensure that your Snowflake project has a **LOADER_PRD** role with privileges to write data to the **RAW_PRD** database 8 | - You have created a Snowflake User called **FIVETRAN_SVC_USER_PRD** and ensured this user has the **LOADER_PRD** role 9 | - You have set up an auth key pair for this user and saved it to the ODI OnePass account 10 | 11 | 2. In Fivetran, navigate to Organization -> Accounts 12 | 3. Click _Add Acount_ 13 | 4. Choose an Account Name, select _Enterprise_ for Account Tier and _No restrictions_ for Required Authentication Type 14 | 5. Next, navigate to Destinations 15 | 6. Search for **Snowflake** and click _Select_ 16 | 7. To set up the Snowflake connector: 17 | 1. Name the destination **RAW_PRD** 18 | 2. Add the Snowflake URL for your project as the _Host_ 19 | 3. Add **FIVETRAN_SVC_USER_PRD** as the _User_ 20 | 4. Add **RAW_PRD** as the _Database_ 21 | 5. For _Auth_ select **KEY_PAIR** and enter the key pair details for the FIVETRAN_SVC_USER_PRD user 22 | 6. Add **LOADER_PRD** as the _Role_ 23 | 7. Optional: Most of the time, the cloud provider and region don't matter, but if a client is operating in a particular cloud/region and wants to minimize data transfer, it makes sense to select the client's _Cloud service provider_, _Cloud region_, and _Default Time Zone_ 24 | 8. Click the _Save & Test_ button 25 | 26 | Once you are through with these steps, you can proceed to creating and assigning permissions to Users in the Fivetran account. 27 | -------------------------------------------------------------------------------- /docs/setup/project-teardown.md: -------------------------------------------------------------------------------- 1 | # Tearing down a project 2 | 3 | Upon completion of a project (or if you just went through project setup for testing purposes) 4 | there are a few steps needed to tear down the infrastructure. 5 | 6 | 1. If the GitHub repository is to be handed off a client, transfer ownership of it to them. 7 | Otherwise, delete or archive the GitHub repository. 8 | If archiving, delete the GitHub actions secrets. 9 | 1. Open a Help Desk ticket with IT-Ops to remove Sentinel logging for the Snowflake account. 10 | 1. If the Snowflake account is to be handed off to a client, transfer ownership of it to them. 11 | Otherwise, [drop the account](https://docs.snowflake.com/en/user-guide/organizations-manage-accounts-delete). 
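For that last step, a minimal sketch of the drop itself, assuming you hold the `ORGADMIN` role; the account name is a placeholder and the grace period shown is illustrative:

```sql
use role orgadmin;
drop account my_project_account grace_period_in_days = 3;
```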
12 | -------------------------------------------------------------------------------- /docs/setup/repo-setup.md: -------------------------------------------------------------------------------- 1 | ## Create project git repository 2 | 3 | Create a new git repository from the CalData Infrastructure Template 4 | following the instructions [here](https://github.com/cagov/caldata-infrastructure-template#usage). 5 | 6 | Once you have created the repository, push it to a remote repository in GitHub. 7 | There are some GitHub actions that will fail because the repository is not yet 8 | configured to work with the new Snowflake account. 9 | 10 | ## Set up CI in GitHub 11 | 12 | The projects generated from our infrastructure template need read access to the 13 | Snowflake account in order to do two things from GitHub actions: 14 | 15 | 1. Verify that dbt models in branches compile and pass linter checks 16 | 1. Generate dbt docs upon merge to `main`. 17 | 18 | The terraform configurations deployed above create two service accounts 19 | for GitHub actions, a production one for docs and a dev one for CI checks. 20 | 21 | ### Add key pairs to the GitHub service accounts 22 | 23 | Set up key pairs for the two GitHub actions service accounts 24 | (`GITHUB_ACTIONS_SVC_USER_DEV` and `GITHUB_ACTIONS_SVC_USER_PRD`). 25 | This follows a similar procedure to what you did for your personal key pair, 26 | though the project template currently does not assume an encrypted key pair. 27 | [This bash script](https://gist.github.com/ian-r-rose/35d49bd253194f57b57e9e59a595bed8) 28 | is a helpful shortcut for generating the key pair: 29 | ```bash 30 | bash generate_key.sh 31 | ``` 32 | 33 | Once you have created and set the key pairs, add them to the DSE 1Password shared vault. 34 | Make sure to provide enough information to disambiguate the key pair from others stored in the vault, 35 | including: 36 | 37 | * The account locator (legacy account identifier) 38 | * The organization name 39 | * The account name (distinct from the account locator) 40 | * Note : The preferred account identifier is to use name of the account prefixed by its organization (e.g. myorg-account123) 41 | * The service account name 42 | * The public key 43 | * The private key 44 | 45 | ### Set up GitHub actions secrets 46 | 47 | You need to configure secrets in GitHub actions 48 | in order for the service accounts to be able to connect to your Snowflake account. 49 | From the repository page, go to "Settings", then to "Secrets and variables", then to "Actions". 50 | 51 | Add the following repository secrets: 52 | 53 | | Variable | Value | 54 | |----------|-------| 55 | | `SNOWFLAKE_ACCOUNT` | new account locator | 56 | | `SNOWFLAKE_USER_DEV` | `GITHUB_ACTIONS_SVC_USER_DEV` | 57 | | `SNOWFLAKE_USER_PRD` | `GITHUB_ACTIONS_SVC_USER_PRD` | 58 | | `SNOWFLAKE_PRIVATE_KEY_DEV` | dev service account private key | 59 | | `SNOWFLAKE_PRIVATE_KEY_PRD` | prd service account private key | 60 | 61 | ## Enable GitHub pages for the repository 62 | 63 | The repository must have GitHub pages enabled in order for it to deploy and be viewable. 64 | 65 | 1. From the repository page, go to "Settings", then to "Pages". 66 | 1. Under "GitHub Pages visibility" select "Private" (unless the project is public!). 67 | 1. Under "Build and deployment" select "Deploy from a branch" and choose "gh-pages" as your branch. 
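Putting the CI credential steps above together, here is a rough command-line sketch. It assumes the unencrypted key pair commands from Snowflake's key-pair docs (equivalent to the helper script linked above) and the GitHub CLI; the file names and account locator are placeholders:

```bash
# Generate an unencrypted key pair for the dev service account
openssl genrsa 2048 | openssl pkcs8 -topk8 -inform PEM -out github_actions_dev_rsa_key.p8 -nocrypt
openssl rsa -in github_actions_dev_rsa_key.p8 -pubout -out github_actions_dev_rsa_key.pub

# Register the repository secrets used by the GitHub Actions workflows
gh secret set SNOWFLAKE_ACCOUNT --body "<account locator>"
gh secret set SNOWFLAKE_USER_DEV --body "GITHUB_ACTIONS_SVC_USER_DEV"
gh secret set SNOWFLAKE_PRIVATE_KEY_DEV < github_actions_dev_rsa_key.p8
```

Repeat with the `*_PRD` names for the production service account.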
68 | -------------------------------------------------------------------------------- /docs/setup/sentinel-setup.md: -------------------------------------------------------------------------------- 1 | # Set up Sentinel logging 2 | 3 | ODI IT requires that systems log to our Microsoft Sentinel instance 4 | for compliance with security monitoring policies. 5 | The terraform configuration deployed above creates a service account for Sentinel 6 | which needs to be integrated. 7 | 8 | 1. Create a password for the Sentinel service account. 9 | In other contexts we prefer key pairs for service accounts, but the Sentinel 10 | integration requires password authentication. In a Snowflake worksheet run: 11 | ```sql 12 | use role securityadmin; 13 | alter user sentinel_svc_user_prd set password = '' 14 | ``` 15 | 1. Store the Sentinel service account authentication information in our shared 16 | 1Password vault. 17 | Make sure to provide enough information to disambiguate it from others stored in the vault, 18 | including: 19 | 20 | * The account locator (legacy account identifier) 21 | * The organization name 22 | * The account name (distinct from the account locator) 23 | * Note : The preferred account identifier is to use name of the account prefixed by its organization (e.g. myorg-account123) 24 | * The service account name 25 | * The public key 26 | * The private key 27 | 28 | 1. Create an IT Help Desk ticket to add the new account to our Sentinel instance. 29 | Share the 1Password item with the IT-Ops staff member who is implementing the ticket. 30 | If you've included all of the above information in the vault item, 31 | it should be all they need. 32 | 1. Within fifteen minutes or so of implementation it should be clear whether the integration is working. 33 | IT-Ops should be able to see logs ingesting, and Snowflake account admins should see queries 34 | from the Sentinel service user. 35 | -------------------------------------------------------------------------------- /docs/setup/snowflake-service-accounts.md: -------------------------------------------------------------------------------- 1 | ### Create service accounts using Terraform 2 | 3 | Service accounts aren't associated with a human user. 4 | Instead, they are created by an account administrator for 5 | the purposes of allowing another service to perform some action. 6 | 7 | We currently use service accounts for: 8 | 9 | * Fivetran loading raw data 10 | * Airflow loading raw data 11 | * dbt Cloud for transforming data 12 | * GitHub actions generating docs 13 | 14 | These service accounts are created using Terraform 15 | and assigned roles according to the principle of least-privilege. 16 | They use key pair authentication, which is more secure than password-based authentication as no sensitive data are exchanged. 17 | Private keys for service accounts should be stored in CalData's 1Password vault. 18 | 19 | The following are steps for creating a new service account with key pair authentication: 20 | 21 | 1. Create a new key pair in accordance with [these docs](https://docs.snowflake.com/en/user-guide/key-pair-auth#configuring-key-pair-authentication). 22 | Most of the time, you should create a key pair with encryption enabled for the private key. 23 | 1. Add the private key to the CalData 1Password vault, along with the intended service account user name and passphrase (if applicable) 24 | 1. Create a new user in the Snowflake Terraform configuration (`users.tf`) and assign it the appropriate functional role. 
25 | Once the user is created, add its public key in the Snowflake UI: 26 | ```sql 27 | ALTER USER SET RSA_PUBLIC_KEY='MII...' 28 | ``` 29 | Note that we need to remove the header and trailer (i.e. `-- BEGIN PUBLIC KEY --`) as well as any line breaks 30 | in order for Snowflake to accept the public key as valid. 31 | 1. Add the *private* key for the user to whatever system needs to access Snowflake. 32 | 33 | Service accounts should not be shared across different applications, 34 | so if one becomes compromised, the damage is more isolated. 35 | -------------------------------------------------------------------------------- /docs/setup/terraform-project-setup.md: -------------------------------------------------------------------------------- 1 | # Deploy project infrastructure using Terraform 2 | 3 | We will create two separate deployments of the project infrastructure, 4 | one for development, and one for production. 5 | In some places we will refer to project name and owner as `` and ``, respectively, 6 | following our [naming conventions](../learning/naming-conventions.md). 7 | You should substitute the appropriate names there. 8 | 9 | ## Create the dev configuration 10 | 11 | 1. Ensure that your environment has environment variables set for 12 | `SNOWFLAKE_ACCOUNT`, `SNOWFLAKE_USER`, `SNOWFLAKE_PRIVATE_KEY_PATH`, and `SNOWFLAKE_PRIVATE_KEY_PASSPHRASE`. 13 | Make sure you *don't* have any other `SNOWFLAKE_*` variables set, 14 | as they can interfere with authentication. 15 | 1. In the new git repository, create a directory to hold the development Terraform configuration: 16 | ```bash 17 | mkdir -p terraform/environments/dev/ 18 | ``` 19 | The location of this directory is by convention, and subject to change. 20 | 1. Copy the terraform configuration from 21 | [here](https://github.com/cagov/data-infrastructure/blob/main/terraform/snowflake/environments/dev/main.tf) 22 | to your `dev` directory. 23 | 1. In the "elt" module of `main.tf`, change the `source` parameter to point to 24 | `"github.com/cagov/data-infrastructure.git//terraform/snowflake/modules/elt?ref="` 25 | where `` is the short hash of the most recent commit in the `data-infrastructure` repository. 26 | 1. In the `dev` directory, create a new backend configuration file called `--dev.tfbackend`. 27 | The file will point to the S3 bucket in which we are storing terraform state. Populate the backend 28 | configuration file with the following (making sure to substitute values for `` and ``): 29 | ```hcl 30 | bucket = "dse-snowflake-dev-terraform-state" 31 | dynamodb_table = "dse-snowflake-dev-terraform-state-lock" 32 | key = "--dev.tfstate" 33 | region = "us-west-2" 34 | ``` 35 | 1. In the `dev` directory, create a terraform variables file called `terraform.tfvars`, 36 | and populate the "elt" module variables. These variables may expand in the future, 37 | but at the moment they are just the new Snowflake organization name, account name and the environment 38 | (in this case `"DEV"`): 39 | ```hcl 40 | organization_name = "" 41 | account_name = "" 42 | environment = "DEV" 43 | ``` 44 | 1. Initialize the configuration: 45 | ```bash 46 | terraform init -backend-config --dev.tfbackend 47 | ``` 48 | 1. Include both Mac and Linux provider binaries in your terraform lock file. 49 | This helps mitigate differences between CI environments and ODI Macs: 50 | ```bash 51 | terraform providers lock -platform=linux_amd64 -platform=darwin_amd64 52 | ``` 53 | 1. 
Add your new `main.tf`, `terraform.tfvars`, `--dev.tfbackend`, 54 | and terraform lock file to the git repository. Do not add the `.terraform/` directory. 55 | 56 | ## Deploy the dev configuration 57 | 58 | 1. Ensure that your local environment has environment variables set for `SNOWFLAKE_ACCOUNT`, 59 | `SNOWFLAKE_USER`, `SNOWFLAKE_PRIVATE_KEY_PATH`, and `SNOWFLAKE_PRIVATE_KEY_PASSPHRASE`, 60 | and that they are set to your new account, rather than any other accounts. 61 | 1. Run `terraform plan` to see the plan for the resources that will be created. 62 | Inspect the plan to see that everything looks correct. 63 | 1. Run `terraform apply` to deploy the configuration. This will actually create the infrastructure! 64 | 65 | ## Configure and deploy the production configuration 66 | 67 | Re-run all of the steps above, but in a new directory `terraform/environments/prd`. 68 | Everywhere where there is a `dev` (or `DEV`), replace it with a `prd` (or `PRD`). 69 | -------------------------------------------------------------------------------- /docs/static/AccessFootprintsArcPro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/docs/static/AccessFootprintsArcPro.pdf -------------------------------------------------------------------------------- /docs/static/Download.MS.Global.Footprints.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/docs/static/Download.MS.Global.Footprints.zip -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | :root, 2 | [data-md-color-scheme="slate"], 3 | [data-md-color-scheme="default"] { 4 | --md-primary-fg-color: #00315F; 5 | --md-primary-fg-color--light: #0D4F8C; 6 | --md-primary-fg-color--dark: #00315F; 7 | --md-primary-bg-color: #ECF1F6; 8 | --md-primary-bg-color--light: #FAECDA; 9 | 10 | --md-accent-fg-color: #E3881B; 11 | --md-accent-fg-color--transparent: #FAFAFA; 12 | --md-accent-bg-color: #FAECDA; 13 | --md-accent-bg-color--light: #FAFAFA; 14 | } 15 | -------------------------------------------------------------------------------- /images/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mambaorg/micromamba:1.2.0 2 | 3 | COPY --chown=$MAMBA_USER:$MAMBA_USER images/environment.yml /tmp/environment.yml 4 | 5 | RUN micromamba install --verbose -n base --file /tmp/environment.yml && \ 6 | micromamba clean --all --yes 7 | 8 | COPY --chown=$MAMBA_USER:$MAMBA_USER jobs /home/$MAMBA_USER/jobs 9 | WORKDIR /home/$MAMBA_USER 10 | -------------------------------------------------------------------------------- /images/environment.yml: -------------------------------------------------------------------------------- 1 | name: base 2 | channels: 3 | - conda-forge 4 | - nodefaults 5 | dependencies: 6 | - python=3.11 7 | - fsspec=2023.12.0 8 | - fiona=1.10.1 9 | - geopandas=1.0.1 # version that fixes the fiona path module issue 10 | - mercantile=1.2.1 11 | - pandas 12 | - pyarrow 13 | - s3fs=2023.12.0 14 | - snowflake-connector-python=3.5.0 15 | - pip 16 | - pip: 17 | - pygris 18 | -------------------------------------------------------------------------------- /jobs/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Jobs for task runners or orchestrators.""" 2 | -------------------------------------------------------------------------------- /jobs/geo/__init__.py: -------------------------------------------------------------------------------- 1 | """Geospatial jobs.""" 2 | -------------------------------------------------------------------------------- /jobs/geo/load_global_ml_building_footprints.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | from jobs.utils.snowflake import gdf_to_snowflake, snowflake_connection_from_environment 6 | 7 | HERE = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | 10 | def load_state_footprints(conn) -> None: 11 | """Load Microsoft Global ML building footprints dataset for California.""" 12 | import fsspec 13 | import geopandas 14 | import mercantile 15 | import pandas 16 | import shapely.geometry 17 | 18 | print("Identifying California quadkeys") 19 | df = pandas.read_csv( 20 | "https://minedbuildings.z5.web.core.windows.net/global-buildings/dataset-links.csv", 21 | dtype={"QuadKey": "str"}, # Don't use an int, since there are leading zeros! 22 | ) 23 | 24 | # Get the shape of California so that we can identify the quadkeys which intersect 25 | california = ( 26 | geopandas.read_file(os.path.join(HERE, "data", "california.geojson")) 27 | .iloc[0] 28 | .geometry 29 | ) 30 | 31 | # As a first pass, find all the tiles which intersect the bounding box, 32 | # since that's what mercantile knows how to do. 33 | features = [] 34 | for tile in mercantile.tiles(*california.bounds, zooms=9): 35 | features.append( 36 | { 37 | "quadkey": mercantile.quadkey(tile), 38 | "geometry": shapely.geometry.shape( 39 | mercantile.feature(tile)["geometry"] 40 | ), 41 | } 42 | ) 43 | 44 | # As a second pass, prune out the tiles which don't actually intersect California 45 | quadkeys = geopandas.GeoDataFrame.from_records(features).set_geometry("geometry") 46 | california_quadkeys = quadkeys[quadkeys.intersects(california)] 47 | 48 | # Now get a list of all the URLs which have a quadkey intersecting California 49 | california_data = df[ 50 | df.QuadKey.isin(california_quadkeys.quadkey) & (df.Location == "UnitedStates") 51 | ] 52 | 53 | overwrite = True # For the first subset, overwrite any existing table 54 | for _, row in california_data.iterrows(): 55 | print(f"Reading quadkey {row.QuadKey}") 56 | with fsspec.open(row.Url, compression="infer") as f: 57 | gdf = geopandas.read_file(f, driver="GeoJSONSeq") 58 | # If we include quadkeys here it could help with Snowflake partitioning 59 | gdf = gdf.assign(quadkey=row.QuadKey) 60 | gdf_to_snowflake( 61 | gdf, 62 | conn, 63 | table_name="GLOBAL_ML_BUILDING_FOOTPRINTS", 64 | overwrite=overwrite, 65 | strict_geometries=False, 66 | ) 67 | overwrite = False # For all subsequent gdfs, append 68 | 69 | 70 | if __name__ == "__main__": 71 | conn = snowflake_connection_from_environment( 72 | schema="BUILDING_FOOTPRINTS", 73 | client_session_keep_alive=True, # This can be a slow job! 
Keep the session alive 74 | ) 75 | load_state_footprints(conn) 76 | -------------------------------------------------------------------------------- /jobs/geo/load_us_building_footprints.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from jobs.utils.snowflake import gdf_to_snowflake, snowflake_connection_from_environment 4 | 5 | 6 | def load_state_footprints(conn) -> None: 7 | """Load Microsoft state building footprints dataset for California.""" 8 | import geopandas 9 | 10 | print("Downloading data") 11 | gdf = geopandas.read_file( 12 | "https://minedbuildings.z5.web.core.windows.net/legacy/usbuildings-v2/California.geojson.zip" 13 | ) 14 | 15 | print("Writing data to snowflake") 16 | gdf_to_snowflake( 17 | gdf, 18 | conn, 19 | table_name="US_BUILDING_FOOTPRINTS", 20 | cluster=False, 21 | ) 22 | 23 | 24 | if __name__ == "__main__": 25 | conn = snowflake_connection_from_environment( 26 | schema="BUILDING_FOOTPRINTS", 27 | client_session_keep_alive=True, # This can be a slow job! Keep the session alive 28 | ) 29 | load_state_footprints(conn) 30 | -------------------------------------------------------------------------------- /jobs/geo/tiger.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from jobs.utils.snowflake import gdf_to_snowflake, snowflake_connection_from_environment 4 | 5 | 6 | def load_geo_data(conn, year: int) -> None: 7 | """Load CA Census geo data into Snowflake.""" 8 | from pygris import ( 9 | block_groups, 10 | blocks, 11 | coastline, 12 | combined_statistical_areas, 13 | core_based_statistical_areas, 14 | counties, 15 | county_subdivisions, 16 | divisions, 17 | nation, 18 | native_areas, 19 | places, 20 | primary_roads, 21 | primary_secondary_roads, 22 | pumas, 23 | rails, 24 | regions, 25 | states, 26 | tracts, 27 | tribal_block_groups, 28 | tribal_subdivisions_national, 29 | urban_areas, 30 | ) 31 | 32 | print(f"Downloading data for CA in year {year}") 33 | 34 | ca_loaders = { 35 | "COUNTIES": counties, 36 | "TRACTS": tracts, 37 | "BLOCK_GROUPS": block_groups, 38 | "BLOCKS": blocks, 39 | "PLACES": places, 40 | "PUMAS": pumas, 41 | "COUNTY_SUBDIVISIONS": county_subdivisions, 42 | "PRIMARY_SECONDARY_ROADS": primary_secondary_roads, 43 | } 44 | 45 | us_loaders = { 46 | "COASTLINE": coastline, 47 | "DIVISIONS": divisions, 48 | "NATION": nation, 49 | "NATIVE_AREAS": native_areas, 50 | "PRIMARY_ROADS": primary_roads, 51 | "RAILS": rails, 52 | "REGIONS": regions, 53 | "STATES": states, 54 | "TRIBAL_BLOCK_GROUPS": tribal_block_groups, 55 | "TRIBAL_SUBDIVISIONS_NATIONAL": tribal_subdivisions_national, 56 | "URBAN_AREAS": urban_areas, 57 | "CORE_BASED_STATISTICAL_AREAS": core_based_statistical_areas, 58 | "COMBINED_STATISTICAL_AREAS": combined_statistical_areas, 59 | } 60 | 61 | state = "CA" 62 | 63 | for table_name, loader in ca_loaders.items(): 64 | try: 65 | gdf_to_snowflake( 66 | loader(state=state, year=year) 67 | .reset_index(drop=True) 68 | .to_crs( 69 | epsg=4326 70 | ), # using .reset_index(drop=True) to address the following UserWarning: 71 | # Pandas Dataframe has non-standard index of type which will not be written. 
Consider changing the 72 | # index to pd.RangeIndex(start=0,...,step=1) or call reset_index() to keep index as column(s) 73 | conn, 74 | table_name=table_name, 75 | cluster=False, 76 | ) 77 | except ValueError as value_error: 78 | print( 79 | f"This ValueError: {value_error} This pertains to this CA loader: {table_name}" 80 | ) 81 | 82 | for table_name, loader in us_loaders.items(): 83 | try: 84 | gdf_to_snowflake( 85 | loader(year=year) 86 | .reset_index(drop=True) 87 | .to_crs( 88 | epsg=4326 89 | ), # using .reset_index(drop=True) to address the following UserWarning: 90 | # Pandas Dataframe has non-standard index of type which will not be written. Consider changing the 91 | # index to pd.RangeIndex(start=0,...,step=1) or call reset_index() to keep index as column(s) 92 | conn, 93 | table_name=table_name, 94 | cluster=False, 95 | ) 96 | except ValueError as value_error: 97 | print( 98 | f"This ValueError: {value_error} This pertains to this US loader: {table_name}" 99 | ) 100 | 101 | 102 | if __name__ == "__main__": 103 | # TODO: perhaps make a real CLI here. 104 | import sys 105 | 106 | N_ARGS = 2 107 | assert len(sys.argv) == N_ARGS, "Expecting 1 argument: year (four digits)" 108 | 109 | year = int(sys.argv[1]) 110 | 111 | conn = snowflake_connection_from_environment( 112 | schema=f"TIGER_{year}", 113 | client_session_keep_alive=True, 114 | ) 115 | 116 | load_geo_data(conn, year) 117 | -------------------------------------------------------------------------------- /jobs/geo/write_building_footprints.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | from jobs.utils.snowflake import snowflake_connection_from_environment 6 | 7 | 8 | def write_building_footprints(conn, kind: str): 9 | """Grab Microsoft building footprints data enriched with Census TIGER Blocks data for California from Snowflake and write to an S3 bucket.""" 10 | import geopandas 11 | import s3fs 12 | import shapely.wkb 13 | 14 | sql_alter = """ 15 | alter session set GEOGRAPHY_OUTPUT_FORMAT='WKB'; 16 | """ 17 | conn.cursor().execute(sql_alter) 18 | 19 | ref = ( 20 | "ANALYTICS_PRD.BUILDING_FOOTPRINTS" 21 | f".GEO_REFERENCE__{kind.upper()}_BUILDING_FOOTPRINTS_WITH_TIGER" 22 | ) 23 | sql_counties = f""" 24 | SELECT DISTINCT "county_fips" 25 | FROM {ref} 26 | ORDER BY 1 ASC 27 | """ 28 | 29 | counties = conn.cursor().execute(sql_counties).fetchall() 30 | counties = [x[0] for x in counties if x[0] is not None] 31 | 32 | for index, county in enumerate(counties): 33 | sql_table = f""" 34 | SELECT * 35 | FROM {ref} 36 | WHERE "county_fips" = {county} 37 | """ 38 | df = conn.cursor().execute(sql_table).fetch_pandas_all() 39 | gdf = geopandas.GeoDataFrame( 40 | df.assign(geometry=df.geometry.apply(shapely.wkb.loads)), 41 | crs="EPSG:4326", 42 | ) 43 | 44 | gdf = gdf[gdf.geometry.geom_type != "GeometryCollection"] 45 | 46 | file_prefix = f"county_fips_{county}" 47 | gdf.to_parquet(f"{file_prefix}.parquet") 48 | # .shz suffix triggers GDAL to write zipped shapefile 49 | gdf.to_file(f"{file_prefix}.shz") 50 | 51 | print( 52 | f"Loading {file_prefix}. This is number {index+1} out of {len(counties)} counties." 53 | ) 54 | 55 | s3 = s3fs.S3FileSystem(anon=False) 56 | s3.put( 57 | f"{file_prefix}.parquet", 58 | f"s3://dof-demographics-dev-us-west-2-public/{kind}_building_footprints/parquet/{file_prefix}.parquet", 59 | ) 60 | # Esri doesn't like .shp.zip or .shz, so rename to just be .zip. 
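# The local file on disk is still the GDAL-produced zipped shapefile written above;
# only the S3 object key used below gets the .zip extension.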
61 | s3.put( 62 | f"{file_prefix}.shz", 63 | f"s3://dof-demographics-dev-us-west-2-public/{kind}_building_footprints/shp/{file_prefix}.zip", 64 | ) 65 | 66 | os.remove(f"{file_prefix}.parquet") 67 | os.remove(f"{file_prefix}.shz") 68 | 69 | 70 | if __name__ == "__main__": 71 | import sys 72 | 73 | N_ARGS = 1 74 | 75 | # This is a bit of a hack: our batch jobs are designed more around loading data 76 | # to a data warehouse than unloading it, and so the default connection parameters 77 | # specify a LOADER role. Here we replace that with a REPORTER role for grabbing 78 | # data from the marts db. 79 | os.environ["SNOWFLAKE_ROLE"] = os.environ["SNOWFLAKE_ROLE"].replace( 80 | "LOADER", "REPORTER" 81 | ) 82 | os.environ["SNOWFLAKE_WAREHOUSE"] = os.environ["SNOWFLAKE_WAREHOUSE"].replace( 83 | "LOADING", "REPORTING" 84 | ) 85 | 86 | conn = snowflake_connection_from_environment( 87 | client_session_keep_alive=True, # This can be a slow job! Keep the session alive 88 | ) 89 | 90 | if len(sys.argv) != N_ARGS + 1 or sys.argv[1] not in ("global_ml", "us"): 91 | raise ValueError( 92 | "Must provide specify one of 'global_ml' or 'us' for building footprint source" 93 | ) 94 | 95 | write_building_footprints(conn, kind=sys.argv[1]) 96 | -------------------------------------------------------------------------------- /jobs/test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import geopandas 6 | 7 | URL = "https://usbuildingdata.blob.core.windows.net/usbuildings-v2/Alaska.geojson.zip" 8 | 9 | if __name__ == "__main__": 10 | gdf = geopandas.read_file(URL) 11 | gdf.to_parquet(f"s3://{os.environ['BUCKET']}/alaska.parquet") 12 | -------------------------------------------------------------------------------- /jobs/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Common utilities.""" 2 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: CalData Data Services and Engineering Infrastructure 2 | theme: 3 | name: material 4 | logo: images/odi-square_logomark-blue.svg 5 | favicon: images/odi-circle_logomark-blue.png 6 | features: 7 | - content.code.copy 8 | palette: 9 | # Palette toggle for light mode 10 | - scheme: default 11 | toggle: 12 | icon: material/weather-night 13 | name: Switch to dark mode 14 | 15 | # Palette toggle for dark mode 16 | - scheme: slate 17 | toggle: 18 | icon: material/weather-sunny 19 | name: Switch to light mode 20 | extra_css: 21 | - stylesheets/extra.css 22 | 23 | repo_name: cagov/data-infrastructure 24 | repo_url: https://github.com/cagov/data-infrastructure 25 | edit_uri: "" 26 | 27 | markdown_extensions: 28 | - toc: 29 | permalink: true 30 | - admonition 31 | - pymdownx.highlight: 32 | auto_title: false 33 | - pymdownx.superfences: 34 | custom_fences: 35 | - name: mermaid 36 | class: mermaid 37 | format: !!python/name:pymdownx.superfences.fence_code_format 38 | 39 | nav: 40 | - Introduction: index.md 41 | - Code development: 42 | - Local environment setup: code/local-setup.md 43 | - Codespaces: code/codespaces.md 44 | - Code Review: code/code-review.md 45 | - Writing Documentation: code/writing-documentation.md 46 | - Terraform Setup: code/terraform-local-setup.md 47 | - GitHub Project Management: code/github-project-management.md 48 | - Azure DevOps Project Management: 
code/azdevops-project-management.md 49 | - Project infrastructure: 50 | - Cloud infrastructure: infra/cloud-infrastructure.md 51 | - Project architecture: infra/architecture.md 52 | - Snowflake overview: infra/snowflake.md 53 | - Project setup: 54 | - Snowflake setup: setup/snowflake-setup.md 55 | - git/Github setup: setup/repo-setup.md 56 | - Terraform setup: setup/terraform-project-setup.md 57 | - Sentinel setup: setup/sentinel-setup.md 58 | - dbt Cloud setup: setup/dbt-setup.md 59 | - Adding service accounts: setup/snowflake-service-accounts.md 60 | - Project teardown: setup/project-teardown.md 61 | - dbt: 62 | - dbt overview: dbt/dbt.md 63 | - dbt performance: dbt/dbt-performance.md 64 | - dbt Cloud Snowflake project: dbt_docs_snowflake/index.html 65 | - Data: 66 | - Building footprints: data/footprints.md 67 | - Learning: 68 | - MDSA glossary: learning/glossary.md 69 | - Security guidelines: learning/security.md 70 | - Naming conventions: learning/naming-conventions.md 71 | - git: learning/git.md 72 | - dbt: learning/dbt.md 73 | - Cloud data warehouses: learning/cloud-data-warehouses.md 74 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 88 3 | select = [ 4 | "B", # flake8-bugbear 5 | "BLE", # flake8-blind-except 6 | "C4", # comprehensions 7 | "D", # pydocstyle 8 | "E", # pycodestyle 9 | "F", # pyflakes 10 | "I", # isort 11 | "ISC", # flake8-implicit-str-concat 12 | "PGH", # pygrep-hooks 13 | "PLC", # pylint 14 | "PLE", # pylint 15 | "PLR", # pylint import style 16 | "PLW", # pylint 17 | "RET", # flake8-return 18 | "RUF", # ruff-specific rules 19 | "SIM", # flake8-simplify 20 | "T10", # flake8-debugger 21 | "TID", # flake8-tidy-imports 22 | "UP", # pyupgrade 23 | "W", # pycodestyle 24 | "YTT", # flake8-2020 25 | 26 | ] 27 | respect-gitignore = true 28 | ignore = [ 29 | "D100", # public module 30 | "D101", # public class 31 | "D102", # public method 32 | "D103", # public function 33 | "D104", # public package 34 | "D203", # blank line before docstring 35 | "D212", # Start multi-line docstring at the second line. 
36 | "E501", # line length handled by black 37 | "ISC001", # Handled by formatter 38 | "PGH003", # specific mypy ignore codes 39 | "PLR0913", # too many arguments 40 | "PLR0912", # too many branches 41 | "RET505", # no-else-return 42 | "RET506", # no-else-raise 43 | ] 44 | target-version = "py310" 45 | 46 | [tool.mypy] 47 | python_version = "3.10" 48 | allow_untyped_decorators = true # would love to enable this, but airflow decorators are untyped 49 | ignore_missing_imports = true 50 | no_implicit_optional = true 51 | show_error_codes = true 52 | warn_redundant_casts = true 53 | warn_unused_ignores = false 54 | warn_unreachable = true 55 | 56 | [tool.poetry] 57 | name = "data-infrastructure" 58 | version = "0.1.0" 59 | description = "\"CalData Data Services and Engineering Infrastructure\"" 60 | authors = ["Ian Rose "] 61 | license = "MIT" 62 | readme = "README.md" 63 | package-mode = false 64 | 65 | [tool.poetry.dependencies] 66 | python = "^3.10" 67 | mkdocs-material = "~9.1.3" 68 | dbt-core = "~1.8.0" 69 | dbt-snowflake = "~1.8.0" 70 | awscliv2 = "^2.2.0" 71 | 72 | [tool.poetry.group.dev.dependencies] 73 | pre-commit = "^3.3.1" 74 | sqlfluff = "3.0.7" 75 | sqlfluff-templater-dbt = "3.0.7" 76 | 77 | [build-system] 78 | requires = ["poetry-core"] 79 | build-backend = "poetry.core.masonry.api" 80 | -------------------------------------------------------------------------------- /terraform/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform 2 | *.tfstate 3 | *.tfstate.backup 4 | *.tfstate.*.backup 5 | # Created by pre-commit, but not intended to be a stand-alone module. 6 | s3-remote-state/.terraform.lock.hcl 7 | -------------------------------------------------------------------------------- /terraform/aws/environments/dev/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "4.56.0" 6 | constraints = "4.56.0" 7 | hashes = [ 8 | "h1:CnpBvf3mH16Kcez47OsjmIeGkY2PUVihKRwbkyOvo48=", 9 | "h1:koDunHl/LUmCAKy3VFie6MakXN7ng93v8HBRpKI8He8=", 10 | "h1:v6DE95Ll2mxE96IGUsT/h6WQTU1d2cfHydWah1FgT20=", 11 | "zh:1d2b7693a102da015a86b9235b554272b9280597011216c3ddd1a6dc95ad8dab", 12 | "zh:28c3e8ebaa077f65c4ac5fd051c95887070293fcff0386dfc2e4b7e248a0aefa", 13 | "zh:2a620bc4a87be06e7acac1bc15e966dba45df643bf6c3efb811e74e6c2122b03", 14 | "zh:30d3ac148fa0634e7ba1de66e1af1328481c92cd774adcfc0e27f828103b17e0", 15 | "zh:3d3eebf916f25e11b12dd3c692f8fe1e4c4e9a0c414af9d0d881ddebd28dcd39", 16 | "zh:3f4600f2881c02fcc69080df68747c9a0b9b11cb002117fd918b7800f2ac402b", 17 | "zh:7156fb12c3b4f2964f7e78cee97f31d95b43045467f90749d2ed545725c36baa", 18 | "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", 19 | "zh:a5bbc84fd37d468c7b016009776b6d2a287bbb746af81aba786cdf8eb5fce4a1", 20 | "zh:d5322bcd4e11caddbbfaa1198893824d4b4d28f504517a3a87902cf86d75bd87", 21 | "zh:d766eb9f86a40060d63e12ef674d7c9c47ec4e47ade487f1f49af8c89b441711", 22 | "zh:df23f592b99f6617f09e449009bbb49068a69fc926b15ca29e30b068c9c67365", 23 | "zh:e7b0acee2d98549731547259b539f598e18db07c0c202d3a34b922beff711054", 24 | "zh:ec317f79fdcce934c39458ea312862e7f7ec48cafb8bcc9b5a00d9b78b629d81", 25 | "zh:f78ec7a771867d96dfee96bf74523341ba42feeb64ce2f108b5bf2e7ebef0fef", 26 | ] 27 | } 28 | 29 | provider "registry.terraform.io/hashicorp/random" { 30 | version = "3.4.3" 31 | constraints = "3.4.3" 32 | hashes = [ 33 | "h1:saZR+mhthL0OZl4SyHXZraxyaBNVMxiZzks78nWcZ2o=", 34 | "h1:tL3katm68lX+4lAncjQA9AXL4GR/VM+RPwqYf4D2X8Q=", 35 | "h1:xZGZf18JjMS06pFa4NErzANI98qi59SEcBsOcS2P2yQ=", 36 | "zh:41c53ba47085d8261590990f8633c8906696fa0a3c4b384ff6a7ecbf84339752", 37 | "zh:59d98081c4475f2ad77d881c4412c5129c56214892f490adf11c7e7a5a47de9b", 38 | "zh:686ad1ee40b812b9e016317e7f34c0d63ef837e084dea4a1f578f64a6314ad53", 39 | "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", 40 | "zh:84103eae7251384c0d995f5a257c72b0096605048f757b749b7b62107a5dccb3", 41 | "zh:8ee974b110adb78c7cd18aae82b2729e5124d8f115d484215fd5199451053de5", 42 | "zh:9dd4561e3c847e45de603f17fa0c01ae14cae8c4b7b4e6423c9ef3904b308dda", 43 | "zh:bb07bb3c2c0296beba0beec629ebc6474c70732387477a65966483b5efabdbc6", 44 | "zh:e891339e96c9e5a888727b45b2e1bb3fcbdfe0fd7c5b4396e4695459b38c8cb1", 45 | "zh:ea4739860c24dfeaac6c100b2a2e357106a89d18751f7693f3c31ecf6a996f8d", 46 | "zh:f0c76ac303fd0ab59146c39bc121c5d7d86f878e9a69294e29444d4c653786f8", 47 | "zh:f143a9a5af42b38fed328a161279906759ff39ac428ebcfe55606e05e1518b93", 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /terraform/aws/environments/dev/dse-infra-dev.tfbackend: -------------------------------------------------------------------------------- 1 | bucket = "dse-infra-dev-terraform-state" 2 | dynamodb_table = "dse-infra-dev-terraform-state-lock" 3 | key = "dse-infra-dev.tfstate" 4 | region = "us-west-1" 5 | -------------------------------------------------------------------------------- /terraform/aws/environments/dev/main.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # Terraform Setup # 3 | ################################## 4 | 5 | terraform { 6 | required_version = ">= 1.0" 7 | 8 | required_providers { 9 | aws = { 10 | source = "hashicorp/aws" 11 | version = "4.56.0" 12 | } 13 | random = { 14 | source = 
"hashicorp/random" 15 | version = "3.4.3" 16 | } 17 | } 18 | 19 | backend "s3" { 20 | } 21 | } 22 | 23 | locals { 24 | owner = "dse" 25 | environment = "dev" 26 | project = "infra" 27 | region = "us-west-2" 28 | } 29 | 30 | provider "aws" { 31 | region = local.region 32 | 33 | default_tags { 34 | tags = { 35 | Owner = local.owner 36 | Project = local.project 37 | Environment = local.environment 38 | } 39 | } 40 | } 41 | 42 | ############################ 43 | # Infrastructure # 44 | ############################ 45 | 46 | module "infra" { 47 | source = "../../modules/infra" 48 | 49 | owner = local.owner 50 | environment = local.environment 51 | project = local.project 52 | snowflake_loader_secret = { 53 | test = "dse-snowflake-dev-us-west-2-loader" 54 | latest = "dse-snowflake-prd-us-west-2-loader" 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /terraform/aws/environments/dev/remote-state/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "4.56.0" 6 | constraints = "4.56.0" 7 | hashes = [ 8 | "h1:CnpBvf3mH16Kcez47OsjmIeGkY2PUVihKRwbkyOvo48=", 9 | "h1:koDunHl/LUmCAKy3VFie6MakXN7ng93v8HBRpKI8He8=", 10 | "h1:v6DE95Ll2mxE96IGUsT/h6WQTU1d2cfHydWah1FgT20=", 11 | "zh:1d2b7693a102da015a86b9235b554272b9280597011216c3ddd1a6dc95ad8dab", 12 | "zh:28c3e8ebaa077f65c4ac5fd051c95887070293fcff0386dfc2e4b7e248a0aefa", 13 | "zh:2a620bc4a87be06e7acac1bc15e966dba45df643bf6c3efb811e74e6c2122b03", 14 | "zh:30d3ac148fa0634e7ba1de66e1af1328481c92cd774adcfc0e27f828103b17e0", 15 | "zh:3d3eebf916f25e11b12dd3c692f8fe1e4c4e9a0c414af9d0d881ddebd28dcd39", 16 | "zh:3f4600f2881c02fcc69080df68747c9a0b9b11cb002117fd918b7800f2ac402b", 17 | "zh:7156fb12c3b4f2964f7e78cee97f31d95b43045467f90749d2ed545725c36baa", 18 | "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", 19 | "zh:a5bbc84fd37d468c7b016009776b6d2a287bbb746af81aba786cdf8eb5fce4a1", 20 | "zh:d5322bcd4e11caddbbfaa1198893824d4b4d28f504517a3a87902cf86d75bd87", 21 | "zh:d766eb9f86a40060d63e12ef674d7c9c47ec4e47ade487f1f49af8c89b441711", 22 | "zh:df23f592b99f6617f09e449009bbb49068a69fc926b15ca29e30b068c9c67365", 23 | "zh:e7b0acee2d98549731547259b539f598e18db07c0c202d3a34b922beff711054", 24 | "zh:ec317f79fdcce934c39458ea312862e7f7ec48cafb8bcc9b5a00d9b78b629d81", 25 | "zh:f78ec7a771867d96dfee96bf74523341ba42feeb64ce2f108b5bf2e7ebef0fef", 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /terraform/aws/environments/dev/remote-state/main.tf: -------------------------------------------------------------------------------- 1 | ../../../../s3-remote-state/main.tf -------------------------------------------------------------------------------- /terraform/aws/environments/dev/remote-state/terraform.tfvars: -------------------------------------------------------------------------------- 1 | owner = "dse" 2 | environment = "dev" 3 | project = "infra" 4 | region = "us-west-1" 5 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/airflow.tf: -------------------------------------------------------------------------------- 1 | resource "aws_iam_role" "mwaa" { 2 | name = "${local.prefix}-${var.region}-mwaa-execution-role" 3 | assume_role_policy = 
data.aws_iam_policy_document.assume.json 4 | } 5 | 6 | resource "aws_iam_policy" "mwaa" { 7 | name = "${local.prefix}-${var.region}-mwaa-execution-policy" 8 | policy = data.aws_iam_policy_document.mwaa.json 9 | } 10 | 11 | resource "aws_iam_role_policy_attachment" "mwaa_execution_role" { 12 | role = aws_iam_role.mwaa.name 13 | policy_arn = aws_iam_policy.mwaa.arn 14 | } 15 | 16 | resource "aws_iam_role_policy_attachment" "mwaa_batch_submit_role" { 17 | role = aws_iam_role.mwaa.name 18 | policy_arn = aws_iam_policy.batch_submit_policy.arn 19 | } 20 | 21 | locals { 22 | # Define the environment name as a `local` so we can refer to it in the 23 | # execution role policy without introducing a cycle. 24 | environment_name = "${local.prefix}-${var.region}-mwaa-environment" 25 | } 26 | 27 | data "aws_iam_policy_document" "assume" { 28 | version = "2012-10-17" 29 | statement { 30 | effect = "Allow" 31 | principals { 32 | identifiers = [ 33 | "airflow-env.amazonaws.com", 34 | "airflow.amazonaws.com" 35 | ] 36 | type = "Service" 37 | } 38 | actions = [ 39 | "sts:AssumeRole" 40 | ] 41 | } 42 | } 43 | 44 | data "aws_iam_policy_document" "mwaa" { 45 | version = "2012-10-17" 46 | statement { 47 | effect = "Allow" 48 | actions = [ 49 | "airflow:PublishMetrics" 50 | ] 51 | resources = [ 52 | "arn:aws:airflow:${var.region}:${data.aws_caller_identity.current.account_id}:environment/${local.environment_name}" 53 | ] 54 | } 55 | statement { 56 | effect = "Deny" 57 | actions = ["s3:ListAllMyBuckets"] 58 | resources = [ 59 | aws_s3_bucket.mwaa.arn, 60 | "${aws_s3_bucket.mwaa.arn}/*", 61 | ] 62 | } 63 | statement { 64 | effect = "Allow" 65 | actions = [ 66 | "s3:GetObject*", 67 | "s3:GetBucket*", 68 | "s3:List*" 69 | ] 70 | resources = [ 71 | aws_s3_bucket.mwaa.arn, 72 | "${aws_s3_bucket.mwaa.arn}/*", 73 | ] 74 | } 75 | statement { 76 | effect = "Allow" 77 | actions = [ 78 | "s3:GetAccountPublicAccessBlock" 79 | ] 80 | resources = ["*"] 81 | } 82 | statement { 83 | effect = "Allow" 84 | actions = [ 85 | "logs:CreateLogStream", 86 | "logs:CreateLogGroup", 87 | "logs:PutLogEvents", 88 | "logs:GetLogEvents", 89 | "logs:GetLogRecord", 90 | "logs:GetLogGroupFields", 91 | "logs:GetQueryResults" 92 | ] 93 | resources = [ 94 | "arn:aws:logs:${var.region}:${data.aws_caller_identity.current.account_id}:log-group:airflow-${local.environment_name}-*" 95 | ] 96 | } 97 | statement { 98 | effect = "Allow" 99 | actions = [ 100 | "logs:DescribeLogGroups" 101 | ] 102 | resources = [ 103 | "*" 104 | ] 105 | } 106 | statement { 107 | 108 | effect = "Allow" 109 | actions = [ 110 | "cloudwatch:PutMetricData" 111 | ] 112 | resources = [ 113 | "*" 114 | ] 115 | } 116 | statement { 117 | effect = "Allow" 118 | actions = [ 119 | "sqs:ChangeMessageVisibility", 120 | "sqs:DeleteMessage", 121 | "sqs:GetQueueAttributes", 122 | "sqs:GetQueueUrl", 123 | "sqs:ReceiveMessage", 124 | "sqs:SendMessage" 125 | ] 126 | resources = [ 127 | "arn:aws:sqs:${var.region}:*:airflow-celery-*" 128 | ] 129 | } 130 | statement { 131 | effect = "Allow" 132 | actions = [ 133 | "kms:Decrypt", 134 | "kms:DescribeKey", 135 | "kms:GenerateDataKey*", 136 | "kms:Encrypt" 137 | ] 138 | resources = [] 139 | not_resources = ["arn:aws:kms:*:${data.aws_caller_identity.current.account_id}:key/*"] 140 | condition { 141 | test = "StringLike" 142 | values = [ 143 | "sqs.${var.region}.amazonaws.com" 144 | ] 145 | variable = "kms:ViaService" 146 | } 147 | } 148 | } 149 | 150 | resource "aws_mwaa_environment" "this" { 151 | execution_role_arn = aws_iam_role.mwaa.arn 152 | name = 
local.environment_name 153 | schedulers = 2 154 | max_workers = 5 155 | min_workers = 1 156 | airflow_version = "2.7.2" 157 | 158 | airflow_configuration_options = { 159 | "custom.scratch_bucket" = aws_s3_bucket.scratch.id 160 | "custom.default_job_queue" = aws_batch_job_queue.default.name 161 | # Note: default job definition to the "latest", rather than the "test" environment. 162 | "custom.default_job_definition" = aws_batch_job_definition.default["latest"].name 163 | } 164 | 165 | logging_configuration { 166 | dag_processing_logs { 167 | enabled = true 168 | log_level = "INFO" 169 | } 170 | 171 | scheduler_logs { 172 | enabled = true 173 | log_level = "INFO" 174 | } 175 | 176 | task_logs { 177 | enabled = true 178 | log_level = "INFO" 179 | } 180 | 181 | webserver_logs { 182 | enabled = true 183 | log_level = "INFO" 184 | } 185 | 186 | worker_logs { 187 | enabled = true 188 | log_level = "INFO" 189 | } 190 | } 191 | 192 | 193 | source_bucket_arn = aws_s3_bucket.mwaa.arn 194 | dag_s3_path = "dags/" 195 | requirements_s3_path = "requirements.txt" 196 | 197 | network_configuration { 198 | security_group_ids = [aws_security_group.mwaa.id] 199 | subnet_ids = aws_subnet.private[*].id 200 | } 201 | webserver_access_mode = "PUBLIC_ONLY" 202 | depends_on = [aws_iam_role_policy_attachment.mwaa_execution_role] 203 | } 204 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/batch.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # AWS Batch # 3 | ################################## 4 | 5 | data "aws_iam_policy_document" "aws_batch_service_policy" { 6 | statement { 7 | actions = [ 8 | "sts:AssumeRole" 9 | ] 10 | effect = "Allow" 11 | principals { 12 | type = "Service" 13 | identifiers = ["batch.amazonaws.com"] 14 | } 15 | } 16 | } 17 | 18 | 19 | resource "aws_iam_role" "aws_batch_service_role" { 20 | name = "${local.prefix}-${var.region}-batch-service-role" 21 | assume_role_policy = data.aws_iam_policy_document.aws_batch_service_policy.json 22 | } 23 | 24 | resource "aws_iam_role_policy_attachment" "aws_batch_service_role" { 25 | role = aws_iam_role.aws_batch_service_role.name 26 | policy_arn = "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole" 27 | } 28 | 29 | resource "aws_batch_compute_environment" "default" { 30 | compute_environment_name = "${local.prefix}-${var.region}-default" 31 | 32 | compute_resources { 33 | max_vcpus = 16 34 | 35 | security_group_ids = [ 36 | aws_security_group.batch.id 37 | ] 38 | 39 | subnets = aws_subnet.public[*].id 40 | 41 | type = "FARGATE" 42 | } 43 | 44 | service_role = aws_iam_role.aws_batch_service_role.arn 45 | type = "MANAGED" 46 | depends_on = [aws_iam_role_policy_attachment.aws_batch_service_role] 47 | } 48 | 49 | resource "aws_iam_role" "ecs_task_execution_role" { 50 | name = "${local.prefix}-${var.region}-batch-exec-role" 51 | assume_role_policy = data.aws_iam_policy_document.assume_role_policy.json 52 | } 53 | 54 | data "aws_iam_policy_document" "assume_role_policy" { 55 | statement { 56 | actions = ["sts:AssumeRole"] 57 | 58 | principals { 59 | type = "Service" 60 | identifiers = ["ecs-tasks.amazonaws.com"] 61 | } 62 | } 63 | } 64 | 65 | resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_policy" { 66 | role = aws_iam_role.ecs_task_execution_role.name 67 | policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" 68 | } 69 | 70 | resource "aws_iam_role_policy_attachment" 
"ecs_task_execution_access_snowflake_loader" { 71 | for_each = toset(local.jobs) 72 | role = aws_iam_role.ecs_task_execution_role.name 73 | policy_arn = aws_iam_policy.access_snowflake_loader[each.key].arn 74 | } 75 | 76 | resource "aws_iam_role" "batch_job_role" { 77 | name = "${local.prefix}-${var.region}-batch-job-role" 78 | description = "Role for AWS batch jobs" 79 | assume_role_policy = data.aws_iam_policy_document.assume_role_policy.json 80 | } 81 | 82 | resource "aws_iam_role_policy_attachment" "s3_scratch_policy_role_attachment" { 83 | role = aws_iam_role.batch_job_role.name 84 | policy_arn = aws_iam_policy.s3_scratch_policy.arn 85 | } 86 | 87 | resource "aws_iam_role_policy_attachment" "dof_demographics_read_write_access" { 88 | role = aws_iam_role.batch_job_role.name 89 | policy_arn = aws_iam_policy.dof_demographics_read_write_access.arn 90 | } 91 | 92 | resource "aws_batch_job_queue" "default" { 93 | name = "${local.prefix}-${var.region}-default" 94 | state = "ENABLED" 95 | priority = 1 96 | compute_environments = [ 97 | aws_batch_compute_environment.default.arn, 98 | ] 99 | } 100 | 101 | resource "aws_batch_job_definition" "default" { 102 | for_each = toset(local.jobs) 103 | name = "${local.prefix}-${var.region}-${each.key}" 104 | type = "container" 105 | platform_capabilities = ["FARGATE"] 106 | 107 | container_properties = jsonencode({ 108 | command = ["echo", "$SNOWFLAKE_USER", "$SNOWFLAKE_ROLE"] 109 | image = "${aws_ecr_repository.default.repository_url}:${each.key}" 110 | fargatePlatformConfiguration = { 111 | platformVersion = "LATEST" 112 | } 113 | resourceRequirements = [ 114 | { type = "VCPU", value = "0.25" }, 115 | { type = "MEMORY", value = "512" } 116 | ] 117 | # TODO: Figure out how to properly pass in a private key rather than a password. 118 | # Ran into some issues with properly encoding it as an environment variable. 119 | secrets = [ 120 | for s in local.snowflake_data : { 121 | name = "SNOWFLAKE_${upper(s)}", 122 | valueFrom = data.aws_secretsmanager_secret.snowflake_loader_secret[each.key].arn != null ? 
"${data.aws_secretsmanager_secret.snowflake_loader_secret[each.key].arn}:${s}::" : "" 123 | } 124 | ] 125 | networkConfiguration = { 126 | assignPublicIp : "ENABLED" 127 | }, 128 | executionRoleArn = aws_iam_role.ecs_task_execution_role.arn 129 | jobRoleArn = aws_iam_role.batch_job_role.arn 130 | }) 131 | } 132 | 133 | data "aws_iam_policy_document" "batch_submit_policy_document" { 134 | statement { 135 | actions = [ 136 | "batch:SubmitJob", 137 | "batch:CancelJob", 138 | "batch:ListJobs", 139 | ] 140 | resources = [ 141 | "arn:aws:batch:${var.region}:${data.aws_caller_identity.current.account_id}:job-definition/${local.prefix}*", 142 | aws_batch_job_queue.default.arn, 143 | ] 144 | } 145 | statement { 146 | actions = [ 147 | "batch:DescribeJobs", 148 | ] 149 | resources = ["*"] 150 | } 151 | } 152 | 153 | resource "aws_iam_policy" "batch_submit_policy" { 154 | name = "${local.prefix}-${var.region}-batch-submit-policy" 155 | description = "Policy allowing to submit batch jobs for ${local.prefix}" 156 | policy = data.aws_iam_policy_document.batch_submit_policy_document.json 157 | } 158 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/ecr.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # Container registry # 3 | ################################## 4 | 5 | resource "aws_ecr_repository" "default" { 6 | name = "${local.prefix}-${var.region}-default" 7 | image_tag_mutability = "MUTABLE" 8 | 9 | image_scanning_configuration { 10 | scan_on_push = true 11 | } 12 | } 13 | 14 | data "aws_iam_policy_document" "default_ecr_policy_document" { 15 | # Policy from https://github.com/aws-actions/amazon-ecr-login#permissions 16 | statement { 17 | actions = [ 18 | "ecr:BatchGetImage", 19 | "ecr:BatchCheckLayerAvailability", 20 | "ecr:CompleteLayerUpload", 21 | "ecr:GetDownloadUrlForLayer", 22 | "ecr:InitiateLayerUpload", 23 | "ecr:PutImage", 24 | "ecr:UploadLayerPart", 25 | ] 26 | resources = [aws_ecr_repository.default.arn] 27 | } 28 | statement { 29 | actions = [ 30 | "ecr:GetAuthorizationToken" 31 | ] 32 | # Why does this need *? 
https://github.com/aws-actions/amazon-ecr-login#ecr-private 33 | resources = ["*"] 34 | } 35 | } 36 | 37 | resource "aws_iam_policy" "default_ecr_policy" { 38 | name = "${local.prefix}-${var.region}-default-ecr-push-policy" 39 | description = "Policy allowing pushing to the default ecr repository for ${local.prefix}" 40 | policy = data.aws_iam_policy_document.default_ecr_policy_document.json 41 | } 42 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/iam.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # IAM Policies # 3 | ################################## 4 | 5 | # Adapted from https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_examples_aws_my-sec-creds-self-manage.html 6 | data "aws_iam_policy_document" "self_manage_credentials" { 7 | statement { 8 | sid = "AllowViewAccountInfo" 9 | effect = "Allow" 10 | actions = [ 11 | "iam:GetAccountPasswordPolicy", 12 | "iam:GetAccountSummary", 13 | "iam:ListVirtualMFADevices", 14 | ] 15 | resources = ["*"] 16 | } 17 | statement { 18 | sid = "AllowManageOwnPasswords" 19 | effect = "Allow" 20 | actions = [ 21 | "iam:ChangePassword", 22 | "iam:GetUser", 23 | "iam:CreateLoginProfile", 24 | "iam:DeleteLoginProfile", 25 | "iam:GetLoginProfile", 26 | "iam:UpdateLoginProfile", 27 | ] 28 | resources = ["arn:aws:iam::*:user/$${aws:username}"] 29 | } 30 | statement { 31 | sid = "AllowManageOwnAccessKeys" 32 | effect = "Allow" 33 | actions = [ 34 | "iam:CreateAccessKey", 35 | "iam:DeleteAccessKey", 36 | "iam:ListAccessKeys", 37 | "iam:UpdateAccessKey", 38 | ] 39 | resources = ["arn:aws:iam::*:user/$${aws:username}"] 40 | } 41 | statement { 42 | sid = "AllowManageOwnVirtualMFADevice" 43 | effect = "Allow" 44 | actions = [ 45 | "iam:CreateVirtualMFADevice" 46 | ] 47 | resources = ["arn:aws:iam::*:mfa/*"] 48 | } 49 | statement { 50 | sid = "AllowManageOwnUserMFA" 51 | effect = "Allow" 52 | actions = [ 53 | "iam:DeactivateMFADevice", 54 | "iam:EnableMFADevice", 55 | "iam:ListMFADevices", 56 | "iam:ResyncMFADevice" 57 | ] 58 | resources = ["arn:aws:iam::*:user/$${aws:username}"] 59 | } 60 | } 61 | 62 | resource "aws_iam_policy" "self_manage_credentials" { 63 | name = "${local.prefix}-self-manage-credentials-policy" 64 | description = "Allow a user to manage their own credentials" 65 | policy = data.aws_iam_policy_document.self_manage_credentials.json 66 | } 67 | 68 | data "aws_iam_policy_document" "access_snowflake_loader" { 69 | for_each = toset(local.jobs) 70 | statement { 71 | actions = ["secretsmanager:GetSecretValue"] 72 | resources = [ 73 | data.aws_secretsmanager_secret.snowflake_loader_secret[each.key].arn 74 | ] 75 | } 76 | } 77 | 78 | resource "aws_iam_policy" "access_snowflake_loader" { 79 | for_each = toset(local.jobs) 80 | name = "${local.prefix}-access-snowflake-loader-${each.key}" 81 | description = "Allow a user/role to access Snowflake loader role in SecretsManager for the ${each.key} secret" 82 | policy = data.aws_iam_policy_document.access_snowflake_loader[each.key].json 83 | } 84 | 85 | ################################## 86 | # IAM Service Users # 87 | ################################## 88 | 89 | # NOTE: in general, policies and roles are defined close to the resources 90 | # they support. 
91 | 92 | # CD bot for GitHub actions 93 | resource "aws_iam_user" "cd_bot" { 94 | name = "${local.prefix}-cd-bot" 95 | } 96 | 97 | resource "aws_iam_user_policy_attachment" "ecr_cd_bot_policy_attachment" { 98 | user = aws_iam_user.cd_bot.name 99 | policy_arn = aws_iam_policy.default_ecr_policy.arn 100 | } 101 | 102 | resource "aws_iam_user_policy_attachment" "batch_cd_bot_policy_attachment" { 103 | user = aws_iam_user.cd_bot.name 104 | policy_arn = aws_iam_policy.batch_submit_policy.arn 105 | } 106 | 107 | ################################## 108 | # IAM Human Users # 109 | ################################## 110 | 111 | resource "aws_iam_user" "arman" { 112 | name = "ArmanMadani" 113 | } 114 | 115 | resource "aws_iam_user" "esa" { 116 | name = "EsaEslami" 117 | } 118 | 119 | resource "aws_iam_user" "kim" { 120 | name = "KimHicks" 121 | } 122 | 123 | resource "aws_iam_user" "monica" { 124 | name = "MonicaBobra" 125 | } 126 | 127 | resource "aws_iam_user" "rocio" { 128 | name = "RocioMora" 129 | } 130 | 131 | ################################## 132 | # IAM User Groups # 133 | ################################## 134 | 135 | resource "aws_iam_group" "aae" { 136 | name = "odi-advanced-analytics-${var.environment}" 137 | } 138 | 139 | resource "aws_iam_group_policy_attachment" "aae_dsa_project" { 140 | for_each = toset(local.dsa_projects) 141 | group = aws_iam_group.aae.name 142 | policy_arn = aws_iam_policy.s3_dsa_project_policy[each.key].arn 143 | } 144 | 145 | resource "aws_iam_group_policy_attachment" "aae_list_all_my_buckets" { 146 | group = aws_iam_group.aae.name 147 | policy_arn = aws_iam_policy.s3_list_all_my_buckets.arn 148 | } 149 | 150 | resource "aws_iam_group_policy_attachment" "aae_self_manage_creentials" { 151 | group = aws_iam_group.aae.name 152 | policy_arn = aws_iam_policy.self_manage_credentials.arn 153 | } 154 | 155 | resource "aws_iam_group_membership" "aae" { 156 | name = "${aws_iam_group.aae.name}-membership" 157 | group = aws_iam_group.aae.name 158 | users = [ 159 | aws_iam_user.arman.name, 160 | aws_iam_user.esa.name, 161 | aws_iam_user.kim.name, 162 | aws_iam_user.monica.name, 163 | aws_iam_user.rocio.name, 164 | ] 165 | } 166 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/main.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # Terraform Setup # 3 | ################################## 4 | 5 | terraform { 6 | # Note: when a package is added or updated, we have to update the lockfile in a 7 | # platform-independent way, cf. 
https://github.com/hashicorp/terraform/issues/28041 8 | # To update the lockfile run: 9 | # 10 | # terraform providers lock -platform=linux_amd64 -platform=darwin_amd64 11 | required_providers { 12 | aws = { 13 | source = "hashicorp/aws" 14 | version = "4.56.0" 15 | } 16 | random = { 17 | source = "hashicorp/random" 18 | version = "3.4.3" 19 | } 20 | } 21 | required_version = ">= 1.0" 22 | } 23 | 24 | data "aws_caller_identity" "current" {} 25 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/network.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # Networking # 3 | ################################## 4 | 5 | data "aws_availability_zones" "available" { 6 | state = "available" 7 | } 8 | 9 | resource "aws_vpc" "this" { 10 | cidr_block = "10.0.0.0/16" 11 | 12 | tags = { 13 | Name = "${local.prefix}-main" 14 | } 15 | } 16 | 17 | resource "aws_security_group" "batch" { 18 | name = "${local.prefix}-batch-sg" 19 | description = "Allow ECS tasks to reach out to internet" 20 | vpc_id = aws_vpc.this.id 21 | 22 | egress { 23 | description = "Allow ECS tasks to talk to the internet" 24 | from_port = 0 25 | to_port = 0 26 | protocol = "-1" 27 | cidr_blocks = ["0.0.0.0/0"] 28 | } 29 | 30 | tags = { 31 | Name = "${local.prefix}-batch-sg" 32 | } 33 | } 34 | 35 | resource "aws_security_group" "mwaa" { 36 | vpc_id = aws_vpc.this.id 37 | name = "${local.prefix}-mwaa-no-ingress-sg" 38 | ingress { 39 | from_port = 0 40 | to_port = 0 41 | protocol = "-1" 42 | self = true 43 | } 44 | egress { 45 | from_port = 0 46 | to_port = 0 47 | protocol = "-1" 48 | cidr_blocks = [ 49 | "0.0.0.0/0" 50 | ] 51 | } 52 | 53 | tags = { 54 | Name = "${local.prefix}-mwaa-no-ingress-sg" 55 | } 56 | } 57 | 58 | resource "aws_internet_gateway" "this" { 59 | vpc_id = aws_vpc.this.id 60 | 61 | tags = { 62 | Name = "${local.prefix}-main" 63 | } 64 | } 65 | 66 | resource "aws_route_table" "public" { 67 | vpc_id = aws_vpc.this.id 68 | route { 69 | cidr_block = "0.0.0.0/0" 70 | gateway_id = aws_internet_gateway.this.id 71 | } 72 | 73 | tags = { 74 | Name = "${local.prefix}-public" 75 | } 76 | } 77 | 78 | resource "random_id" "public_subnet" { 79 | count = 2 80 | byte_length = 3 81 | } 82 | 83 | resource "random_id" "private_subnet" { 84 | count = 2 85 | byte_length = 3 86 | } 87 | 88 | resource "aws_eip" "this" { 89 | count = 2 90 | vpc = true 91 | 92 | tags = { 93 | Name = "${local.prefix}-${data.aws_availability_zones.available.names[count.index]}-nat-${random_id.private_subnet[count.index].hex}" 94 | } 95 | } 96 | 97 | resource "aws_nat_gateway" "this" { 98 | count = length(aws_subnet.public) 99 | allocation_id = aws_eip.this[count.index].id 100 | subnet_id = aws_subnet.public[count.index].id 101 | tags = { 102 | Name = "${local.prefix}-${data.aws_availability_zones.available.names[count.index]}-nat-${random_id.private_subnet[count.index].hex}" 103 | } 104 | } 105 | 106 | resource "aws_route_table" "private" { 107 | count = length(aws_nat_gateway.this) 108 | vpc_id = aws_vpc.this.id 109 | route { 110 | cidr_block = "0.0.0.0/0" 111 | nat_gateway_id = aws_nat_gateway.this[count.index].id 112 | } 113 | tags = { 114 | Name = "${local.prefix}-${data.aws_availability_zones.available.names[count.index]}-nat-${random_id.private_subnet[count.index].hex}" 115 | } 116 | } 117 | 118 | resource "aws_subnet" "public" { 119 | count = 2 120 | vpc_id = aws_vpc.this.id 121 | cidr_block = "10.0.${count.index}.0/24" 
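# Each subnet carves one /24 out of the VPC's 10.0.0.0/16: the two public subnets
# get 10.0.0.0/24 and 10.0.1.0/24, and the private subnets defined below are offset
# by the number of public subnets, giving 10.0.2.0/24 and 10.0.3.0/24.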
122 | availability_zone = data.aws_availability_zones.available.names[count.index] 123 | map_public_ip_on_launch = true 124 | 125 | tags = { 126 | Name = "${local.prefix}-${data.aws_availability_zones.available.names[count.index]}-public-${random_id.public_subnet[count.index].hex}" 127 | } 128 | } 129 | 130 | resource "aws_route_table_association" "public" { 131 | count = length(aws_subnet.public) 132 | subnet_id = aws_subnet.public[count.index].id 133 | route_table_id = aws_route_table.public.id 134 | } 135 | 136 | resource "aws_subnet" "private" { 137 | count = 2 138 | vpc_id = aws_vpc.this.id 139 | cidr_block = "10.0.${count.index + length(aws_subnet.public)}.0/24" 140 | availability_zone = data.aws_availability_zones.available.names[count.index] 141 | map_public_ip_on_launch = false 142 | 143 | tags = { 144 | Name = "${local.prefix}-${data.aws_availability_zones.available.names[count.index]}-private-${random_id.private_subnet[count.index].hex}" 145 | } 146 | } 147 | 148 | resource "aws_route_table_association" "private" { 149 | count = length(aws_subnet.private) 150 | route_table_id = aws_route_table.private[count.index].id 151 | subnet_id = aws_subnet.private[count.index].id 152 | } 153 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/outputs.tf: -------------------------------------------------------------------------------- 1 | output "state" { 2 | description = "Resources from terraform-state" 3 | value = { 4 | repository_url = aws_ecr_repository.default.repository_url 5 | scratch_bucket = aws_s3_bucket.scratch.id 6 | mwaa_bucket = aws_s3_bucket.mwaa.id 7 | github_actions_bot = aws_iam_user.cd_bot.name 8 | batch_job_queue = aws_batch_job_queue.default.name 9 | batch_job_definitions = { 10 | test = aws_batch_job_definition.default["test"].name 11 | latest = aws_batch_job_definition.default["latest"].name 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/secrets.tf: -------------------------------------------------------------------------------- 1 | ################################## 2 | # AWS Secrets Manager # 3 | ################################## 4 | 5 | locals { 6 | snowflake_data = ["account", "user", "database", "warehouse", "role", "password"] 7 | 8 | jobs = ["test", "latest"] 9 | } 10 | 11 | data "aws_secretsmanager_secret" "snowflake_loader_secret" { 12 | for_each = toset(local.jobs) 13 | name = var.snowflake_loader_secret[each.key] 14 | } 15 | -------------------------------------------------------------------------------- /terraform/aws/modules/infra/variables.tf: -------------------------------------------------------------------------------- 1 | variable "owner" { 2 | description = "Owner of the resource" 3 | type = string 4 | default = "dse" 5 | } 6 | 7 | variable "project" { 8 | description = "Name of the project the resource is serving" 9 | type = string 10 | default = "infra" 11 | } 12 | 13 | variable "environment" { 14 | description = "Deployment environment of the resource" 15 | type = string 16 | default = "dev" 17 | } 18 | 19 | variable "region" { 20 | description = "Region for AWS resources" 21 | type = string 22 | default = "us-west-2" 23 | } 24 | 25 | variable "snowflake_loader_secret" { 26 | description = "ARN for SecretsManager login info to Snowflake with loader role" 27 | type = object({ test = string, latest = string }) 28 | default = null 29 | } 30 | 31 | locals { 32 | prefix = 
"${var.owner}-${var.project}-${var.environment}" 33 | } 34 | -------------------------------------------------------------------------------- /terraform/s3-remote-state/README.md: -------------------------------------------------------------------------------- 1 | # Terraform S3 remote state 2 | 3 | This Terraform module is intended to bootstrap remote state for the main terraform 4 | project in the parent directory. 5 | 6 | It uses the [S3 backend](https://developer.hashicorp.com/terraform/language/settings/backends/s3), 7 | which stores the state in an S3 bucket and uses a DynamoDB table for locking. 8 | 9 | Ideally, this should only need to be set up once per project: 10 | 11 | ```bash 12 | terraform apply 13 | ``` 14 | 15 | 16 | ## Requirements 17 | 18 | | Name | Version | 19 | |------|---------| 20 | | [terraform](#requirement\_terraform) | >= 1.0 | 21 | | [aws](#requirement\_aws) | 4.56.0 | 22 | 23 | ## Providers 24 | 25 | | Name | Version | 26 | |------|---------| 27 | | [aws](#provider\_aws) | 4.56.0 | 28 | 29 | ## Modules 30 | 31 | No modules. 32 | 33 | ## Resources 34 | 35 | | Name | Type | 36 | |------|------| 37 | | [aws_dynamodb_table.terraform_state_lock](https://registry.terraform.io/providers/hashicorp/aws/4.56.0/docs/resources/dynamodb_table) | resource | 38 | | [aws_s3_bucket.terraform_state](https://registry.terraform.io/providers/hashicorp/aws/4.56.0/docs/resources/s3_bucket) | resource | 39 | | [aws_s3_bucket_versioning.terraform_state](https://registry.terraform.io/providers/hashicorp/aws/4.56.0/docs/resources/s3_bucket_versioning) | resource | 40 | 41 | ## Inputs 42 | 43 | | Name | Description | Type | Default | Required | 44 | |------|-------------|------|---------|:--------:| 45 | | [environment](#input\_environment) | Deployment environment of the resource | `string` | n/a | yes | 46 | | [owner](#input\_owner) | Owner of the resource | `string` | `"dse"` | no | 47 | | [project](#input\_project) | Name of the project the resource is serving | `string` | n/a | yes | 48 | | [region](#input\_region) | AWS Region | `string` | `"us-west-2"` | no | 49 | 50 | ## Outputs 51 | 52 | | Name | Description | 53 | |------|-------------| 54 | | [bucket](#output\_bucket) | State bucket | 55 | | [dynamodb\_table](#output\_dynamodb\_table) | State lock | 56 | | [key](#output\_key) | State object key | 57 | | [region](#output\_region) | AWS Region | 58 | 59 | -------------------------------------------------------------------------------- /terraform/s3-remote-state/main.tf: -------------------------------------------------------------------------------- 1 | ################################ 2 | # Variables # 3 | ################################ 4 | 5 | variable "owner" { 6 | description = "Owner of the resource" 7 | type = string 8 | default = "dse" 9 | } 10 | 11 | variable "project" { 12 | description = "Name of the project the resource is serving" 13 | type = string 14 | } 15 | 16 | variable "environment" { 17 | description = "Deployment environment of the resource" 18 | type = string 19 | } 20 | 21 | variable "region" { 22 | description = "AWS Region" 23 | type = string 24 | default = "us-west-2" 25 | } 26 | 27 | locals { 28 | prefix = "${var.owner}-${var.project}-${var.environment}" 29 | } 30 | 31 | ################################ 32 | # Terraform setup # 33 | ################################ 34 | 35 | terraform { 36 | required_providers { 37 | aws = { 38 | source = "hashicorp/aws" 39 | version = "4.56.0" 40 | } 41 | } 42 | required_version = ">= 1.0" 43 | } 44 | 45 | provider 
"aws" { 46 | region = var.region 47 | 48 | default_tags { 49 | tags = { 50 | Owner = var.owner 51 | Project = var.project 52 | Environment = var.environment 53 | } 54 | } 55 | } 56 | 57 | 58 | ################################ 59 | # State backend # 60 | ################################ 61 | 62 | resource "aws_s3_bucket" "terraform_state" { 63 | bucket = "${local.prefix}-terraform-state" 64 | 65 | lifecycle { 66 | prevent_destroy = true 67 | } 68 | } 69 | 70 | resource "aws_s3_bucket_versioning" "terraform_state" { 71 | bucket = aws_s3_bucket.terraform_state.id 72 | 73 | versioning_configuration { 74 | status = "Enabled" 75 | } 76 | } 77 | 78 | resource "aws_dynamodb_table" "terraform_state_lock" { 79 | name = "${local.prefix}-terraform-state-lock" 80 | read_capacity = 1 81 | write_capacity = 1 82 | hash_key = "LockID" 83 | 84 | attribute { 85 | name = "LockID" 86 | type = "S" 87 | } 88 | } 89 | 90 | ################################ 91 | # Outputs for a tfbackend file # 92 | ################################ 93 | 94 | output "bucket" { 95 | description = "State bucket" 96 | value = aws_s3_bucket.terraform_state.bucket 97 | } 98 | 99 | output "key" { 100 | description = "State object key" 101 | value = "${local.prefix}.tfstate" 102 | } 103 | 104 | output "region" { 105 | description = "AWS Region" 106 | value = var.region 107 | } 108 | 109 | output "dynamodb_table" { 110 | description = "State lock" 111 | value = aws_dynamodb_table.terraform_state_lock.name 112 | } 113 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/snowflake-labs/snowflake" { 5 | version = "1.0.1" 6 | constraints = "~> 1.0, 1.0.1" 7 | hashes = [ 8 | "h1:KbiPdzWifjw7jNSiQlIT4b8TyDTfMOhRdYdFcOvt9lA=", 9 | "h1:RW6Tbf/g9fmY/WOQY6WktxQ9TywBpJi9Lw5O1BqnCs4=", 10 | "h1:glVuLBCPg23s0K4Vzjwy+CBLXCMRhKZHuF/6yxIof+I=", 11 | "zh:1a8c1c8d7003943d0c8ab492ec2d352f3552ae1e5be6ae2ced16da95b9859769", 12 | "zh:2bc7c58adbc504f6aa61774a7bbf99bdfbf7bbf691182d01518146bb28c8e2fa", 13 | "zh:30482878d46ac18624daf6559b2ee294aa24c7bffff5bf2d2a2133072db4aa8a", 14 | "zh:3f1f1088375fde975993029be32955881ba71d84e24db20e69bb9d437305780f", 15 | "zh:42510e778b420295461179eb97f5c436edc157c8980c7b3c0db71eb08c063d49", 16 | "zh:475ee5e75e4b93e3e939cd5b2d803e1c3f31d22963bdc49a21d4536afa6eaf90", 17 | "zh:55918ef218513ea1e2b916893aa1272e327beeeb80b205efaffcdefbb2b52ba0", 18 | "zh:651c8526a9d4bd834fa623a74737bf485fc64e383a5e32d3531cf0fa146863a9", 19 | "zh:892f03d08fdff2746e1d2acd5bf520a764a07a00e177fe1fbb2521daccd62523", 20 | "zh:a8a999d555aae9d205b0c1c2432a94c37e8630bddb4357ccaf2e44911dede481", 21 | "zh:cba89d14632697d219e4f848ac206d16cc152c65b7740fb6c5c08ed98dd054ba", 22 | "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/dse-snowflake-dev.tfbackend: -------------------------------------------------------------------------------- 1 | bucket = "dse-snowflake-dev-terraform-state" 2 | dynamodb_table = "dse-snowflake-dev-terraform-state-lock" 3 | key = "dse-snowflake-dev.tfstate" 4 | region = "us-west-2" 5 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/main.tf: -------------------------------------------------------------------------------- 1 | ############################ 2 | # Variables # 3 | ############################ 4 | 5 | variable "environment" { 6 | description = "Environment suffix" 7 | type = string 8 | } 9 | 10 | variable "account_name" { 11 | description = "Snowflake account name" 12 | type = string 13 | } 14 | 15 | variable "organization_name" { 16 | description = "Snowflake account organization" 17 | type = string 18 | } 19 | 20 | ############################ 21 | # Providers # 22 | ############################ 23 | 24 | terraform { 25 | required_providers { 26 | snowflake = { 27 | source = "Snowflake-Labs/snowflake" 28 | version = "1.0.1" 29 | } 30 | } 31 | required_version = ">= 1.0" 32 | 33 | backend "s3" { 34 | } 35 | } 36 | 37 | # This provider is intentionally low-permission. In Snowflake, object creators are 38 | # the default owners of the object. To control the owner, we create different provider 39 | # blocks with different roles, and require that all snowflake resources explicitly 40 | # flag the role they want for the creator. 41 | provider "snowflake" { 42 | account_name = var.account_name 43 | organization_name = var.organization_name 44 | role = "PUBLIC" 45 | } 46 | 47 | # Snowflake provider for account administration (to be used only when necessary). 48 | provider "snowflake" { 49 | alias = "accountadmin" 50 | role = "ACCOUNTADMIN" 51 | account_name = var.account_name 52 | organization_name = var.organization_name 53 | } 54 | 55 | # Snowflake provider for creating databases, warehouses, etc. 
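# Illustrative sketch of how these aliases get used (the resource below is
# hypothetical and not defined in this configuration): a resource selects its
# owning role by referencing one of the aliased providers, e.g.
#
#   resource "snowflake_database" "example" {
#     provider = snowflake.sysadmin
#     name     = "EXAMPLE_DB"
#   }
#
# so the object is created by, and therefore owned by, SYSADMIN rather than PUBLIC.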
56 | provider "snowflake" { 57 | alias = "sysadmin" 58 | role = "SYSADMIN" 59 | account_name = var.account_name 60 | organization_name = var.organization_name 61 | } 62 | 63 | # Snowflake provider for managing grants to roles. 64 | provider "snowflake" { 65 | alias = "securityadmin" 66 | role = "SECURITYADMIN" 67 | account_name = var.account_name 68 | organization_name = var.organization_name 69 | } 70 | 71 | # Snowflake provider for managing user accounts and roles. 72 | provider "snowflake" { 73 | alias = "useradmin" 74 | role = "USERADMIN" 75 | account_name = var.account_name 76 | organization_name = var.organization_name 77 | } 78 | 79 | ############################ 80 | # Environment # 81 | ############################ 82 | 83 | module "elt" { 84 | source = "../../modules/elt" 85 | providers = { 86 | snowflake.accountadmin = snowflake.accountadmin, 87 | snowflake.securityadmin = snowflake.securityadmin, 88 | snowflake.sysadmin = snowflake.sysadmin, 89 | snowflake.useradmin = snowflake.useradmin, 90 | } 91 | 92 | environment = var.environment 93 | } 94 | 95 | ############################################################## 96 | # Assign LOGGER role to TRANSFORMER role 97 | # This is only needed for the ODI default Snowflake instance 98 | # More background information related to this is found 99 | # here - https://github.com/cagov/data-infrastructure/issues/428 100 | ############################################################## 101 | 102 | resource "snowflake_grant_account_role" "logger_to_transformer" { 103 | provider = snowflake.useradmin 104 | role_name = "LOGGER_${var.environment}" 105 | parent_role_name = "TRANSFORMER_${var.environment}" 106 | } 107 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/remote-state/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates.
3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "4.56.0" 6 | constraints = "4.56.0" 7 | hashes = [ 8 | "h1:CnpBvf3mH16Kcez47OsjmIeGkY2PUVihKRwbkyOvo48=", 9 | "h1:koDunHl/LUmCAKy3VFie6MakXN7ng93v8HBRpKI8He8=", 10 | "h1:v6DE95Ll2mxE96IGUsT/h6WQTU1d2cfHydWah1FgT20=", 11 | "zh:1d2b7693a102da015a86b9235b554272b9280597011216c3ddd1a6dc95ad8dab", 12 | "zh:28c3e8ebaa077f65c4ac5fd051c95887070293fcff0386dfc2e4b7e248a0aefa", 13 | "zh:2a620bc4a87be06e7acac1bc15e966dba45df643bf6c3efb811e74e6c2122b03", 14 | "zh:30d3ac148fa0634e7ba1de66e1af1328481c92cd774adcfc0e27f828103b17e0", 15 | "zh:3d3eebf916f25e11b12dd3c692f8fe1e4c4e9a0c414af9d0d881ddebd28dcd39", 16 | "zh:3f4600f2881c02fcc69080df68747c9a0b9b11cb002117fd918b7800f2ac402b", 17 | "zh:7156fb12c3b4f2964f7e78cee97f31d95b43045467f90749d2ed545725c36baa", 18 | "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", 19 | "zh:a5bbc84fd37d468c7b016009776b6d2a287bbb746af81aba786cdf8eb5fce4a1", 20 | "zh:d5322bcd4e11caddbbfaa1198893824d4b4d28f504517a3a87902cf86d75bd87", 21 | "zh:d766eb9f86a40060d63e12ef674d7c9c47ec4e47ade487f1f49af8c89b441711", 22 | "zh:df23f592b99f6617f09e449009bbb49068a69fc926b15ca29e30b068c9c67365", 23 | "zh:e7b0acee2d98549731547259b539f598e18db07c0c202d3a34b922beff711054", 24 | "zh:ec317f79fdcce934c39458ea312862e7f7ec48cafb8bcc9b5a00d9b78b629d81", 25 | "zh:f78ec7a771867d96dfee96bf74523341ba42feeb64ce2f108b5bf2e7ebef0fef", 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/remote-state/main.tf: -------------------------------------------------------------------------------- 1 | ../../../../s3-remote-state/main.tf -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/remote-state/terraform.tfvars: -------------------------------------------------------------------------------- 1 | owner = "dse" 2 | environment = "dev" 3 | project = "snowflake" 4 | region = "us-west-2" 5 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/dev/terraform.tfvars: -------------------------------------------------------------------------------- 1 | account_name = "HJB86910" 2 | environment = "DEV" 3 | organization_name= "VSB79059" 4 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/snowflake-labs/snowflake" { 5 | version = "1.0.1" 6 | constraints = "~> 1.0, 1.0.1" 7 | hashes = [ 8 | "h1:KbiPdzWifjw7jNSiQlIT4b8TyDTfMOhRdYdFcOvt9lA=", 9 | "h1:RW6Tbf/g9fmY/WOQY6WktxQ9TywBpJi9Lw5O1BqnCs4=", 10 | "h1:glVuLBCPg23s0K4Vzjwy+CBLXCMRhKZHuF/6yxIof+I=", 11 | "zh:1a8c1c8d7003943d0c8ab492ec2d352f3552ae1e5be6ae2ced16da95b9859769", 12 | "zh:2bc7c58adbc504f6aa61774a7bbf99bdfbf7bbf691182d01518146bb28c8e2fa", 13 | "zh:30482878d46ac18624daf6559b2ee294aa24c7bffff5bf2d2a2133072db4aa8a", 14 | "zh:3f1f1088375fde975993029be32955881ba71d84e24db20e69bb9d437305780f", 15 | "zh:42510e778b420295461179eb97f5c436edc157c8980c7b3c0db71eb08c063d49", 16 | "zh:475ee5e75e4b93e3e939cd5b2d803e1c3f31d22963bdc49a21d4536afa6eaf90", 17 | "zh:55918ef218513ea1e2b916893aa1272e327beeeb80b205efaffcdefbb2b52ba0", 18 | "zh:651c8526a9d4bd834fa623a74737bf485fc64e383a5e32d3531cf0fa146863a9", 19 | "zh:892f03d08fdff2746e1d2acd5bf520a764a07a00e177fe1fbb2521daccd62523", 20 | "zh:a8a999d555aae9d205b0c1c2432a94c37e8630bddb4357ccaf2e44911dede481", 21 | "zh:cba89d14632697d219e4f848ac206d16cc152c65b7740fb6c5c08ed98dd054ba", 22 | "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/dse-snowflake-prd.tfbackend: -------------------------------------------------------------------------------- 1 | bucket = "dse-snowflake-prd-terraform-state" 2 | dynamodb_table = "dse-snowflake-prd-terraform-state-lock" 3 | key = "dse-snowflake-prd.tfstate" 4 | region = "us-west-2" 5 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/remote-state/.terraform.lock.hcl: -------------------------------------------------------------------------------- 1 | # This file is maintained automatically by "terraform init". 2 | # Manual edits may be lost in future updates. 
3 | 4 | provider "registry.terraform.io/hashicorp/aws" { 5 | version = "4.56.0" 6 | constraints = "4.56.0" 7 | hashes = [ 8 | "h1:CnpBvf3mH16Kcez47OsjmIeGkY2PUVihKRwbkyOvo48=", 9 | "h1:koDunHl/LUmCAKy3VFie6MakXN7ng93v8HBRpKI8He8=", 10 | "h1:v6DE95Ll2mxE96IGUsT/h6WQTU1d2cfHydWah1FgT20=", 11 | "zh:1d2b7693a102da015a86b9235b554272b9280597011216c3ddd1a6dc95ad8dab", 12 | "zh:28c3e8ebaa077f65c4ac5fd051c95887070293fcff0386dfc2e4b7e248a0aefa", 13 | "zh:2a620bc4a87be06e7acac1bc15e966dba45df643bf6c3efb811e74e6c2122b03", 14 | "zh:30d3ac148fa0634e7ba1de66e1af1328481c92cd774adcfc0e27f828103b17e0", 15 | "zh:3d3eebf916f25e11b12dd3c692f8fe1e4c4e9a0c414af9d0d881ddebd28dcd39", 16 | "zh:3f4600f2881c02fcc69080df68747c9a0b9b11cb002117fd918b7800f2ac402b", 17 | "zh:7156fb12c3b4f2964f7e78cee97f31d95b43045467f90749d2ed545725c36baa", 18 | "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", 19 | "zh:a5bbc84fd37d468c7b016009776b6d2a287bbb746af81aba786cdf8eb5fce4a1", 20 | "zh:d5322bcd4e11caddbbfaa1198893824d4b4d28f504517a3a87902cf86d75bd87", 21 | "zh:d766eb9f86a40060d63e12ef674d7c9c47ec4e47ade487f1f49af8c89b441711", 22 | "zh:df23f592b99f6617f09e449009bbb49068a69fc926b15ca29e30b068c9c67365", 23 | "zh:e7b0acee2d98549731547259b539f598e18db07c0c202d3a34b922beff711054", 24 | "zh:ec317f79fdcce934c39458ea312862e7f7ec48cafb8bcc9b5a00d9b78b629d81", 25 | "zh:f78ec7a771867d96dfee96bf74523341ba42feeb64ce2f108b5bf2e7ebef0fef", 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/remote-state/main.tf: -------------------------------------------------------------------------------- 1 | ../../../../s3-remote-state/main.tf -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/remote-state/terraform.tfvars: -------------------------------------------------------------------------------- 1 | owner = "dse" 2 | environment = "prd" 3 | project = "snowflake" 4 | region = "us-west-2" 5 | -------------------------------------------------------------------------------- /terraform/snowflake/environments/prd/terraform.tfvars: -------------------------------------------------------------------------------- 1 | account_name = "HJB86910" 2 | environment = "PRD" 3 | organization_name= "VSB79059" 4 | okta_integration_name= "OKTAINTEGRATION" 5 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/database/outputs.tf: -------------------------------------------------------------------------------- 1 | output "name" { 2 | description = "Database name" 3 | value = snowflake_database.this.name 4 | } 5 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/database/variables.tf: -------------------------------------------------------------------------------- 1 | variable "name" { 2 | description = "Database name" 3 | type = string 4 | } 5 | 6 | variable "comment" { 7 | description = "Comment to apply to warehouse" 8 | type = string 9 | default = null 10 | } 11 | 12 | variable "data_retention_time_in_days" { 13 | description = "Data retention time in days" 14 | type = number 15 | default = 7 16 | } 17 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/elt/databases.tf: -------------------------------------------------------------------------------- 1 | ####################################### 2 | # Databases # 3 | 
####################################### 4 | 5 | # The primary database where transformation tools like dbt operate. 6 | module "transform" { 7 | source = "../database" 8 | providers = { 9 | snowflake.securityadmin = snowflake.securityadmin, 10 | snowflake.sysadmin = snowflake.sysadmin, 11 | snowflake.useradmin = snowflake.useradmin, 12 | } 13 | name = "TRANSFORM_${var.environment}" 14 | comment = "Transformation database" 15 | data_retention_time_in_days = 7 16 | } 17 | 18 | # The primary raw database, where ELT tools land data. 19 | module "raw" { 20 | source = "../database" 21 | providers = { 22 | snowflake.securityadmin = snowflake.securityadmin, 23 | snowflake.sysadmin = snowflake.sysadmin, 24 | snowflake.useradmin = snowflake.useradmin, 25 | } 26 | name = "RAW_${var.environment}" 27 | comment = "Raw database, intended for ingest of raw data from source systems prior to any modeling or transformation" 28 | data_retention_time_in_days = 7 29 | } 30 | 31 | # The primary reporting database. 32 | module "analytics" { 33 | source = "../database" 34 | providers = { 35 | snowflake.securityadmin = snowflake.securityadmin, 36 | snowflake.sysadmin = snowflake.sysadmin, 37 | snowflake.useradmin = snowflake.useradmin, 38 | } 39 | name = "ANALYTICS_${var.environment}" 40 | comment = "Analytics database for data consumers, holding analysis-ready data marts/models" 41 | data_retention_time_in_days = 7 42 | } 43 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/elt/main.tf: -------------------------------------------------------------------------------- 1 | ###################################### 2 | # Terraform # 3 | ###################################### 4 | 5 | terraform { 6 | required_providers { 7 | snowflake = { 8 | source = "Snowflake-Labs/snowflake" 9 | version = "~> 1.0" 10 | configuration_aliases = [ 11 | snowflake.accountadmin, 12 | snowflake.securityadmin, 13 | snowflake.sysadmin, 14 | snowflake.useradmin, 15 | ] 16 | } 17 | } 18 | required_version = ">= 1.0" 19 | } 20 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/elt/users.tf: -------------------------------------------------------------------------------- 1 | ###################################### 2 | # Service Accounts/Users # 3 | ###################################### 4 | 5 | resource "snowflake_service_user" "dbt" { 6 | provider = snowflake.useradmin 7 | name = "DBT_CLOUD_SVC_USER_${var.environment}" 8 | comment = "Service user for dbt Cloud" 9 | lifecycle { 10 | ignore_changes = [rsa_public_key] 11 | } 12 | 13 | default_warehouse = module.transforming["XS"].name 14 | default_role = snowflake_account_role.transformer.name 15 | } 16 | 17 | resource "snowflake_service_user" "airflow" { 18 | provider = snowflake.useradmin 19 | name = "MWAA_SVC_USER_${var.environment}" 20 | comment = "Service user for Airflow" 21 | lifecycle { 22 | ignore_changes = [rsa_public_key] 23 | } 24 | 25 | default_warehouse = module.loading["XS"].name 26 | default_role = snowflake_account_role.loader.name 27 | } 28 | 29 | resource "snowflake_service_user" "fivetran" { 30 | provider = snowflake.useradmin 31 | name = "FIVETRAN_SVC_USER_${var.environment}" 32 | comment = "Service user for Fivetran" 33 | lifecycle { 34 | ignore_changes = [rsa_public_key] 35 | } 36 | 37 | default_warehouse = module.loading["XS"].name 38 | default_role = snowflake_account_role.loader.name 39 | } 40 | 41 | resource "snowflake_service_user" "github_ci" { 42 | 
provider = snowflake.useradmin 43 | name = "GITHUB_ACTIONS_SVC_USER_${var.environment}" 44 | comment = "Service user for GitHub CI" 45 | lifecycle { 46 | ignore_changes = [rsa_public_key] 47 | } 48 | 49 | default_warehouse = module.reporting["XS"].name 50 | default_role = snowflake_account_role.reader.name 51 | } 52 | 53 | resource "snowflake_legacy_service_user" "sentinel" { 54 | provider = snowflake.useradmin 55 | name = "SENTINEL_SVC_USER_${var.environment}" 56 | comment = "Service user for Sentinel" 57 | lifecycle { 58 | ignore_changes = [rsa_public_key] 59 | } 60 | 61 | default_warehouse = module.logging.name 62 | default_role = snowflake_account_role.logger.name 63 | } 64 | 65 | ###################################### 66 | # Role Grants # 67 | ###################################### 68 | 69 | resource "snowflake_grant_account_role" "transformer_to_dbt" { 70 | provider = snowflake.useradmin 71 | role_name = snowflake_account_role.transformer.name 72 | user_name = snowflake_service_user.dbt.name 73 | } 74 | 75 | resource "snowflake_grant_account_role" "loader_to_airflow" { 76 | provider = snowflake.useradmin 77 | role_name = snowflake_account_role.loader.name 78 | user_name = snowflake_service_user.airflow.name 79 | } 80 | 81 | resource "snowflake_grant_account_role" "loader_to_fivetran" { 82 | provider = snowflake.useradmin 83 | role_name = snowflake_account_role.loader.name 84 | user_name = snowflake_service_user.fivetran.name 85 | } 86 | 87 | resource "snowflake_grant_account_role" "reader_to_github_ci" { 88 | provider = snowflake.useradmin 89 | role_name = snowflake_account_role.reader.name 90 | user_name = snowflake_service_user.github_ci.name 91 | } 92 | 93 | resource "snowflake_grant_account_role" "logger_to_sentinel" { 94 | provider = snowflake.useradmin 95 | role_name = snowflake_account_role.logger.name 96 | user_name = snowflake_legacy_service_user.sentinel.name 97 | } 98 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/elt/variables.tf: -------------------------------------------------------------------------------- 1 | variable "environment" { 2 | description = "Environment suffix" 3 | type = string 4 | } 5 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/elt/warehouses.tf: -------------------------------------------------------------------------------- 1 | ################################# 2 | # Warehouses # 3 | ################################# 4 | 5 | #X-Small: Good for small tasks and experimenting. 6 | #Small: Suitable for single-user workloads and development. 7 | #Medium: Handles moderate concurrency and data volumes. 8 | #Large: Manages larger queries and higher concurrency. 9 | #X-Large: Powerful for demanding workloads and data-intensive operations. 10 | #2X-Large: Double the capacity of X-Large. 11 | #3X-Large: Triple the capacity of X-Large. 12 | #4X-Large: Quadruple the capacity of X-Large. 
13 | 14 | locals { 15 | sizes = { 16 | "XS" = "X-SMALL", 17 | "S" = "SMALL", 18 | "M" = "MEDIUM", 19 | "L" = "LARGE", 20 | "XL" = "X-LARGE", 21 | "2XL" = "2X-LARGE", 22 | "3XL" = "3X-LARGE", 23 | "4XL" = "4X-LARGE", 24 | } 25 | } 26 | 27 | # Primary warehouse for loading data to Snowflake from ELT/ETL tools 28 | module "loading" { 29 | source = "../warehouse" 30 | for_each = local.sizes 31 | providers = { 32 | snowflake.securityadmin = snowflake.securityadmin, 33 | snowflake.sysadmin = snowflake.sysadmin, 34 | snowflake.useradmin = snowflake.useradmin, 35 | } 36 | 37 | name = "LOADING_${each.key}_${var.environment}" 38 | comment = "Primary warehouse for loading data to Snowflake from ELT/ETL tools" 39 | size = each.value 40 | } 41 | 42 | # Primary warehouse for transforming data. Analytics engineers and automated 43 | # transformation tools should use this warehouse. 44 | module "transforming" { 45 | source = "../warehouse" 46 | for_each = local.sizes 47 | providers = { 48 | snowflake.securityadmin = snowflake.securityadmin, 49 | snowflake.sysadmin = snowflake.sysadmin, 50 | snowflake.useradmin = snowflake.useradmin, 51 | } 52 | 53 | name = "TRANSFORMING_${each.key}_${var.environment}" 54 | comment = "Primary warehouse for transforming data. Analytics engineers and automated transformation tools should use this warehouse" 55 | size = each.value 56 | } 57 | 58 | # Primary warehouse for reporting. End-users and BI tools should use this warehouse. 59 | module "reporting" { 60 | source = "../warehouse" 61 | for_each = local.sizes 62 | providers = { 63 | snowflake.securityadmin = snowflake.securityadmin, 64 | snowflake.sysadmin = snowflake.sysadmin, 65 | snowflake.useradmin = snowflake.useradmin, 66 | } 67 | 68 | name = "REPORTING_${each.key}_${var.environment}" 69 | comment = "Primary warehouse for reporting. End-users and BI tools should use this warehouse" 70 | size = each.value 71 | } 72 | 73 | # Primary warehouse for logging. Logging tools like Sentinel should use this warehouse. 74 | module "logging" { 75 | source = "../warehouse" 76 | providers = { 77 | snowflake.securityadmin = snowflake.securityadmin, 78 | snowflake.sysadmin = snowflake.sysadmin, 79 | snowflake.useradmin = snowflake.useradmin, 80 | } 81 | 82 | name = "LOGGING_XS_${var.environment}" 83 | comment = "Primary warehouse for logging. Logging tools like Sentinel should use this warehouse." 84 | size = "X-SMALL" 85 | auto_suspend = 1 86 | } 87 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/warehouse/main.tf: -------------------------------------------------------------------------------- 1 | ###################################### 2 | # Terraform # 3 | ###################################### 4 | 5 | terraform { 6 | required_providers { 7 | snowflake = { 8 | source = "Snowflake-Labs/snowflake" 9 | version = "~> 1.0" 10 | configuration_aliases = [ 11 | snowflake.securityadmin, 12 | snowflake.sysadmin, 13 | snowflake.useradmin, 14 | ] 15 | } 16 | } 17 | required_version = ">= 1.0" 18 | } 19 | 20 | ###################################### 21 | # Permissions # 22 | ###################################### 23 | 24 | locals { 25 | # Permissions to use a data warehouse. No role is given the MODIFY permission, 26 | # instead warehouses should be treated as stateless, and if we need a larger 27 | # one it should be created individually. 
28 | warehouse = { 29 | MOU = ["MONITOR", "OPERATE", "USAGE"] 30 | } 31 | } 32 | 33 | ################################# 34 | # Warehouses # 35 | ################################# 36 | 37 | 38 | resource "snowflake_warehouse" "this" { 39 | name = var.name 40 | provider = snowflake.sysadmin 41 | auto_suspend = var.auto_suspend 42 | auto_resume = true 43 | initially_suspended = true 44 | comment = var.comment 45 | warehouse_size = var.size 46 | } 47 | 48 | ################################# 49 | # Warehouse Access Roles # 50 | ################################# 51 | 52 | # Monitoring, usage, and operating permissions for the warehouse. 53 | resource "snowflake_account_role" "this" { 54 | name = "${var.name}_WH_MOU" 55 | provider = snowflake.useradmin 56 | comment = "Monitoring, usage, and operating permissions for the ${var.name} warehouse" 57 | } 58 | 59 | ################################# 60 | # Role Grants # 61 | ################################# 62 | 63 | resource "snowflake_grant_account_role" "this_to_sysadmin" { 64 | provider = snowflake.useradmin 65 | role_name = snowflake_account_role.this.name 66 | parent_role_name = "SYSADMIN" 67 | } 68 | 69 | ################################# 70 | # Warehouse Grants # 71 | ################################# 72 | 73 | resource "snowflake_grant_privileges_to_account_role" "this" { 74 | provider = snowflake.securityadmin 75 | privileges = local.warehouse.MOU 76 | account_role_name = snowflake_account_role.this.name 77 | on_account_object { 78 | object_type = "WAREHOUSE" 79 | object_name = snowflake_warehouse.this.name 80 | } 81 | with_grant_option = false 82 | } 83 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/warehouse/outputs.tf: -------------------------------------------------------------------------------- 1 | output "access_role_name" { 2 | description = "Warehouse access_role" 3 | value = snowflake_account_role.this.name 4 | } 5 | 6 | output "name" { 7 | description = "Warehouse name" 8 | value = snowflake_warehouse.this.name 9 | } 10 | -------------------------------------------------------------------------------- /terraform/snowflake/modules/warehouse/variables.tf: -------------------------------------------------------------------------------- 1 | variable "name" { 2 | description = "Warehouse name" 3 | type = string 4 | } 5 | 6 | variable "comment" { 7 | description = "Comment to apply to warehouse" 8 | type = string 9 | default = null 10 | } 11 | 12 | variable "auto_suspend" { 13 | description = "Auto-suspend time for warehouse" 14 | type = number 15 | default = 300 16 | } 17 | 18 | variable "size" { 19 | description = "Size of warehouse" 20 | type = string 21 | default = "x-small" 22 | } 23 | -------------------------------------------------------------------------------- /transform/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | dbt_packages/ 3 | logs/ 4 | .vscode/ 5 | -------------------------------------------------------------------------------- /transform/.sqlfluff: -------------------------------------------------------------------------------- 1 | [sqlfluff] 2 | dialect = snowflake 3 | templater = dbt 4 | max_line_length = 120 5 | 6 | # I decide the column ordering! 7 | exclude_rules = structure.column_order 8 | 9 | # Probably a controversial exclusion, so adding some justification here: 10 | # This disables the rule that prevents unreserved keywords from being 11 | # used as column aliases.
The rule is intended to prevent accidental shadowing 12 | # of SQL keywords, which, honestly, sounds like a good idea! **However**, 13 | # this can result in some awkward contortions in final dataset column names, 14 | # which are intended for end users who shouldn't care about our query language 15 | # limitations. Since SQL tends to have quite a long list of keywords, the 16 | # restriction actually prevents some quite natural column names, reducing 17 | # legibility for end users (e.g., date, timestamp, name). 18 | [sqlfluff:rules:references.keywords] 19 | quoted_identifiers_policy = none 20 | unquoted_identifiers_policy = none 21 | -------------------------------------------------------------------------------- /transform/.sqlfluffignore: -------------------------------------------------------------------------------- 1 | target/ 2 | dbt_packages/ 3 | -------------------------------------------------------------------------------- /transform/README.md: -------------------------------------------------------------------------------- 1 | # CalData dbt project 2 | 3 | This is the primary dbt project for the CalData Data Services and Engineering (DSE) team. 4 | It targets Snowflake as its data warehouse. 5 | Linting and testing are driven through GitHub Actions. 6 | 7 | ## Building the docs 8 | 9 | To build and view the docs locally, run 10 | 11 | ```bash 12 | (dbt docs generate && cd target/ && python -m http.server) 13 | ``` 14 | 15 | in a terminal, then navigate to `http://localhost:8000` in your web browser. 16 | 17 | ## Resources: 18 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 19 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 20 | - Join the [dbt community](http://community.getdbt.com/) to learn from other analytics engineers 21 | - Find [dbt events](https://events.getdbt.com) near you 22 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 23 | -------------------------------------------------------------------------------- /transform/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/transform/analyses/.gitkeep -------------------------------------------------------------------------------- /transform/dbt_project.yml: -------------------------------------------------------------------------------- 1 | # Project name 2 | name: "dse_analytics" 3 | version: "1.0.0" 4 | config-version: 2 5 | 6 | flags: 7 | send_anonymous_usage_stats: false 8 | use_colors: true 9 | warn_error: false 10 | state_modified_compare_more_unrendered_values: true 11 | skip_nodes_if_on_run_start_fails: true 12 | require_explicit_package_overrides_for_builtin_materializations: true 13 | source_freshness_run_project_hooks: false 14 | 15 | # This setting configures which "profile" dbt uses for this project. 16 | profile: "dse_snowflake" 17 | 18 | # These configurations specify where dbt should look for different types of files. 19 | # The `model-paths` config, for example, states that models in this project can be 20 | # found in the "models/" directory. You probably won't need to change these!
21 | model-paths: ["models"] 22 | analysis-paths: ["analyses"] 23 | test-paths: ["tests"] 24 | seed-paths: ["seeds"] 25 | macro-paths: ["macros"] 26 | snapshot-paths: ["snapshots"] 27 | 28 | target-path: "target" # directory which will store compiled SQL files 29 | clean-targets: # directories to be removed by `dbt clean` 30 | - "target" 31 | - "dbt_packages" 32 | 33 | models: 34 | dse_analytics: 35 | staging: 36 | +database: "{{ env_var('DBT_TRANSFORM_DB', 'TRANSFORM_DEV') }}" 37 | department_of_finance: 38 | +schema: department_of_finance 39 | snowflake_cost_tracking: 40 | +schema: snowflake_cost_tracking 41 | 42 | # These staging models are a little unusual for two reasons: 43 | # 44 | # 1. They are incremental 45 | # 2. They do some very light aggregation 46 | # 47 | # We do this because the source views in the SNOWFLAKE meta-database 48 | # have a retention time of one year, and don't have very strong 49 | # uniqueness constraints for their data grain. By making the models 50 | # incremental we ensure that we retain data that is older than the retention 51 | # time. By aggregating to the usage date (and table/warehouse, if applicable), 52 | # we ensure that we can correctly merge in the incremental updates 53 | # without resulting in duplicated rows. 54 | +materialized: incremental 55 | 56 | # Never do a full refresh so that we avoid overwriting any old data. 57 | # Otherwise we risk losing data beyond the 1 year retention window 58 | +full_refresh: false 59 | 60 | intermediate: 61 | +database: "{{ env_var('DBT_TRANSFORM_DB', 'TRANSFORM_DEV') }}" 62 | state_entities: 63 | +schema: state_entities 64 | snowflake_cost_tracking: 65 | +schema: snowflake_cost_tracking 66 | +materialized: view 67 | 68 | marts: 69 | # All marts models as tables to avoid needing write access to TRANSFORM 70 | # https://community.snowflake.com/s/article/SQL-compilation-error-Failure-during-expansion-of-view-mySecureView 71 | +materialized: table 72 | +database: "{{ env_var('DBT_ANALYTICS_DB', 'ANALYTICS_DEV') }}" 73 | state_entities: 74 | +schema: state_entities 75 | snowflake_cost_tracking: 76 | +schema: snowflake_cost_tracking 77 | +materialized: table 78 | -------------------------------------------------------------------------------- /transform/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/transform/macros/.gitkeep -------------------------------------------------------------------------------- /transform/macros/_macros.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | macros: 4 | - name: spatial_join_with_deduplication 5 | description: | 6 | Macro to perform a spatial join between two relations with deduplication of the 7 | geometries in the left table. For all left geometries that satisfy the predicate 8 | for more than one geometry in the right table, we compute their intersection and 9 | then choose the left geometry with the greatest intersection. 10 | arguments: 11 | - name: left_model 12 | type: string 13 | description: The left model to join. Can be a relation or CTE. 14 | - name: right_model 15 | type: string 16 | description: The right model to join. Can be a relation or CTE. 
17 | - name: left_cols 18 | type: list of strings 19 | description: | 20 | List columns to keep from the left table 21 | (excluding the geometry column, which is always retained) 22 | - name: right_cols 23 | type: list of strings 24 | description: | 25 | List of columns to keep from the right table 26 | (excluding the geometry column, which is never retained). 27 | Cannot share any names with left_cols 28 | - name: left_geom 29 | type: string 30 | description: The name of the left geometry column, defaults to "geometry" 31 | - name: right_geom 32 | type: string 33 | description: The name of the right geometry column, defaults to "geometry" 34 | - name: op 35 | description: | 36 | The spatial predicate function to choose, 37 | defaults to "st_intersects" 38 | - name: kind 39 | type: string 40 | description: The kind of join, either "left" or "inner". Defaults to "left" 41 | - name: prefix 42 | type: string 43 | description: | 44 | An optional prefix to give to temporary CTEs to improve legibility and 45 | avoid name collisions. 46 | -------------------------------------------------------------------------------- /transform/macros/get_custom_schema.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_schema_name(custom_schema_name, node) -%} 2 | 3 | {# 4 | Definitions: 5 | - custom_schema_name: schema provided via dbt_project.yml or model config 6 | - target.name: name of the target (dev for local development, prod for production, etc.) 7 | - target.schema: schema provided by the target defined in profiles.yml 8 | 9 | Rather than write to a schema prefixed with target.schema, we instead just write 10 | to the actual schema name, and get safety by separating dev and prod databases. 11 | If we start to experience analytics engineers stepping on each others toes in 12 | dev, we may want to restore prefixes there (while maintaining a prefix-free 13 | lifestyle in prod). 14 | #} 15 | {%- if custom_schema_name is none -%} {{ target.schema.lower() | trim }} 16 | 17 | {%- elif target.name == 'prd' -%} {{ custom_schema_name.lower() | trim }} 18 | 19 | {%- else -%} {{ target.schema.lower() | trim }}_{{ custom_schema_name | trim }} 20 | 21 | {%- endif -%} 22 | 23 | {%- endmacro %} 24 | -------------------------------------------------------------------------------- /transform/macros/map_class_fp.sql: -------------------------------------------------------------------------------- 1 | {% macro map_class_fips(class_fips, k, v) -%} 2 | 3 | {# 4 | Class Codes source: https://www.census.gov/library/reference/code-lists/class-codes.html 5 | #} 6 | 7 | {% set class_fips_dict = { 8 | "M2" : "A military or other defense installation entirely within a place", 9 | "C1" : "An active incorporated place that does not serve as a county subdivision equivalent", 10 | "U1" : "A census designated place with an official federally recognized name", 11 | "U2" : "A census designated place without an official federally recognized name" 12 | } -%} 13 | 14 | case 15 | {% for k, v in class_fips_dict.items() -%} 16 | when "{{ class_fips }}" = '{{ k }}' 17 | then '{{ v }}' 18 | {% endfor -%} 19 | end 20 | 21 | {%- endmacro %} 22 | -------------------------------------------------------------------------------- /transform/macros/spatial_join_with_deduplication.sql: -------------------------------------------------------------------------------- 1 | {# Macro to perform a spatial join between two relations with deduplication of the 2 | geometries in the left table. 
For all left geometries that satisfy the predicate for 3 | more than one geometry in the right table, we compute their intersection and then 4 | choose the left geometry with the greatest intersection. 5 | #} 6 | 7 | {% macro spatial_join_with_deduplication(left_model, right_model, left_cols, right_cols, left_geom="geometry", right_geom="geometry", op="st_intersects", kind="left", prefix="") %} 8 | 9 | with {{ prefix }}_left_model_with_id as ( 10 | select 11 | /* Generate a temporary ID for footprints. We will need this to group/partition 12 | by unique footprints further down. We could use a UUID, but integers are 13 | cheaper to generate and compare. */ 14 | *, seq4() as _tmp_sjoin_id 15 | from {{ left_model }} 16 | ), 17 | 18 | {{ prefix }}_joined as ( 19 | select 20 | {% for lcol in left_cols -%} 21 | {{ prefix }}_left_model_with_id.{{ lcol }}, 22 | {% endfor -%} 23 | {% for rcol in right_cols -%} 24 | {{ right_model }}.{{ rcol }}, 25 | {% endfor -%} 26 | {{ prefix }}_left_model_with_id.{{ left_geom }}, 27 | /* We don't actually need the intersection for every geometry, only for the 28 | ones that intersect more than one. However, in order to establish which 29 | ones intersect more than one, we need a windowed COUNT partitioned by 30 | _tmp_sjoin_id. This is an expensive operation, as it likely triggers a shuffle 31 | (even though it should already be sorted by _tmp_id). In testing we've found 32 | that it's cheaper to just do the intersection for all the geometries. */ 33 | st_area( 34 | st_intersection({{ prefix }}_left_model_with_id.{{ left_geom }}, {{ right_model }}.{{ right_geom }}) 35 | ) as _tmp_sjoin_intersection, 36 | {{ prefix }}_left_model_with_id._tmp_sjoin_id 37 | from {{ prefix }}_left_model_with_id 38 | {{ kind }} join {{ right_model }} 39 | on {{ op }}({{ prefix }}_left_model_with_id.{{ left_geom }}, {{ right_model }}.{{ right_geom }}) 40 | ), 41 | 42 | {{ prefix }}_deduplicated as ( 43 | select 44 | -- Snowflake doesn't support geometries in max_by. It should, but it doesn't. 45 | -- Fortunately, we know that the geometries are identical when partitioned 46 | -- by _tmp_sjoin_id, so we can just choose any_value. 47 | any_value({{ left_geom }}) as {{ left_geom }}, 48 | {% for lcol in left_cols -%} 49 | -- max_by returns null if all the values in a group are null. So if we have a left 50 | -- join, we need to guard against nulls with a coalesce to return the single value 51 | max_by({{ lcol }}, coalesce(_tmp_sjoin_intersection, 1.0)) as {{ lcol }}, 52 | {% endfor -%} 53 | {% for rcol in right_cols -%} 54 | -- max_by returns null if all the values in a group are null. So if we have a left 55 | -- join, we need to guard against nulls with a coalesce to return the single value 56 | max_by({{ rcol }}, coalesce(_tmp_sjoin_intersection, 1.0)) as {{ rcol }}{{ "," if not loop.last }} 57 | {% endfor -%} 58 | from {{ prefix }}_joined 59 | group by _tmp_sjoin_id 60 | ) 61 | 62 | select * from {{ prefix }}_deduplicated 63 | {%- endmacro -%} 64 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/_snowflake_cost_tracking.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: int_automatic_clustering_history 5 | description: | 6 | Credits used by automatic clustering, aggregated to account and usage date. 
7 | columns: 8 | - name: organization_name 9 | description: Organization name 10 | - name: account_name 11 | description: Account name 12 | - name: usage_date 13 | description: The date on which the usage occurred. 14 | - name: credits_used 15 | description: The total credits used for automatic clustering 16 | 17 | - name: int_materialized_view_refresh_history 18 | description: | 19 | Credits used by materialized view refreshes, aggregated to account and usage date. 20 | columns: 21 | - name: organization_name 22 | description: Organization name 23 | - name: account_name 24 | description: Account name 25 | - name: usage_date 26 | description: The date on which the usage occurred. 27 | - name: credits_used 28 | description: The total credits used for materialized view refreshes 29 | 30 | - name: int_pipe_usage_history 31 | description: | 32 | Credits used by pipes, aggregated to account and usage date. 33 | columns: 34 | - name: organization_name 35 | description: Organization name 36 | - name: account_name 37 | description: Account name 38 | - name: usage_date 39 | description: The date on which the usage occurred. 40 | - name: credits_used 41 | description: The total credits used by pipes 42 | 43 | - name: int_storage_daily_history 44 | description: | 45 | Credits used by storage, aggregated to account and usage date. 46 | columns: 47 | - name: organization_name 48 | description: Organization name 49 | - name: account_name 50 | description: Account name 51 | - name: usage_date 52 | description: The date on which the usage occurred. 53 | - name: credits_used 54 | description: The total credits used by storage 55 | 56 | - name: int_warehouse_metering_history 57 | description: | 58 | Credits used by compute warehouses, aggregated to account and usage date. 59 | columns: 60 | - name: organization_name 61 | description: Organization name 62 | - name: account_name 63 | description: Account name 64 | - name: usage_date 65 | description: The date on which the usage occurred. 66 | - name: credits_used 67 | description: The total credits used by warehouses 68 | 69 | - name: int_cortex_usage_daily_history 70 | description: | 71 | Credits used by Cortex AI services, aggregated to account and usage date. 72 | columns: 73 | - name: organization_name 74 | description: Organization name 75 | - name: account_name 76 | description: Account name 77 | - name: usage_date 78 | description: The date on which the usage occurred. 
79 | - name: credits_used 80 | description: The total credits used by Cortex 81 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_automatic_clustering_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_automatic_clustering_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by all 13 | ) 14 | 15 | select * from usage_history 16 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_cortex_usage_daily_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_cortex_usage_daily_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by organization_name, account_name, usage_date 13 | ) 14 | 15 | select * from usage_history 16 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_materialized_view_refresh_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_materialized_view_refresh_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by all 13 | ) 14 | 15 | select * from usage_history 16 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_pipe_usage_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_pipe_usage_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by all 13 | ) 14 | 15 | select * from usage_history 16 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_storage_daily_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_storage_daily_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by all 13 | ) 14 | 15 | select * from usage_history 16 | -------------------------------------------------------------------------------- /transform/models/intermediate/snowflake_cost_tracking/int_warehouse_metering_history.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ ref('stg_warehouse_metering_history') }} 3 | ), 4 | 5 | usage_history as ( 6 | select 7 | organization_name, 8 | account_name, 9 | usage_date, 10 | sum(credits_used) as credits_used 11 | from source 12 | group by all 13 | ) 14 | 15 | select * from usage_history 16 | 
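
The intermediate cost-tracking models above all reduce their staging inputs to one row per organization, account, and usage date. As the comments in `dbt_project.yml` explain, the corresponding staging models are materialized incrementally (with full refreshes disabled) so that usage history is preserved beyond the one-year retention window of the `SNOWFLAKE` meta-database views. As a rough sketch of that pattern (not a copy of this repository's staging SQL; the `source()` reference, column names, and lookback window below are assumptions), an incremental staging model might look like:

```sql
-- Hypothetical illustration of the incremental staging pattern described in
-- dbt_project.yml; the source() reference and column names are assumptions.
{{
    config(
        materialized="incremental",
        unique_key=["organization_name", "account_name", "usage_date"],
    )
}}

with source as (
    select * from {{ source("snowflake_usage", "warehouse_metering_history") }}
),

aggregated as (
    -- Aggregate to one row per account and usage date so that incremental
    -- merges have a stable grain and cannot introduce duplicate rows.
    select
        organization_name,
        account_name,
        cast(start_time as date) as usage_date,
        sum(credits_used) as credits_used
    from source
    {% if is_incremental() %}
        -- On incremental runs, only rebuild the most recent days; older rows
        -- are already preserved in the target table.
        where cast(start_time as date) >= dateadd(day, -7, current_date)
    {% endif %}
    group by organization_name, account_name, cast(start_time as date)
)

select * from aggregated
```

Because the grain is one row per account and usage date, each incremental run merges cleanly on the unique key instead of appending duplicates, which is exactly the property the project comments call out.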
-------------------------------------------------------------------------------- /transform/models/intermediate/state_entities/_int_state_entities__models.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: int_state_entities__active 5 | description: | 6 | Active state entities from the Department of Finance list. 7 | Entities which are flagged as "DO NOT USE", "abolished", or 8 | are technical entities (e.g. "DOF USE ONLY") are filtered out. 9 | columns: 10 | - name: name 11 | description: Name of the entity 12 | - name: primary_code 13 | description: The most specific non-null entity code 14 | data_tests: 15 | - not_null 16 | - unique 17 | - name: agency_code 18 | description: Agency code for entity 19 | - name: subagency_code 20 | description: Subagency code for entity 21 | - name: L1 22 | description: Level beneath subagency 23 | - name: L2 24 | description: Level beneath L1 25 | - name: L3 26 | description: Level beneath L2 27 | - name: parenthetical 28 | description: | 29 | Any text extracted from a parenthetical statement in the original text 30 | - name: do_not_use 31 | description: Whether the entity features "DO NOT USE" in the description 32 | - name: abolished 33 | description: Whether the entity features "abolished" in the description 34 | - name: restricted_use 35 | description: | 36 | Whether the entity contains a directive like "DOF USE ONLY" or "SCO USE ONLY" 37 | in the description. 38 | - name: name_raw 39 | description: | 40 | The original name, as well as any parentheticals or directives for the entity. 41 | - name: name_alpha 42 | description: | 43 | The name with things like "Office of" moved to the end, 44 | suitable for alphabetization. 45 | - name: ucm_level 46 | description: | 47 | The level in the hierarchy of the Uniform Control Manual 48 | (agency, subagency, L1, L2, or L3) 49 | - name: int_state_entities__technical 50 | description: | 51 | Active technical entities from the Department of Finance list. 52 | columns: 53 | - name: name 54 | description: Name of the entity 55 | - name: primary_code 56 | description: The most specific non-null entity code 57 | data_tests: 58 | - not_null 59 | - name: agency_code 60 | description: Agency code for entity 61 | - name: subagency_code 62 | description: Subagency code for entity 63 | - name: L1 64 | description: Level beneath subagency 65 | - name: L2 66 | description: Level beneath L1 67 | - name: L3 68 | description: Level beneath L2 69 | - name: parenthetical 70 | description: | 71 | Any text extracted from a parenthetical statement in the original text 72 | - name: do_not_use 73 | description: Whether the entity features "DO NOT USE" in the description 74 | - name: abolished 75 | description: Whether the entity features "abolished" in the description 76 | - name: restricted_use 77 | description: | 78 | Whether the entity contains a directive like "DOF USE ONLY" or "SCO USE ONLY" 79 | in the description. 80 | - name: name_raw 81 | description: | 82 | The original name, as well as any parentheticals or directives for the entity. 83 | - name: name_alpha 84 | description: | 85 | The name with things like "Office of" moved to the end, 86 | suitable for alphabetization.
87 | - name: ucm_level 88 | description: | 89 | The level in the hierarchy of the Uniform Control Manual 90 | (agency, subagency, L1, L2, or L3) 91 | 92 | - name: int_state_entities__budgets 93 | description: Fiscal year budgets for state entities 94 | columns: 95 | - name: primary_code 96 | description: Four digit business unit code for entity. 97 | data_tests: 98 | # There are duplicates! 99 | # - unique 100 | - not_null 101 | - name: name 102 | description: Entity name 103 | - name: ucm_level 104 | description: | 105 | The level in the hierarchy of the Uniform Control Manual 106 | (agency, subagency, L1, L2, or L3) 107 | - name: name_alpha 108 | description: Variant of name for easier alphabetization 109 | - name: budget_year_dollars 110 | description: Budget for current fiscal year. 111 | -------------------------------------------------------------------------------- /transform/models/intermediate/state_entities/int_state_entities__active.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="view") }} 2 | 3 | with 4 | active_entities as ( 5 | select * 6 | from {{ ref("stg_department_of_finance__entities") }} 7 | where 8 | do_not_use = false 9 | and abolished = false 10 | and restricted_use is null 11 | and cast(primary_code as int) < 9000 12 | and not regexp_like(lower(name_raw), 'moved to|renum\.? to') 13 | ) 14 | 15 | select * 16 | from active_entities 17 | -------------------------------------------------------------------------------- /transform/models/intermediate/state_entities/int_state_entities__budgets.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="view") }} 2 | 3 | with 4 | active_entities as (select * from {{ ref("int_state_entities__active") }}), 5 | 6 | budgets as (select * from {{ ref("stg_ebudget__budgets") }}), 7 | 8 | active_agencies_and_departments as ( 9 | -- only select at department level or higher 10 | select * from active_entities where coalesce(l2, l3) is null 11 | ), 12 | 13 | active_entity_budgets as ( 14 | select 15 | active_agencies_and_departments.primary_code, 16 | active_agencies_and_departments.ucm_level, 17 | active_agencies_and_departments.name, 18 | active_agencies_and_departments.name_alpha, 19 | budgets.name as budget_name, 20 | budgets.budget_year_dollars 21 | from active_agencies_and_departments 22 | left join 23 | budgets 24 | on active_agencies_and_departments.primary_code = budgets.primary_code 25 | ) 26 | 27 | select * 28 | from active_entity_budgets 29 | order by primary_code asc 30 | -------------------------------------------------------------------------------- /transform/models/intermediate/state_entities/int_state_entities__technical.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="view") }} 2 | 3 | with 4 | technical_entities as ( 5 | select * 6 | from {{ ref("stg_department_of_finance__entities") }} 7 | where 8 | (do_not_use = false and abolished = false) 9 | and (restricted_use is not null or cast(primary_code as int) >= 9000) 10 | ) 11 | 12 | select * 13 | from technical_entities 14 | -------------------------------------------------------------------------------- /transform/models/marts/geo_reference/_geo_reference__models.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: building_footprints 5 | database: "{{ env_var('DBT_RAW_DB', 'RAW_DEV') }}" 6 | config: 7 |
schema: building_footprints 8 | tables: 9 | - name: us_building_footprints 10 | description: "[Microsoft US Building Footprints]\ 11 | (https://github.com/Microsoft/USBuildingFootprints) \ 12 | dataset for California." 13 | - name: global_ml_building_footprints 14 | description: "[Microsoft Global ML Building Footprints]\ 15 | (https://github.com/microsoft/GlobalMLBuildingFootprints) \ 16 | dataset for California. This contains some null geometries,\ 17 | as well as geometries that fall somewhat outside of California" 18 | 19 | - name: tiger_2022 20 | database: "{{ env_var('DBT_RAW_DB', 'RAW_DEV') }}" 21 | schema: tiger_2022 22 | tables: 23 | - name: blocks 24 | - name: places 25 | 26 | models: 27 | - name: geo_reference__us_building_footprints_with_tiger 28 | config: 29 | schema: building_footprints 30 | tags: building_footprints 31 | description: | 32 | This data table is a join of the TIGER data for blocks, tracts, counties, and 33 | places with the Microsoft US Building Footprints data for the state of CA. 34 | columns: 35 | - name: release 36 | description: The version of the data 37 | - name: capture_dates_range 38 | description: Each building footprint has a capture date tag from 2019-2020 39 | - name: county_fips 40 | description: 2020 Census county FIPS code 41 | - name: tract 42 | description: 2020 Census tract code 43 | - name: block 44 | description: 2020 Census tabulation block number 45 | - name: block_geoid 46 | description: > 47 | Census block identifier; a concatenation of 2020 Census state FIPS code, 2020 48 | Census county FIPS code, 2020 Census tract code, and 2020 Census block number 49 | - name: place_fips 50 | description: Current place FIPS code 51 | - name: place_ns 52 | description: Current place GNIS code 53 | - name: place_geoid 54 | description: > 55 | Place identifier; a concatenation of the current state 56 | FIPS code and place FIPS code 57 | - name: place_name 58 | description: > 59 | Current name and the translated legal/statistical 60 | area description for place 61 | - name: class_fips_code 62 | description: Current FIPS class code 63 | - name: class_fips 64 | description: Current FIPS class definition 65 | - name: geometry 66 | description: The footprint geometry 67 | - name: area_sqm 68 | description: The area of the footprint in square meters 69 | - name: geo_reference__global_ml_building_footprints_with_tiger 70 | config: 71 | schema: building_footprints 72 | tags: building_footprints 73 | description: | 74 | This data table is a join of the TIGER data for blocks, tracts, counties, and 75 | places with the Microsoft Global ML Building Footprints data for the state of CA. 
76 | columns: 77 | - name: height 78 | description: The height of the building (negative indicates unknown height) 79 | - name: county_fips 80 | description: 2020 Census county FIPS code 81 | - name: tract 82 | description: 2020 Census tract code 83 | - name: block 84 | description: 2020 Census tabulation block number 85 | - name: block_geoid 86 | description: > 87 | Census block identifier; a concatenation of 2020 Census state FIPS code, 2020 88 | Census county FIPS code, 2020 Census tract code, and 2020 Census block number 89 | - name: place_fips 90 | description: Current place FIPS code 91 | - name: place_ns 92 | description: Current place GNIS code 93 | - name: place_geoid 94 | description: > 95 | Place identifier; a concatenation of the current state 96 | FIPS code and place FIPS code 97 | - name: place_name 98 | description: > 99 | Current name and the translated legal/statistical 100 | area description for place 101 | - name: class_fips_code 102 | description: Current FIPS class code 103 | - name: class_fips 104 | description: Current FIPS class definition 105 | - name: geometry 106 | description: The footprint geometry 107 | - name: area_sqm 108 | description: The area of the footprint in square meters 109 | -------------------------------------------------------------------------------- /transform/models/marts/geo_reference/geo_reference__global_ml_building_footprints_with_tiger.sql: -------------------------------------------------------------------------------- 1 | with footprints as ( -- noqa: ST03 2 | select 3 | "height", 4 | "geometry" 5 | from {{ source('building_footprints', 'global_ml_building_footprints') }} 6 | ), 7 | 8 | blocks_source as ( 9 | select * 10 | from {{ source('tiger_2022', 'blocks') }} 11 | ), 12 | 13 | places_source as ( 14 | select * from {{ source('tiger_2022', 'places') }} 15 | ), 16 | 17 | blocks as ( -- noqa: ST03 18 | select 19 | "COUNTYFP20" as "county_fips", 20 | "TRACTCE20" as "tract", 21 | "BLOCKCE20" as "block", 22 | "GEOID20" as "block_geoid", 23 | "geometry" 24 | from blocks_source 25 | ), 26 | 27 | places as ( -- noqa: ST03 28 | select 29 | "PLACEFP" as "place_fips", 30 | "PLACENS" as "place_ns", 31 | "GEOID" as "place_geoid", 32 | "NAME" as "place_name", 33 | "CLASSFP" as "class_fips_code", 34 | {{ map_class_fips("CLASSFP") }} as "class_fips", 35 | "geometry" 36 | from places_source 37 | ), 38 | 39 | footprints_with_blocks as ( -- noqa: ST03 40 | {{ spatial_join_with_deduplication( 41 | "footprints", 42 | "blocks", 43 | ['"height"'], 44 | ['"county_fips"', '"tract"', '"block"', '"block_geoid"'], 45 | left_geom='"geometry"', 46 | right_geom='"geometry"', 47 | kind="inner", 48 | prefix="b", 49 | ) }} 50 | ), 51 | 52 | footprints_with_blocks_and_places as ( 53 | {{ spatial_join_with_deduplication( 54 | "footprints_with_blocks", 55 | "places", 56 | ['"height"', '"county_fips"', '"tract"', '"block"', '"block_geoid"'], 57 | ['"place_fips"', '"place_ns"', '"place_geoid"', '"place_name"', '"class_fips_code"', '"class_fips"'], 58 | left_geom='"geometry"', 59 | right_geom='"geometry"', 60 | kind="left", 61 | prefix="p", 62 | ) }} 63 | ), 64 | 65 | footprints_with_blocks_and_places_final as ( 66 | select 67 | *, 68 | st_area("geometry") as "area_sqm" 69 | from footprints_with_blocks_and_places 70 | ) 71 | 72 | select * from footprints_with_blocks_and_places_final 73 | -------------------------------------------------------------------------------- /transform/models/marts/geo_reference/geo_reference__us_building_footprints_with_tiger.sql: 
-------------------------------------------------------------------------------- 1 | with footprints as ( -- noqa: ST03 2 | select 3 | "release", 4 | "capture_dates_range", 5 | "geometry" 6 | from {{ source('building_footprints', 'us_building_footprints') }} 7 | ), 8 | 9 | blocks_source as ( 10 | select * 11 | from {{ source('tiger_2022', 'blocks') }} 12 | ), 13 | 14 | places_source as ( 15 | select * from {{ source('tiger_2022', 'places') }} 16 | ), 17 | 18 | blocks as ( -- noqa: ST03 19 | select 20 | "COUNTYFP20" as "county_fips", 21 | "TRACTCE20" as "tract", 22 | "BLOCKCE20" as "block", 23 | "GEOID20" as "block_geoid", 24 | "geometry" 25 | from blocks_source 26 | ), 27 | 28 | places as ( -- noqa: ST03 29 | select 30 | "PLACEFP" as "place_fips", 31 | "PLACENS" as "place_ns", 32 | "GEOID" as "place_geoid", 33 | "NAME" as "place_name", 34 | "CLASSFP" as "class_fips_code", 35 | {{ map_class_fips("CLASSFP") }} as "class_fips", 36 | "geometry" 37 | from places_source 38 | ), 39 | 40 | footprints_with_blocks as ( -- noqa: ST03 41 | {{ spatial_join_with_deduplication( 42 | "footprints", 43 | "blocks", 44 | ['"release"', '"capture_dates_range"'], 45 | ['"county_fips"', '"tract"', '"block"', '"block_geoid"'], 46 | left_geom='"geometry"', 47 | right_geom='"geometry"', 48 | kind="inner", 49 | prefix="b", 50 | ) }} 51 | ), 52 | 53 | footprints_with_blocks_and_places as ( 54 | {{ spatial_join_with_deduplication( 55 | "footprints_with_blocks", 56 | "places", 57 | ['"release"', '"capture_dates_range"', '"county_fips"', '"tract"', '"block"', '"block_geoid"'], 58 | ['"place_fips"', '"place_ns"', '"place_geoid"', '"place_name"', '"class_fips_code"', '"class_fips"'], 59 | left_geom='"geometry"', 60 | right_geom='"geometry"', 61 | kind="left", 62 | prefix="p", 63 | ) }} 64 | ), 65 | 66 | footprints_with_blocks_and_places_final as ( 67 | select 68 | *, 69 | st_area("geometry") as "area_sqm" 70 | from footprints_with_blocks_and_places 71 | ) 72 | 73 | select * from footprints_with_blocks_and_places_final 74 | -------------------------------------------------------------------------------- /transform/models/marts/snowflake_cost_tracking/_snowflake_cost_tracking.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: snowflake_costs_by_date 5 | description: | 6 | Snowflake costs by date for the following: 7 | 8 | * Automatic clustering 9 | * Materialized view refreshes 10 | * Pipe usage 11 | * Storage 12 | * Warehouse usage 13 | * Cortex (GenAI) usage 14 | 15 | Data are in long form, where `usage_type` indicates which 16 | type of usage is measured in credits. 17 | columns: 18 | - name: account_name 19 | description: Account name 20 | - name: usage_date 21 | description: Date on which the usage occurred 22 | - name: usage_type 23 | description: | 24 | One of the following usage types: 25 | 26 | * `'automatic clustering'` 27 | * `'materialized view'` 28 | * `'pipe'` 29 | * `'storage'` 30 | * `'warehouse'` 31 | * `'cortex'` 32 | - name: credits_used 33 | description: The credits used for the usage type and date. 
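
Because `snowflake_costs_by_date` is long-form, downstream consumers can aggregate or pivot it without any schema changes. As an illustrative ad-hoc query (not part of the project; the fully qualified relation name below is an assumption that depends on the deployment target), monthly credits by usage type could be computed as:

```sql
-- Illustrative ad-hoc rollup of the long-form cost mart; the database and
-- schema qualifiers below are assumed and vary by environment.
select
    account_name,
    usage_type,
    date_trunc('month', usage_date) as usage_month,
    sum(credits_used) as credits_used
from analytics_prd.snowflake_cost_tracking.snowflake_costs_by_date
group by account_name, usage_type, usage_month
order by usage_month, account_name, usage_type
```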
34 | -------------------------------------------------------------------------------- /transform/models/marts/snowflake_cost_tracking/snowflake_costs_by_date.sql: -------------------------------------------------------------------------------- 1 | /* 2 | TODO: this does not yet account for credits consumed by: 3 | 4 | * Query acceleration 5 | * Search optimization 6 | * Replication/failover groups 7 | */ 8 | 9 | with automatic_clustering_history as ( 10 | select 11 | account_name, 12 | usage_date, 13 | 'automatic clustering' as usage_type, 14 | credits_used 15 | from {{ ref('int_automatic_clustering_history') }} 16 | ), 17 | 18 | materialized_view_refresh_history as ( 19 | select 20 | account_name, 21 | usage_date, 22 | 'materialized view' as usage_type, 23 | credits_used 24 | from {{ ref('int_materialized_view_refresh_history') }} 25 | ), 26 | 27 | pipe_usage_history as ( 28 | select 29 | account_name, 30 | usage_date, 31 | 'pipe' as usage_type, 32 | credits_used 33 | from {{ ref('int_pipe_usage_history') }} 34 | ), 35 | 36 | storage_daily_history as ( 37 | select 38 | account_name, 39 | usage_date, 40 | 'storage' as usage_type, 41 | credits_used 42 | from {{ ref('int_storage_daily_history') }} 43 | ), 44 | 45 | warehouse_metering_history as ( 46 | select 47 | account_name, 48 | usage_date, 49 | 'warehouse' as usage_type, 50 | credits_used 51 | from {{ ref('int_warehouse_metering_history') }} 52 | ), 53 | 54 | cortex_usage_daily_history as ( 55 | select 56 | account_name, 57 | usage_date, 58 | 'cortex' as usage_type, 59 | credits_used 60 | from {{ ref('int_cortex_usage_daily_history') }} 61 | ), 62 | 63 | -- Combine the data in long form to allow for easy 64 | -- aggregations and visualizations. 65 | combined as ( 66 | select * from automatic_clustering_history 67 | union all 68 | select * from materialized_view_refresh_history 69 | union all 70 | select * from pipe_usage_history 71 | union all 72 | select * from storage_daily_history 73 | union all 74 | select * from warehouse_metering_history 75 | union all 76 | select * from cortex_usage_daily_history 77 | ) 78 | 79 | select * from combined 80 | -------------------------------------------------------------------------------- /transform/models/marts/state_entities/_state_entities__models.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dim_state_entities__agencies 5 | description: Agency-level state entities. 
6 | columns: 7 | - name: name 8 | description: The name of the state agency 9 | data_tests: 10 | - unique 11 | - not_null 12 | - name: agency_code 13 | description: The numeric code of the state agency 14 | data_tests: 15 | - unique 16 | - not_null 17 | -------------------------------------------------------------------------------- /transform/models/marts/state_entities/dim_state_entities__agencies.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="table") }} 2 | 3 | with 4 | agencies as ( 5 | select 6 | name, 7 | agency_code 8 | from {{ ref("int_state_entities__active") }} 9 | where subagency_code is null and l1 is null 10 | ) 11 | 12 | select * 13 | from agencies 14 | -------------------------------------------------------------------------------- /transform/models/overview.md: -------------------------------------------------------------------------------- 1 | {% docs __overview__ %} 2 | 3 | # CalData dbt Documentation 4 | 5 | Welcome to the CalData Data Services and Engineering `dbt` Snowflake docs. 6 | To go back to the top-level docs, follow [this link](../) 7 | 8 | ## Navigation 9 | 10 | You can use the `Project` and `Database` navigation tabs on the left side of the window to explore the models in your project. 11 | 12 | ### Project Tab 13 | 14 | The Project tab mirrors the directory structure of your dbt project. 15 | In this tab, you can see all of the models defined in your dbt project, as well as models imported from dbt packages. 16 | 17 | ### Database Tab 18 | 19 | The Database tab also exposes your models, but in a format that looks more like a database explorer. 20 | This view shows relations (tables and views) grouped into database schemas. 21 | Note that ephemeral models are not shown in this interface, as they do not exist in the database. 22 | 23 | ## Graph Exploration 24 | 25 | You can click the blue icon on the bottom-right corner of the page to view the lineage graph of your models. 26 | 27 | On model pages, you'll see the immediate parents and children of the model you're exploring. 28 | By clicking the Expand button at the top-right of this lineage pane, 29 | you'll be able to see all of the models that are used to build, or are built from, 30 | the model you're exploring. 31 | 32 | Once expanded, you'll be able to use the `--select` and `--exclude` model selection syntax to filter the models in the graph. 33 | For more information on model selection, check out the [dbt docs](https://docs.getdbt.com/reference/node-selection/syntax). 34 | 35 | Note that you can also right-click on models to interactively filter and explore the graph. 36 | 37 | {% enddocs %} 38 | -------------------------------------------------------------------------------- /transform/models/staging/department_of_finance/stg_department_of_finance__entities.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized="table") }} 2 | 3 | {% set udf_schema = "PUBLIC" %} 4 | 5 | {% call set_sql_header(config) %} 6 | 7 | -- Warning! The SQL header is rendered separately from the rest of the template, 8 | -- so we redefine the udf_schema in this block: 9 | -- https://github.com/dbt-labs/dbt-core/issues/2793 10 | {% set udf_schema = "PUBLIC" %} 11 | 12 | create or replace temp function 13 | {{ udf_schema }}.reorder_name_for_alphabetization(name string) 14 | returns string 15 | language javascript 16 | as 17 | $$ 18 | // Replace fancy quotes with normal ones. 
19 | const name = NAME.replace("’", "'"); 20 | 21 | // Skip some exceptions 22 | const skip = ["Governor's Office"]; 23 | if (skip.includes(name)) { 24 | return name; 25 | } 26 | 27 | // Annoying exceptions 28 | if (name.includes("Milton Marks") && name.includes("Little Hoover")) { 29 | return "Little Hoover Commission"; 30 | } 31 | 32 | // Basic organizational types by which we don't want to organize. 33 | const patterns = [ 34 | "Office of the Secretary (?:for|of)?", 35 | "Commission (?:on|for)?", 36 | "Board of Governors (?:for|of)?", 37 | "Board (?:of|on|for)?", 38 | "Agency (?:on|for)?", 39 | "(?:Department|Dept\\.) of", 40 | "Commission (?:on|for)?", 41 | "Committee (?:on|for)?", 42 | "Bureau of", 43 | "Council on", 44 | "Policy Council on", 45 | "Institute of", 46 | "Office (?:for|of)?", 47 | "Secretary (?:for|of)?", 48 | "", // Empty pattern to catch the prefixes below. 49 | ].map( 50 | // Lots of entities also start with throat clearing like "California this" 51 | // or "State that", which we also want to skip. Some also include a definite 52 | // article after the organizational unit. 53 | (p) => 54 | "(?:California\\s+)?(?:Governor's\\s+)?(?:State\\s+|St\\.\\s+)?(?:Intergovernmental\\s+)?" + 55 | p + 56 | "(?:\\s+the)?" 57 | ); 58 | 59 | const all_patterns = `(${patterns.join("|")})`; 60 | const re = RegExp(`^${all_patterns}\\s*(.+)$`); // \s* because some of the above eat spaces. 61 | const match = name.match(re); 62 | // Empty prefixes are matched, so skip if we don't get a full match. 63 | if (match && match[1] && match[2]) { 64 | return `${match[2].trim()}, ${match[1].trim()}`; 65 | } else { 66 | return name; 67 | } 68 | $$ 69 | ; 70 | 71 | create or replace temp function {{ udf_schema }}.extract_name(name string) 72 | returns string 73 | language javascript 74 | as $$ 75 | const match = NAME.match(/^(.+?)(?:(?:\s*\(.*\)\s*|\s*[-–]+\s*[A-Z/ ]+)*)$/); 76 | if (match && match[1]) { 77 | return match[1]; 78 | } 79 | return NAME; 80 | $$ 81 | ; 82 | {%- endcall %} 83 | 84 | with 85 | base_entities as (select * from {{ source("state_entities", "base_entities") }}), 86 | 87 | invalid_subagencies as ( 88 | select * 89 | from base_entities 90 | where contains("name", 'no subagency') and contains("name", 'do not use') 91 | ), 92 | 93 | entities as ( 94 | select 95 | -- Extract the first portion of the entity as the name. 
The other 96 | -- two (optional) groups match parentheticals and things like 97 | -- "-- DO NOT USE" or " -- DOF USE ONLY" 98 | {{ udf_schema }}.extract_name("name") as name, 99 | coalesce(l3, l2, l1, b, a) as primary_code, 100 | a as agency_code, 101 | case 102 | when b in (select b from invalid_subagencies) then null else b 103 | end as subagency_code, 104 | l1, 105 | l2, 106 | l3, 107 | regexp_substr("name", '\\((.+?)\\)') as parenthetical, 108 | contains(lower("name"), 'do not use') as do_not_use, 109 | contains(lower("name"), 'abolished') as abolished, 110 | regexp_substr("name", '[A-Z/]+ USE ONLY') as restricted_use, 111 | "name" as name_raw 112 | from base_entities 113 | ), 114 | 115 | entities_with_extras as ( 116 | select 117 | *, 118 | {{ udf_schema }}.reorder_name_for_alphabetization(name) as name_alpha, 119 | case 120 | when coalesce(l3, l2, l1, subagency_code) is null 121 | then 'agency' 122 | when coalesce(l3, l2, l1) is null 123 | then 'subagency' 124 | when coalesce(l3, l2) is null 125 | then 'L1' 126 | when l3 is null 127 | then 'L2' 128 | else 'L3' 129 | end as ucm_level 130 | from entities 131 | ) 132 | 133 | select * 134 | from entities_with_extras 135 | -------------------------------------------------------------------------------- /transform/models/staging/department_of_finance/stg_ebudget__budgets.sql: -------------------------------------------------------------------------------- 1 | with 2 | agencies_and_departments as ( 3 | select * 4 | from {{ source('state_entities', 'ebudget_agency_and_department_budgets') }} 5 | ), 6 | 7 | ebudget_budgets as ( 8 | select 9 | "web_agency_cd" as primary_code, 10 | "legal_titl" as name, 11 | "all_budget_year_dols" as budget_year_dollars 12 | from agencies_and_departments 13 | ) 14 | 15 | select * 16 | from ebudget_budgets 17 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_automatic_clustering_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "DATABASE_NAME", 7 | "SCHEMA_NAME", 8 | "TABLE_NAME", 9 | "USAGE_DATE", 10 | ], 11 | ) 12 | }} 13 | 14 | WITH source AS ( 15 | SELECT 16 | num_rows_reclustered, 17 | account_locator, 18 | database_id, 19 | schema_name, 20 | database_name, 21 | table_id, 22 | schema_id, 23 | account_name, 24 | region, 25 | credits_used, 26 | organization_name, 27 | table_name, 28 | usage_date, 29 | num_bytes_reclustered 30 | FROM {{ source('organization_usage', 'automatic_clustering_history') }} 31 | ), 32 | 33 | automatic_clustering_history AS ( 34 | SELECT 35 | organization_name, 36 | account_name, 37 | database_name, 38 | schema_name, 39 | table_name, 40 | usage_date, 41 | sum(credits_used) AS credits_used, 42 | sum(num_rows_reclustered) AS num_rows_reclustered, 43 | sum(num_bytes_reclustered) AS num_bytes_reclustered 44 | FROM source 45 | GROUP BY ALL 46 | ) 47 | 48 | SELECT * 49 | FROM automatic_clustering_history 50 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_cortex_usage_daily_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "USAGE_DATE", 7 | ], 8 | ) 9 | }} 10 | 11 | -- The ORGANIZATION_USAGE schema does not 
provide a specific 12 | -- view on Cortex usage, so we need to get it from the overall 13 | -- metering daily history table. 14 | -- https://docs.snowflake.com/en/user-guide/snowflake-cortex/aisql#track-costs-for-ai-services 15 | WITH source AS ( 16 | SELECT 17 | credits_adjustment_cloud_services, 18 | region, 19 | credits_used, 20 | service_type, 21 | account_locator, 22 | usage_date, 23 | account_name, 24 | credits_billed, 25 | credits_used_cloud_services, 26 | organization_name, 27 | credits_used_compute 28 | FROM {{ source('organization_usage', 'metering_daily_history') }} 29 | WHERE service_type = 'AI_SERVICES' 30 | ), 31 | 32 | metering_daily_history AS ( 33 | SELECT 34 | organization_name, 35 | account_name, 36 | usage_date, 37 | sum(credits_used_compute) AS credits_used_compute, 38 | sum(credits_used_cloud_services) AS credits_used_cloud_services, 39 | sum(credits_adjustment_cloud_services) AS credits_adjustment_cloud_services, 40 | sum(credits_used) AS credits_used, 41 | sum(credits_billed) AS credits_billed 42 | FROM source 43 | GROUP BY organization_name, account_name, usage_date 44 | ) 45 | 46 | SELECT * 47 | FROM metering_daily_history 48 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_data_transfer_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "USAGE_DATE", 7 | "SOURCE_CLOUD", 8 | "SOURCE_REGION", 9 | "TARGET_CLOUD", 10 | "TARGET_REGION", 11 | ], 12 | ) 13 | }} 14 | 15 | WITH source AS ( 16 | SELECT 17 | organization_name, 18 | account_name, 19 | account_locator, 20 | region, 21 | usage_date, 22 | source_cloud, 23 | source_region, 24 | target_cloud, 25 | target_region, 26 | bytes_transferred, 27 | transfer_type 28 | FROM {{ source('organization_usage', 'data_transfer_history') }} 29 | ), 30 | 31 | data_transfer_history AS ( 32 | SELECT 33 | organization_name, 34 | account_name, 35 | usage_date, 36 | source_cloud, 37 | source_region, 38 | target_cloud, 39 | target_region, 40 | sum(bytes_transferred) AS bytes_transferred 41 | FROM source 42 | GROUP BY ALL 43 | ) 44 | 45 | SELECT * 46 | FROM data_transfer_history 47 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_database_storage_usage_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "DATABASE_NAME", 7 | "USAGE_DATE", 8 | ], 9 | ) 10 | }} 11 | 12 | WITH source AS ( 13 | SELECT 14 | region, 15 | average_hybrid_table_storage_bytes, 16 | organization_name, 17 | usage_date, 18 | database_id, 19 | database_name, 20 | account_name, 21 | average_database_bytes, 22 | account_locator, 23 | average_failsafe_bytes 24 | FROM {{ source('organization_usage', 'database_storage_usage_history') }} 25 | ), 26 | 27 | database_storage_usage_history AS ( 28 | SELECT 29 | organization_name, 30 | account_name, 31 | database_name, 32 | usage_date, 33 | AVG(average_hybrid_table_storage_bytes) AS average_hybrid_table_storage_bytes, 34 | AVG(average_database_bytes) AS average_database_bytes, 35 | AVG(average_failsafe_bytes) AS average_failsafe_bytes 36 | FROM source 37 | GROUP BY ALL 38 | ) 39 | 40 | SELECT * 41 | FROM database_storage_usage_history 42 | 
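A note on the pattern shared by these cost-tracking staging models: each is materialized as `incremental` with a composite `unique_key`, so dbt merges the re-aggregated rows into the existing table on every run, and as written each run reads the full `ORGANIZATION_USAGE` view. The sketch below is not code from this repository and the seven-day lookback is an assumed value; it only illustrates how an `is_incremental()` filter could be combined with the same model shape to limit how much history is re-read, relying on the merge key to overwrite any restated recent days.

```sql
-- Minimal sketch, assuming a seven-day lookback window; not part of this
-- project. The shape mirrors stg_database_storage_usage_history above.
{{ config(
    materialized="incremental",
    unique_key=[
        "ORGANIZATION_NAME",
        "ACCOUNT_NAME",
        "DATABASE_NAME",
        "USAGE_DATE",
    ],
) }}

select
    organization_name,
    account_name,
    database_name,
    usage_date,
    avg(average_database_bytes) as average_database_bytes
from {{ source('organization_usage', 'database_storage_usage_history') }}
{% if is_incremental() %}
    -- Re-read a window of recent days; the merge on unique_key deduplicates
    -- the overlap and picks up late-arriving restatements.
    where usage_date >= (select dateadd(day, -7, max(usage_date)) from {{ this }})
{% endif %}
group by all
```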
-------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_materialized_view_refresh_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "DATABASE_NAME", 7 | "SCHEMA_NAME", 8 | "TABLE_NAME", 9 | "USAGE_DATE", 10 | ], 11 | ) 12 | }} 13 | 14 | WITH source AS ( 15 | SELECT 16 | schema_name, 17 | credits_used, 18 | organization_name, 19 | database_id, 20 | schema_id, 21 | table_id, 22 | account_locator, 23 | account_name, 24 | region, 25 | database_name, 26 | table_name, 27 | usage_date 28 | FROM {{ source('organization_usage', 'materialized_view_refresh_history') }} 29 | ), 30 | 31 | materialized_view_refresh_history AS ( 32 | SELECT 33 | organization_name, 34 | account_name, 35 | database_name, 36 | schema_name, 37 | table_name, 38 | usage_date, 39 | sum(credits_used) AS credits_used 40 | FROM source 41 | GROUP BY ALL 42 | ) 43 | 44 | SELECT * 45 | FROM materialized_view_refresh_history 46 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_metering_daily_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "USAGE_DATE", 7 | ], 8 | ) 9 | }} 10 | 11 | WITH source AS ( 12 | SELECT 13 | credits_adjustment_cloud_services, 14 | region, 15 | credits_used, 16 | service_type, 17 | account_locator, 18 | usage_date, 19 | account_name, 20 | credits_billed, 21 | credits_used_cloud_services, 22 | organization_name, 23 | credits_used_compute 24 | FROM {{ source('organization_usage', 'metering_daily_history') }} 25 | ), 26 | 27 | metering_daily_history AS ( 28 | SELECT 29 | organization_name, 30 | account_name, 31 | usage_date, 32 | sum(credits_used_compute) AS credits_used_compute, 33 | sum(credits_used_cloud_services) AS credits_used_cloud_services, 34 | sum(credits_adjustment_cloud_services) AS credits_adjustment_cloud_services, 35 | sum(credits_used) AS credits_used, 36 | sum(credits_billed) AS credits_billed 37 | FROM source 38 | GROUP BY ALL 39 | ) 40 | 41 | SELECT * 42 | FROM metering_daily_history 43 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_pipe_usage_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "PIPE_NAME", 7 | "USAGE_DATE", 8 | ], 9 | ) 10 | }} 11 | 12 | WITH source AS ( 13 | SELECT 14 | region, 15 | organization_name, 16 | bytes_inserted, 17 | files_inserted, 18 | usage_date, 19 | account_locator, 20 | credits_used, 21 | account_name, 22 | pipe_id, 23 | pipe_name 24 | FROM {{ source('organization_usage', 'pipe_usage_history') }} 25 | ), 26 | 27 | pipe_usage_history AS ( 28 | SELECT 29 | organization_name, 30 | account_name, 31 | pipe_name, 32 | usage_date, 33 | sum(bytes_inserted) AS bytes_inserted, 34 | sum(files_inserted) AS files_inserted, 35 | sum(credits_used) AS credits_used 36 | FROM source 37 | GROUP BY ALL 38 | ) 39 | 40 | SELECT * 41 | FROM pipe_usage_history 42 | -------------------------------------------------------------------------------- 
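A brief aside on `group by all`, which appears throughout these staging models: it is Snowflake syntax that groups by every non-aggregated expression in the select list. For the pipe usage model above, the final aggregation is equivalent to spelling out the grouping columns explicitly:

```sql
-- Explicit equivalent of the `group by all` in stg_pipe_usage_history;
-- `source` refers to the CTE defined in that model.
select
    organization_name,
    account_name,
    pipe_name,
    usage_date,
    sum(bytes_inserted) as bytes_inserted,
    sum(files_inserted) as files_inserted,
    sum(credits_used) as credits_used
from source
group by organization_name, account_name, pipe_name, usage_date
```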
/transform/models/staging/snowflake_cost_tracking/stg_stage_storage_usage_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "USAGE_DATE", 7 | ], 8 | ) 9 | }} 10 | 11 | WITH source AS ( 12 | SELECT 13 | organization_name, 14 | account_locator, 15 | account_name, 16 | region, 17 | usage_date, 18 | average_stage_bytes 19 | FROM {{ source('organization_usage', 'stage_storage_usage_history') }} 20 | ), 21 | 22 | stage_storage_usage_history AS ( 23 | SELECT 24 | organization_name, 25 | account_name, 26 | usage_date, 27 | avg(average_stage_bytes) AS average_stage_bytes 28 | FROM source 29 | GROUP BY ALL 30 | ) 31 | 32 | SELECT * 33 | FROM stage_storage_usage_history 34 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_storage_daily_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "USAGE_DATE", 7 | ], 8 | ) 9 | }} 10 | 11 | WITH source AS ( 12 | SELECT 13 | organization_name, 14 | account_name, 15 | account_locator, 16 | region, 17 | usage_date, 18 | service_type, 19 | average_bytes, 20 | credits 21 | FROM {{ source('organization_usage', 'storage_daily_history') }} 22 | ), 23 | 24 | storage_daily_history AS ( 25 | SELECT 26 | organization_name, 27 | account_name, 28 | usage_date, 29 | avg(average_bytes) AS average_bytes, 30 | sum(credits) AS credits_used 31 | FROM source 32 | GROUP BY ALL 33 | ) 34 | 35 | SELECT * 36 | FROM storage_daily_history 37 | -------------------------------------------------------------------------------- /transform/models/staging/snowflake_cost_tracking/stg_warehouse_metering_history.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized="incremental", 3 | unique_key=[ 4 | "ORGANIZATION_NAME", 5 | "ACCOUNT_NAME", 6 | "WAREHOUSE_NAME", 7 | "USAGE_DATE", 8 | ], 9 | ) 10 | }} 11 | 12 | WITH source AS ( 13 | SELECT 14 | account_name, 15 | warehouse_id, 16 | credits_used, 17 | credits_used_compute, 18 | region, 19 | start_time, 20 | credits_used_cloud_services, 21 | warehouse_name, 22 | organization_name, 23 | service_type, 24 | account_locator, 25 | end_time 26 | FROM {{ source('organization_usage', 'warehouse_metering_history') }} 27 | ), 28 | 29 | warehouse_metering_history AS ( 30 | SELECT 31 | organization_name, 32 | account_name, 33 | warehouse_name, 34 | to_date(start_time) AS usage_date, 35 | sum(credits_used) AS credits_used, 36 | sum(credits_used_compute) AS credits_used_compute, 37 | sum(credits_used_cloud_services) AS credits_used_cloud_services 38 | FROM source 39 | GROUP BY ALL 40 | ) 41 | 42 | SELECT * 43 | FROM warehouse_metering_history 44 | -------------------------------------------------------------------------------- /transform/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.3.0 4 | sha1_hash: 226ae69cdfbc9367e2aa2c472b01f99dbce11de0 5 | -------------------------------------------------------------------------------- /transform/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.3.0 4 
| -------------------------------------------------------------------------------- /transform/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/transform/seeds/.gitkeep -------------------------------------------------------------------------------- /transform/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/transform/snapshots/.gitkeep -------------------------------------------------------------------------------- /transform/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cagov/data-infrastructure/082f8bc84a3f0bff31595e2aaa9b72c68ca09f13/transform/tests/.gitkeep --------------------------------------------------------------------------------
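`packages.yml` pins `dbt-labs/dbt_utils` 1.3.0, and `package-lock.yml` records the resolved version and hash for reproducible installs. As a purely illustrative sketch (this model, its name, and the surrogate-key column are not part of the repository), a downstream model could call one of the package's macros against the long-form cost mart defined earlier:

```sql
-- Illustration only: add a surrogate key to the long-form cost data using a
-- macro from the pinned dbt_utils package. Nothing here exists in the repo.
select
    {{ dbt_utils.generate_surrogate_key(["account_name", "usage_date", "usage_type"]) }}
        as cost_row_key,
    account_name,
    usage_date,
    usage_type,
    credits_used
from {{ ref("snowflake_costs_by_date") }}
```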