├── .Rbuildignore ├── .gitattributes ├── .github ├── .gitignore ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ ├── maintenance.md │ └── scientific-improvement.md ├── PULL_REQUEST_TEMPLATE │ ├── pull_request_template.md │ └── vulnerability.md ├── dependabot.yml ├── scripts │ ├── cleanup-on-pr-close.sh │ ├── create_pool.py │ ├── delete-container-tag.sh │ └── docker_build_and_push.sh └── workflows │ ├── block-fixup.yaml │ ├── check-news-md.yaml │ ├── cleanup-on-pr-close.yaml │ ├── containers-and-az-pool.yaml │ ├── delete-container-tag.yaml │ ├── format-suggest.yaml │ ├── gh-act │ ├── 2-dry.sh │ └── 2-full.sh │ ├── manual-docker-prune.yml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ ├── r-cmd-check.yaml │ ├── start-app-job.yaml │ └── test-coverage.yaml ├── .gitignore ├── .lintr ├── .pre-commit-config.yaml ├── .secrets.baseline ├── CONTRIBUTING.md ├── DESCRIPTION ├── DISCLAIMER.md ├── Dockerfile ├── LICENSE.md ├── Makefile ├── NAMESPACE ├── NEWS.md ├── R ├── azure.R ├── config.R ├── data.R ├── diagnostics.R ├── exclusions.R ├── fit_model.R ├── parameters.R ├── pipeline.R ├── read_data.R ├── utils.R └── write_output.R ├── README.md ├── SOP.md ├── _pkgdown.yml ├── air.toml ├── azure ├── generate_configs.py ├── generate_rerun_configs.py ├── job.py └── requirements.txt ├── code-of-conduct.md ├── codecov.yml ├── container-app-jobs ├── README.md ├── blob-config-runner │ ├── config.ini │ ├── requirements.txt │ └── start-jobs.py └── job-template.yaml ├── data-raw ├── convert_gostic_toy_rt_to_test_dataset.R └── sir_gt_pmf.R ├── data ├── gostic_toy_rt.rda └── sir_gt_pmf.rda ├── image.png ├── man ├── Config.Rd ├── Data.Rd ├── Exclusions.Rd ├── Interval.Rd ├── Parameters.Rd ├── apply_exclusions.Rd ├── download_file_from_container.Rd ├── download_if_specified.Rd ├── extract_diagnostics.Rd ├── fetch_blob_container.Rd ├── fetch_credential_from_env_var.Rd ├── fit_model.Rd ├── format_stan_opts.Rd ├── gostic_toy_rt.Rd ├── low_case_count_diagnostic.Rd ├── opts_formatter.Rd ├── pipeline.Rd ├── read_data.Rd ├── read_disease_parameters.Rd ├── read_exclusions.Rd ├── read_interval_pmf.Rd ├── read_json_into_config.Rd ├── sample_processing_functions.Rd ├── sir_gt_pmf.Rd ├── write_model_outputs.Rd └── write_output_dir_structure.Rd ├── open_practices.md ├── rules_of_behavior.md ├── start.sh ├── tests ├── testthat.R └── testthat │ ├── _snaps │ ├── fit_model.md │ ├── parameters.md │ └── read_data.md │ ├── data │ ├── 2025-04-02_test.parquet │ ├── CA_COVID-19.json │ ├── CA_test.parquet │ ├── bad_config.json │ ├── sample_config_no_exclusion.json │ ├── sample_config_with_exclusion.json │ ├── sample_fit.rds │ ├── test_big_exclusions.csv │ ├── test_data.parquet │ ├── test_exclusions.csv │ ├── test_parameters.parquet │ ├── us_overall_test_data.parquet │ └── v_bad_config.json │ ├── helper-expect_pipeline_files_written.R │ ├── helper-write_exclusion.R │ ├── helper-write_parameter_file.R │ ├── test-diagnostics.R │ ├── test-exclusions.R │ ├── test-fit_model.R │ ├── test-parameters.R │ ├── test-pipeline.R │ ├── test-read_data.R │ └── test-write_output.R ├── thanks.md └── utils └── Rt_review_exclusions.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | $Dockerfile-batch^ 2 | ^.env$ 3 | ^CONTRIBUTING.md$ 4 | ^DISCLAIMER.md$ 5 | ^Dockerfile$ 6 | ^Dockerfile-batch$ 7 | ^Dockerfile-dependencies$ 8 | ^Dockerfile.unified$ 9 | ^LICENSE.md$ 10 | ^Makefile$ 11 | ^SOP.md$ 12 | ^[\.]?air\.toml$ 13 | ^\.github$ 14 | ^\.lintr$ 15 | ^\.pre-commit-config\.yaml$ 16 | 
^\.secrets.baseline$ 17 | ^\.vscode$ 18 | ^_pkgdown\.yml$ 19 | ^azure$ 20 | ^batch-autoscale-formula.txt$ 21 | ^code-of-conduct.md$ 22 | ^codecov\.yml$ 23 | ^container-app-jobs$ 24 | ^data-raw$ 25 | ^docs$ 26 | ^image.png$ 27 | ^open_practices.md$ 28 | ^pkgdown$ 29 | ^rules_of_behavior.md$ 30 | ^start.sh$ 31 | ^thanks.md$ 32 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | NEWS.md merge=union 2 | 3 | # Normal text let sit to auto 4 | *.htm text 5 | *.html text 6 | *.css text 7 | *.js text 8 | 9 | ## Declare files that will always have LF (aka \n aka 10 aka 0x0a) line endings on checkout. 10 | *.sh text eol=lf 11 | *.md text eol=lf 12 | *.json text eol=lf 13 | *.yml text eol=lf 14 | *.csv text eol=lf 15 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what feature is not working. 12 | 13 | **Impact** 14 | Please describe the impact this bug is causing to your program or organization. 15 | 16 | **To Reproduce** 17 | Steps to reproduce the behavior: 18 | 1. Go to '...' 19 | 2. Click on '....' 20 | 3. Scroll down to '....' 21 | 4. See error 22 | 23 | **Expected behavior** 24 | A clear and concise description of what you expected to happen. 25 | 26 | **Screenshots** 27 | If applicable, add screenshots to help explain your problem. 28 | 29 | **Logs** 30 | If applicable, please attach logs to help describe your problem. 31 | 32 | **Desktop (please complete the following information):** 33 | - OS: [e.g. iOS] 34 | - Browser [e.g. chrome, safari] 35 | - Version [e.g. 22] 36 | 37 | **Smartphone (please complete the following information):** 38 | - Device: [e.g. iPhone6] 39 | - OS: [e.g. iOS8.1] 40 | - Browser [e.g. stock browser, safari] 41 | - Version [e.g. 22] 42 | 43 | **Additional context** 44 | Add any other context about the problem here. 45 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/maintenance.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Maintenance 3 | about: Questions and requests related to organizational support and maintenance 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **What type of help do you need?** 11 | 12 | * [ ] Question 13 | * [ ] New Repo 14 | * [ ] Delete Repo 15 | * [ ] User Membership (please make sure new members are familiar with the [CDC open practices](https://github.com/CDCgov/template/blob/master/open_practices.md#profile-setup) and set up their profile with name and org info to help people collaborate with them) 16 | * [ ] Other 17 | 18 | **Please describe how you'd like us to help.** 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/scientific-improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Scientific improvement 3 | about: Suggest a way to improve an existing tool or pipeline 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Describe the improvement that needs to be made 11 | (e.g. update a parameter estimate, tweak the prior, modify the model) 12 | 13 | ## Provide links to references to methods or data sources 14 | 15 | ## Describe the changes expected to the model's outputs 16 | 17 | ## Suggest new tests that will need to be implemented 18 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/pull_request_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | --- 8 | 9 | **Please describe the bug this fixes or the feature this adds.** 10 | 11 | **Please describe how you tested this change. Include unit tests whenever possible.** 12 | 13 | **Did you create or modify any associated documentation with this change? If documentation is not included in PR, please link to related documentation.** 14 | 15 | **If you added or modified HTML, did you check that it was 508 compliant?** 16 | 17 | **Please tag any specific reviewers you would like to review this PR** 18 | 19 | **Please include the following checks for open source contributing?** 20 | 21 | * [ ] Did you check for sensitive data, and remove any? 22 | * [ ] Are additional approvals needed for this change? 23 | * [ ] Are there potential vulnerabilities or licensing issues with any new dependencies introduced? 24 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/vulnerability.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Vulnerability Maintenance 3 | about: Routine updates to address vulnerabilities. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **What vulnerabilities does this PR remove or update?** 11 | 12 | **Have you tested to make sure these updates do not cause unintended consequences?** 13 | 14 | **Are these patch updates? minor? 
major?** 15 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | ignore: 13 | - dependency-name: "*" 14 | update-types: ["version-update:semver-patch"] 15 | -------------------------------------------------------------------------------- /.github/scripts/cleanup-on-pr-close.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Delete Batch Pools and associated jobs 4 | 5 | if [ "${#}" -ne 3 ]; then 6 | echo "Usage: $0 " 7 | exit 1 8 | fi 9 | 10 | ACCOUNT_NAME="$1" 11 | RESOURCE_GROUP="$2" 12 | POOL_ID="$3" 13 | 14 | echo "Logging into Batch account" 15 | az batch account login \ 16 | --name "${ACCOUNT_NAME}" \ 17 | --resource-group "${RESOURCE_GROUP}" 18 | 19 | ########################## 20 | # Fetch & delete jobs 21 | 22 | echo "Fetching jobs in pool ${POOL_ID}" 23 | 24 | JOB_IDS=$(az batch job list --query "[?poolInfo.poolId=='$POOL_ID'].id" --output tsv) 25 | 26 | if [ -z "${JOB_IDS}" ]; then 27 | echo "No jobs found in pool: ${POOL_ID}" 28 | else 29 | # Iterate line-by-line over the tsv list 30 | echo "${JOB_IDS}" | while IFS= read -r JOB_ID; do 31 | echo "Deleting job ${JOB_ID}" 32 | az batch job delete --job-id "${JOB_ID}" --yes 33 | done 34 | fi 35 | 36 | ########################## 37 | # Delete pool 38 | 39 | az batch pool delete --pool-id "${POOL_ID}" --yes 2>/dev/null || { 40 | echo "Pool ${POOL_ID} does not exist or has already been deleted" 41 | } 42 | -------------------------------------------------------------------------------- /.github/scripts/create_pool.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.13" 3 | # dependencies = [ 4 | # "azure-batch==14.2", 5 | # "azure-identity==1.21", 6 | # "azure-mgmt-batch==18.0", 7 | # "msrest==0.7", 8 | # ] 9 | # /// 10 | """ 11 | If running locally, use: 12 | uv run --env-file .env .github/scripts/create_pool.py 13 | Requires a `.env` file with at least the following: 14 | BATCH_ACCOUNT="" 15 | SUBSCRIPTION_ID="" 16 | BATCH_USER_ASSIGNED_IDENTITY="" 17 | AZURE_BATCH_ACCOUNT_CLIENT_ID="" 18 | PRINCIPAL_ID="" 19 | CONTAINER_REGISTRY_SERVER="" 20 | CONTAINER_IMAGE_NAME="https://full-cr-server/:tag" 21 | POOL_ID="" 22 | SUBNET_ID="" 23 | RESOURCE_GROUP="" 24 | 25 | If running in CI, all of the above environment variables should be set in the repo 26 | secrets. 27 | """ 28 | 29 | import os 30 | 31 | from azure.identity import DefaultAzureCredential 32 | from azure.mgmt.batch import BatchManagementClient 33 | 34 | AUTO_SCALE_FORMULA = """ 35 | // In this example, the pool size 36 | // is adjusted based on the number of tasks in the queue. 37 | // Note that both comments and line breaks are acceptable in formula strings. 38 | 39 | // Get pending tasks for the past 5 minutes. 
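// Illustrative walk-through (added note, not part of the original formula): with full
// sample coverage and 20 pending tasks, $tasks evaluates to 20, so the pool scales to
// min(20, 100) = 20 dedicated nodes; once the queue is empty, the target halves at each
// 5-minute evaluation until it reaches 0.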
40 | $samples = $ActiveTasks.GetSamplePercent(TimeInterval_Minute * 5); 41 | // If we have fewer than 70 percent data points, we use the last sample point, otherwise we use the maximum of last sample point and the history average. 42 | $tasks = $samples < 70 ? max(0, $ActiveTasks.GetSample(1)) : 43 | max( $ActiveTasks.GetSample(1), avg($ActiveTasks.GetSample(TimeInterval_Minute * 5))); 44 | // If number of pending tasks is not 0, set targetVM to pending tasks, otherwise half of current dedicated. 45 | $targetVMs = $tasks > 0 ? $tasks : max(0, $TargetDedicatedNodes / 2); 46 | // The pool size is capped at 100, if target VM value is more than that, set it to 100. 47 | cappedPoolSize = 100; 48 | $TargetDedicatedNodes = max(0, min($targetVMs, cappedPoolSize)); 49 | // Set node deallocation mode - keep nodes active only until tasks finish 50 | $NodeDeallocationOption = taskcompletion; 51 | """ 52 | 53 | 54 | def main() -> None: 55 | # Create the BatchManagementClient 56 | batch_mgmt_client = BatchManagementClient( 57 | credential=DefaultAzureCredential(), 58 | subscription_id=os.environ["SUBSCRIPTION_ID"], 59 | ) 60 | 61 | # Assemble the pool parameters 62 | pool_parameters = { 63 | "identity": { 64 | "type": "UserAssigned", 65 | "userAssignedIdentities": { 66 | os.environ["BATCH_USER_ASSIGNED_IDENTITY"]: { 67 | "clientId": os.environ["AZURE_BATCH_ACCOUNT_CLIENT_ID"], 68 | "principalId": os.environ["PRINCIPAL_ID"], 69 | } 70 | }, 71 | }, 72 | "properties": { 73 | "vmSize": "STANDARD_d4d_v5", 74 | "interNodeCommunication": "Disabled", 75 | "taskSlotsPerNode": 1, 76 | "taskSchedulingPolicy": {"nodeFillType": "Spread"}, 77 | "deploymentConfiguration": { 78 | "virtualMachineConfiguration": { 79 | "imageReference": { 80 | "publisher": "microsoft-dsvm", 81 | "offer": "ubuntu-hpc", 82 | "sku": "2204", 83 | "version": "latest", 84 | }, 85 | "nodeAgentSkuId": "batch.node.ubuntu 22.04", 86 | "containerConfiguration": { 87 | "type": "dockercompatible", 88 | "containerImageNames": [os.environ["CONTAINER_IMAGE_NAME"]], 89 | "containerRegistries": [ 90 | { 91 | "identityReference": { 92 | "resourceId": os.environ[ 93 | "BATCH_USER_ASSIGNED_IDENTITY" 94 | ] 95 | }, 96 | "registryServer": os.environ[ 97 | "CONTAINER_REGISTRY_SERVER" 98 | ], 99 | } 100 | ], 101 | }, 102 | } 103 | }, 104 | "networkConfiguration": { 105 | "subnetId": os.environ["SUBNET_ID"], 106 | "publicIPAddressConfiguration": {"provision": "NoPublicIPAddresses"}, 107 | "dynamicVnetAssignmentScope": "None", 108 | }, 109 | "scaleSettings": { 110 | "autoScale": { 111 | "evaluationInterval": "PT5M", 112 | "formula": AUTO_SCALE_FORMULA, 113 | } 114 | }, 115 | "resizeOperationStatus": { 116 | "targetDedicatedNodes": 1, 117 | "nodeDeallocationOption": "Requeue", 118 | "resizeTimeout": "PT15M", 119 | "startTime": "2023-07-05T13:18:25.7572321Z", 120 | }, 121 | "currentDedicatedNodes": 0, 122 | "currentLowPriorityNodes": 0, 123 | "targetNodeCommunicationMode": "Simplified", 124 | "currentNodeCommunicationMode": "Simplified", 125 | }, 126 | } 127 | 128 | batch_mgmt_client.pool.create( 129 | resource_group_name=os.environ["RESOURCE_GROUP"], 130 | account_name=os.environ["BATCH_ACCOUNT"], 131 | pool_name=os.environ["POOL_ID"], 132 | parameters=pool_parameters, 133 | ) 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /.github/scripts/delete-container-tag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 
| # 3 | # Delete container tags from Azure CR 4 | 5 | if [ "${#}" -ne 3 ]; then 6 | echo "Usage: $0 " 7 | exit 1 8 | fi 9 | 10 | REGISTRY="$1" 11 | IMAGE="$2" 12 | TAG="$3" 13 | 14 | ########################## 15 | # Delete container tags 16 | 17 | # Remove the image from the registry 18 | az acr repository delete \ 19 | --yes \ 20 | --name "${REGISTRY}" \ 21 | --image "${IMAGE}:${TAG}" 22 | -------------------------------------------------------------------------------- /.github/scripts/docker_build_and_push.sh: -------------------------------------------------------------------------------- 1 | IMAGE="ghcr.io/cdcgov/$1" 2 | TAG=$2 3 | BUILDER=docker-container-driver-builder 4 | 5 | # create a builder with the docker-container driver to allow cache-export 6 | docker buildx create --name "$BUILDER" --driver=docker-container || true 7 | 8 | # use the registry cache for prior images of the same tag, or the 'latest' tag 9 | time docker buildx build --push -t "$IMAGE:$TAG" \ 10 | --builder "$BUILDER" \ 11 | --cache-from "type=registry,ref=$IMAGE:$TAG-cache" \ 12 | --cache-from "type=registry,ref=$IMAGE:latest-cache" \ 13 | --cache-to "type=registry,ref=$IMAGE:$TAG-cache,mode=max" \ 14 | -f Dockerfile . 15 | -------------------------------------------------------------------------------- /.github/workflows/block-fixup.yaml: -------------------------------------------------------------------------------- 1 | name: Block Fix-up (Git Check) 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | block-fixup: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v4 11 | - name: Block Fixup Commit Merge 12 | uses: 13rac1/block-fixup-merge-action@v2.0.0 13 | -------------------------------------------------------------------------------- /.github/workflows/check-news-md.yaml: -------------------------------------------------------------------------------- 1 | # All PRs into main MUST be deliberately labelled in the NEWS.md with a succint but informative entry 2 | # describing the changes made - this workflow checks to make sure that this has been done. 3 | 4 | name: Check NEWS.md Update 5 | 6 | on: 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | check-news-md-modification: 13 | 14 | name: Check NEWS.md modification 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - name: Checkout code 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 # Fetch all history for all tags and branches 23 | 24 | - name: Check for NEWS.md changes 25 | run: | 26 | echo "Current SHA: $GITHUB_SHA" 27 | echo "Base SHA: ${{ github.event.pull_request.base.sha }}" 28 | 29 | git fetch origin ${{ github.event.pull_request.base.ref }} 30 | 31 | CHANGED_FILES=$(git diff --name-only $GITHUB_SHA $(git merge-base $GITHUB_SHA origin/${{ github.event.pull_request.base.ref }})) 32 | echo "Changed files:" 33 | echo "$CHANGED_FILES" 34 | 35 | if echo "$CHANGED_FILES" | grep -q "NEWS.md"; then 36 | echo "NEWS.md has been modified." 37 | else 38 | echo "::error file=NEWS.md,line=1,col=5::NEWS.md must be updated with each PR." >&2 39 | exit 1 40 | fi 41 | shell: /usr/bin/bash -e {0} 42 | -------------------------------------------------------------------------------- /.github/workflows/cleanup-on-pr-close.yaml: -------------------------------------------------------------------------------- 1 | name: Tear down Batch pool 2 | 3 | on: 4 | pull_request: 5 | types: 6 | - closed 7 | workflow_dispatch: 8 | inputs: 9 | tag: 10 | description: The name of the tag to delete. Usually the branch name. 
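# Example (illustrative): this teardown can also be run by hand for a branch with
#   gh workflow run cleanup-on-pr-close.yaml -f tag=<branch-name>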
11 | type: string 12 | 13 | env: 14 | IMAGE_TAG: ${{ inputs.tag || github.head_ref || github.ref_name }} 15 | # getting tag from input or branch name https://stackoverflow.com/a/71158878 16 | 17 | jobs: 18 | 19 | delete-pool: 20 | environment: production 21 | permissions: 22 | id-token: write 23 | runs-on: ubuntu-latest 24 | name: Delete Batch pool 25 | 26 | steps: 27 | - name: Protect 'latest' 28 | run: | 29 | if [ "${{ env.IMAGE_TAG }}" = "latest" ]; then 30 | echo "Cannot delete pool for 'latest'" 31 | exit 1 32 | fi 33 | 34 | # From: https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/configuring-openid-connect-in-cloud-providers#requesting-the-jwt-using-the-actions-core-toolkit 35 | - name: Install OIDC Client from Core Package 36 | run: npm install @actions/core@1.6.0 @actions/http-client 37 | - name: Get Id Token 38 | uses: actions/github-script@v7 39 | id: idtoken 40 | with: 41 | script: | 42 | const coredemo = require('@actions/core') 43 | const id_token = await coredemo.getIDToken('api://AzureADTokenExchange') 44 | coredemo.setOutput('id_token', id_token) 45 | 46 | - name: Delete pool 47 | uses: CDCgov/cfa-actions/runner-action@v1.4.0 48 | with: 49 | github_app_id: ${{ secrets.CDCENT_ACTOR_APP_ID }} 50 | github_app_pem: ${{ secrets.CDCENT_ACTOR_APP_PEM }} 51 | wait_for_completion: true 52 | print_logs: true 53 | script: | 54 | CURRENT_BRANCH='${{ github.event.pull_request.head.sha || github.ref_name }}' 55 | echo "Cloning repo at commit '$CURRENT_BRANCH'" 56 | git clone https://github.com/${{ github.repository }}.git 57 | cd ${{ github.event.repository.name }} 58 | git checkout $CURRENT_BRANCH 59 | 60 | echo "Logging into Azure CLI" 61 | az login --service-principal \ 62 | --username ${{ secrets.AZURE_NNHT_SP_CLIENT_ID }} \ 63 | --tenant ${{ secrets.TENANT_ID }} \ 64 | --federated-token ${{ steps.idtoken.outputs.id_token }} \ 65 | --output none 66 | 67 | echo "Running cleanup pool script" 68 | bash .github/scripts/cleanup-on-pr-close.sh \ 69 | "${{ secrets.BATCH_ACCOUNT }}" \ 70 | "${{ secrets.PRD_RESOURCE_GROUP }}" \ 71 | "cfa-epinow2-${{ env.IMAGE_TAG }}" 72 | -------------------------------------------------------------------------------- /.github/workflows/delete-container-tag.yaml: -------------------------------------------------------------------------------- 1 | name: Delete tag from container registries 2 | 3 | on: 4 | pull_request: 5 | types: [closed] 6 | workflow_dispatch: 7 | inputs: 8 | tag: 9 | description: The name of the tag to delete. Usually the branch name. 
10 | type: string 11 | 12 | env: 13 | IMAGE_NAME: cfa-epinow2-pipeline 14 | IMAGE_TAG: ${{ inputs.tag || github.head_ref || github.ref_name }} 15 | # getting tag from input or branch name https://stackoverflow.com/a/71158878 16 | 17 | jobs: 18 | delete-tag-ghcr: 19 | continue-on-error: true # allow other tag deletion to happen even if one fails 20 | permissions: 21 | packages: write 22 | runs-on: ubuntu-latest 23 | name: Delete image tag from GHCR 24 | 25 | steps: 26 | # Deleting a package from GHCR by tag name is surprising complex 27 | # This action has been approved for use on cdcent/cdcgov by the CDC Github Team 28 | # https://github.com/snok/container-retention-policy 29 | - name: Delete image tag 30 | uses: snok/container-retention-policy@v3.0.0 31 | with: 32 | account: ${{ github.repository_owner }} 33 | token: ${{ secrets.GITHUB_TOKEN }} 34 | image-names: ${{ env.IMAGE_NAME }} 35 | image-tags: ${{ env.IMAGE_TAG }},${{ env.IMAGE_TAG }}-cache 36 | cut-off: 1s # required, minimum package age to be a candidate for deletion 37 | 38 | delete-tag-acr: 39 | environment: production 40 | continue-on-error: true # allow other tag deletion to happen even if one fails 41 | permissions: 42 | id-token: write 43 | runs-on: ubuntu-latest 44 | name: Delete image tag from ACR 45 | 46 | steps: 47 | - name: Protect 'latest' 48 | run: | 49 | if [ "${{ env.IMAGE_TAG }}" = "latest" ]; then 50 | echo "Cannot delete pool for 'latest'" 51 | exit 1 52 | fi 53 | 54 | # From: https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/configuring-openid-connect-in-cloud-providers#requesting-the-jwt-using-the-actions-core-toolkit 55 | - name: Install OIDC Client from Core Package 56 | run: npm install @actions/core@1.6.0 @actions/http-client 57 | - name: Get Id Token 58 | uses: actions/github-script@v7 59 | id: idtoken 60 | with: 61 | script: | 62 | const coredemo = require('@actions/core') 63 | const id_token = await coredemo.getIDToken('api://AzureADTokenExchange') 64 | coredemo.setOutput('id_token', id_token) 65 | 66 | - name: Delete ACR tag 67 | uses: CDCgov/cfa-actions/runner-action@v1.4.0 68 | with: 69 | github_app_id: ${{ secrets.CDCENT_ACTOR_APP_ID }} 70 | github_app_pem: ${{ secrets.CDCENT_ACTOR_APP_PEM }} 71 | wait_for_completion: true 72 | print_logs: true 73 | script: | 74 | CURRENT_BRANCH='${{ github.event.pull_request.head.sha || github.ref_name }}' 75 | echo "Cloning repo at commit '$CURRENT_BRANCH'" 76 | git clone https://github.com/${{ github.repository }}.git 77 | cd ${{ github.event.repository.name }} 78 | git checkout $CURRENT_BRANCH 79 | 80 | echo "Logging into Azure CLI" 81 | az login --service-principal \ 82 | --username ${{ secrets.AZURE_NNHT_SP_CLIENT_ID }} \ 83 | --tenant ${{ secrets.TENANT_ID }} \ 84 | --federated-token ${{ steps.idtoken.outputs.id_token }} \ 85 | --output none 86 | 87 | echo "Running delete tag script" 88 | bash .github/scripts/delete-container-tag.sh \ 89 | ${{ secrets.CONTAINER_REGISTRY_URL }} \ 90 | ${{ env.IMAGE_NAME }} \ 91 | ${{ env.IMAGE_TAG }} 92 | -------------------------------------------------------------------------------- /.github/workflows/format-suggest.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/posit-dev/setup-air/tree/main/examples 2 | on: 3 | pull_request: 4 | 5 | name: format-suggest.yaml 6 | 7 | permissions: read-all 8 | 9 | jobs: 10 | format-suggest: 11 | name: format-suggest 12 | runs-on: ubuntu-latest 13 | 
permissions: 14 | pull-requests: write 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Install 19 | uses: posit-dev/setup-air@v1 20 | 21 | - name: Format 22 | run: air format . 23 | 24 | - name: Suggest 25 | uses: reviewdog/action-suggester@v1 26 | with: 27 | level: error 28 | fail_level: error 29 | tool_name: air 30 | -------------------------------------------------------------------------------- /.github/workflows/gh-act/2-dry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Requires that you have first run 'gh extension install nektos/gh-act' 4 | # as well as having installed the docker engine and added your user to the docker group 5 | 6 | # This checks syntax before you push to Github Actions, helping with debug hell 7 | # To run the entire pipeline locally, see 2-full.sh 8 | 9 | gh act -P cfa-cdcgov=... -n -W '.github/workflows/2-Run-Epinow2-Pipeline.yaml' 10 | -------------------------------------------------------------------------------- /.github/workflows/gh-act/2-full.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Requires that you have first run 'gh extension install nektos/gh-act' 4 | # as well as having installed the docker engine and added your user to the docker group 5 | 6 | # This runs the github actions workflow locally 7 | 8 | gh act -P cfa-cdcgov=catthehacker/ubuntu:full-20.04 -W '.github/workflows/2-Run-Epinow2-Pipeline.yaml' 9 | -------------------------------------------------------------------------------- /.github/workflows/manual-docker-prune.yml: -------------------------------------------------------------------------------- 1 | name: Manual Docker System Prune (CDCgov) 2 | on: 3 | workflow_dispatch: 4 | 5 | jobs: 6 | docker-system-prune: 7 | runs-on: cfa-cdcgov 8 | 9 | steps: 10 | - run: docker system prune --all --force --volumes 11 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown website 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | pull-requests: write 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - uses: r-lib/actions/setup-pandoc@v2 29 | 30 | - uses: r-lib/actions/setup-r@v2 31 | with: 32 | use-public-rspm: true 33 | r-version: "4.4.3" 34 | 35 | - uses: r-lib/actions/setup-r-dependencies@v2 36 | with: 37 | extra-packages: any::pkgdown, local::. 
38 | needs: website 39 | 40 | - name: Build site 41 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 42 | shell: Rscript {0} 43 | 44 | - name: Save artifact 45 | if: ${{ github.event_name == 'pull_request' }} 46 | uses: actions/upload-artifact@v4 47 | with: 48 | name: pkgdown-site 49 | path: ./docs 50 | retention-days: 7 51 | 52 | - name: Post to PR 53 | uses: CDCgov/cfa-actions/post-artifact@v1.4.0 54 | if: ${{ github.event_name == 'pull_request' }} 55 | with: 56 | artifact-name: pkgdown-site 57 | gh-token: ${{ secrets.GITHUB_TOKEN }} 58 | 59 | - name: Deploy to GitHub pages 🚀 60 | if: github.event_name != 'pull_request' 61 | uses: JamesIves/github-pages-deploy-action@v4.7.3 62 | with: 63 | clean: false 64 | branch: gh-pages 65 | folder: docs 66 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | issue_comment: 5 | types: [created] 6 | workflow_dispatch: 7 | 8 | name: PR Commands 9 | 10 | jobs: 11 | document: 12 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER' || github.event.comment.author_association == 'CONTRIBUTOR' || github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'USER') && startsWith(github.event.comment.body, '/document') }} 13 | name: document 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | steps: 18 | - uses: actions/checkout@v4 19 | with: 20 | persist-credentials: false 21 | 22 | - uses: r-lib/actions/pr-fetch@v2 23 | with: 24 | repo-token: ${{ secrets.GITHUB_TOKEN }} 25 | 26 | - uses: r-lib/actions/setup-r@v2 27 | with: 28 | use-public-rspm: true 29 | 30 | - uses: r-lib/actions/setup-r-dependencies@v2 31 | with: 32 | extra-packages: any::roxygen2 33 | needs: pr-document 34 | 35 | - name: Document 36 | run: roxygen2::roxygenise() 37 | shell: Rscript {0} 38 | 39 | - name: commit 40 | run: | 41 | git config --local user.name "$GITHUB_ACTOR" 42 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 43 | git add man/\* NAMESPACE 44 | git commit -m 'Document' 45 | 46 | - uses: r-lib/actions/pr-push@v2 47 | with: 48 | repo-token: ${{ secrets.ZS_PAT }} 49 | 50 | style: 51 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER' || github.event.comment.author_association == 'CONTRIBUTOR' || github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'USER') && startsWith(github.event.comment.body, '/style') }} 52 | name: style 53 | runs-on: ubuntu-latest 54 | env: 55 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 56 | steps: 57 | - uses: actions/checkout@v4 58 | with: 59 | persist-credentials: false 60 | 61 | - uses: r-lib/actions/pr-fetch@v2 62 | with: 63 | repo-token: ${{ secrets.GITHUB_TOKEN }} 64 | 65 | - uses: r-lib/actions/setup-r@v2 66 | 67 | - name: Install dependencies 68 | run: install.packages("styler") 69 | shell: Rscript {0} 70 | 71 | - name: Style 72 | run: styler::style_pkg() 73 | shell: Rscript {0} 74 | 75 | - name: commit 76 | run: | 77 | git config --local 
user.name "$GITHUB_ACTOR" 78 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 79 | git add \*.R 80 | git commit -m 'Style' 81 | 82 | - uses: r-lib/actions/pr-push@v2 83 | with: 84 | repo-token: ${{ secrets.ZS_PAT }} 85 | -------------------------------------------------------------------------------- /.github/workflows/r-cmd-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | 4 | name: R CMD check 5 | 6 | on: 7 | pull_request: 8 | branches: [main] 9 | push: 10 | branches: 11 | - main 12 | 13 | concurrency: 14 | group: ${{ github.workflow }}-${{ github.ref }} 15 | cancel-in-progress: true 16 | 17 | jobs: 18 | R-CMD-check: 19 | runs-on: ${{ matrix.config.os }} 20 | 21 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 22 | 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | config: 27 | - {os: ubuntu-latest, r: 'release'} 28 | # Dropping these for now because we deploy only on Ubuntu. 29 | # Ucomment as needed if supporting additional environments. 30 | #- {os: macos-latest, r: 'release'} 31 | #- {os: windows-latest, r: 'release'} 32 | 33 | env: 34 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 35 | R_KEEP_PKG_SOURCE: yes 36 | 37 | steps: 38 | - uses: actions/checkout@v4 39 | 40 | - uses: r-lib/actions/setup-pandoc@v2 41 | 42 | - uses: r-lib/actions/setup-r@v2 43 | with: 44 | r-version: "4.4.3" 45 | http-user-agent: ${{ matrix.config.http-user-agent }} 46 | use-public-rspm: true 47 | 48 | - uses: r-lib/actions/setup-r-dependencies@v2 49 | with: 50 | extra-packages: any::rcmdcheck, any::roxygen2 51 | needs: check 52 | 53 | - name: Install cmdstan 54 | uses: epinowcast/actions/install-cmdstan@v1 55 | with: 56 | cmdstan-version: '2.36.0' 57 | num-cores: 2 58 | 59 | - name: Check that roxygen documentation is up to date 60 | run: | 61 | Rscript -e "roxygen2::roxygenize()" 62 | git diff --exit-code man || (echo "::error::Documentation is not up to date. Run 'roxygen2::roxygenize()' locally to re-render." && exit 1) 63 | 64 | - uses: r-lib/actions/check-r-package@v2 65 | with: 66 | upload-snapshots: true 67 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 68 | -------------------------------------------------------------------------------- /.github/workflows/start-app-job.yaml: -------------------------------------------------------------------------------- 1 | name: Start Container App Job 2 | 3 | # This GitHub Actions workflow executes a Container App Job for the cfa-epinow2-pipeline. 4 | # It requires a config file be provided as input. 5 | # Steps are to pull a template of the configured job from Azure, update the template with 6 | # the config file provided by the user, and start the job. 7 | # Operations are done through Azure CLI. 8 | 9 | on: 10 | workflow_dispatch: 11 | inputs: 12 | config_file: 13 | description: "Config File" 14 | required: true 15 | default: 16 | 17 | env: 18 | RESOURCE_GROUP: ext-edav-cfa-prd 19 | JOB_NAME: cfa-epinow2-test-caj 20 | 21 | jobs: 22 | start-caj: 23 | environment: production 24 | permissions: 25 | id-token: 'write' 26 | packages: 'read' 27 | contents: 'write' 28 | runs-on: ubuntu-latest 29 | name: start caj 30 | steps: 31 | - name: Azure login with OIDC 32 | id: azure_login_2 33 | uses: azure/login@v2 34 | with: 35 | # managed by EDAV. Contact Amit Mantri or Jon Kislin if you have issues. 
36 | client-id: ${{ secrets.AZURE_NNHT_SP_CLIENT_ID }} 37 | tenant-id: ${{ secrets.TENANT_ID }} 38 | subscription-id: ${{ secrets.SUBSCRIPTION_ID }} 39 | 40 | - name: Get container app job template 41 | run: | 42 | az containerapp job show \ 43 | --resource-group "${{ env.RESOURCE_GROUP }}" \ 44 | --name "${{ env.JOB_NAME }}" \ 45 | --query "properties.template" \ 46 | --output yaml > job-template.yaml 47 | 48 | - name: Update template with input value 49 | run: | 50 | sed -i 's|<>|${{ github.event.inputs.config_file }}|' job-template.yaml 51 | 52 | - name: Run container app job 53 | run: | 54 | az containerapp job start \ 55 | --resource-group "${{ env.RESOURCE_GROUP }}" \ 56 | --name "${{ env.JOB_NAME }}" \ 57 | --yaml job-template.yaml 58 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | name: Code coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | r-version: "4.4.3" 24 | 25 | - uses: r-lib/actions/setup-r-dependencies@v2 26 | with: 27 | extra-packages: any::covr 28 | needs: coverage 29 | 30 | - name: Install cmdstan 31 | uses: epinowcast/actions/install-cmdstan@v1 32 | with: 33 | cmdstan-version: '2.36.0' 34 | num-cores: 2 35 | 36 | - name: Test coverage 37 | shell: Rscript {0} 38 | run: | 39 | covr::codecov( 40 | quiet = FALSE, 41 | clean = FALSE, 42 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package"), 43 | token = "${{ secrets.CODECOV_TOKEN }}" 44 | ) 45 | 46 | - name: Show testthat output 47 | if: always() 48 | run: | 49 | ## -------------------------------------------------------------------- 50 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true 51 | shell: bash 52 | 53 | - name: Upload test results 54 | if: failure() 55 | uses: actions/upload-artifact@v4 56 | with: 57 | name: coverage-test-failures 58 | path: ${{ runner.temp }}/package 59 | -------------------------------------------------------------------------------- /.lintr: -------------------------------------------------------------------------------- 1 | linters: linters_with_defaults(object_name_linter = NULL, object_usage_linter = NULL) 2 | encoding: "UTF-8" 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # All available hooks: https://pre-commit.com/hooks.html 2 | # R specific hooks: https://github.com/lorenzwalthert/precommit 3 | repos: 4 | # R 5 | - repo: https://github.com/lorenzwalthert/precommit 6 | rev: v0.4.3.9003 7 | hooks: 8 | - id: use-tidy-description 9 | - id: lintr 10 | - id: readme-rmd-rendered 11 | - id: parsable-R 12 | - id: no-browser-statement 13 | - id: no-print-statement 14 | exclude: '^tests/testthat/test-print\.R$' 15 | - id: no-debug-statement 16 | - id: deps-in-desc 17 | - repo: https://github.com/pre-commit/pre-commit-hooks 18 | rev: v5.0.0 19 | hooks: 
20 | - id: check-added-large-files 21 | args: ['--maxkb=200'] 22 | - id: file-contents-sorter 23 | files: '^\.Rbuildignore$' 24 | - id: end-of-file-fixer 25 | exclude: '(\.Rd)|(tests/testthat/_snaps/)' 26 | - id: check-yaml 27 | - id: check-toml 28 | - id: mixed-line-ending 29 | args: ['--fix=lf'] 30 | - id: trailing-whitespace 31 | exclude: '(tests/testthat/_snaps/)|(\.Rd)' 32 | - repo: https://github.com/pre-commit-ci/pre-commit-ci-config 33 | rev: v1.6.1 34 | hooks: 35 | # Only required when https://pre-commit.ci is used for config validation 36 | - id: check-pre-commit-ci-config 37 | - repo: local 38 | hooks: 39 | - id: forbid-to-commit 40 | name: Don't commit common R artifacts 41 | entry: Cannot commit .Rhistory, .RData, .Rds or .rds. 42 | language: fail 43 | files: '\.(Rhistory|RData|Rds|rds)$' 44 | exclude: '^tests/testthat/data/.*\.rds$' 45 | # `exclude: ` to allow committing specific files 46 | # Secrets 47 | - repo: https://github.com/Yelp/detect-secrets 48 | rev: v1.5.0 49 | hooks: 50 | - id: detect-secrets 51 | args: ['--baseline', '.secrets.baseline'] 52 | exclude: package.lock.json 53 | ci: 54 | autofix_commit_msg: | 55 | [pre-commit.ci] auto fixes from pre-commit.com hooks 56 | 57 | for more information, see https://pre-commit.ci 58 | autofix_prs: true 59 | autoupdate_branch: '' 60 | autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate' 61 | autoupdate_schedule: weekly 62 | submodules: false 63 | -------------------------------------------------------------------------------- /.secrets.baseline: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.5.0", 3 | "plugins_used": [ 4 | { 5 | "name": "ArtifactoryDetector" 6 | }, 7 | { 8 | "name": "AWSKeyDetector" 9 | }, 10 | { 11 | "name": "AzureStorageKeyDetector" 12 | }, 13 | { 14 | "name": "Base64HighEntropyString", 15 | "limit": 4.5 16 | }, 17 | { 18 | "name": "BasicAuthDetector" 19 | }, 20 | { 21 | "name": "CloudantDetector" 22 | }, 23 | { 24 | "name": "DiscordBotTokenDetector" 25 | }, 26 | { 27 | "name": "GitHubTokenDetector" 28 | }, 29 | { 30 | "name": "GitLabTokenDetector" 31 | }, 32 | { 33 | "name": "HexHighEntropyString", 34 | "limit": 3.0 35 | }, 36 | { 37 | "name": "IbmCloudIamDetector" 38 | }, 39 | { 40 | "name": "IbmCosHmacDetector" 41 | }, 42 | { 43 | "name": "IPPublicDetector" 44 | }, 45 | { 46 | "name": "JwtTokenDetector" 47 | }, 48 | { 49 | "name": "KeywordDetector", 50 | "keyword_exclude": "" 51 | }, 52 | { 53 | "name": "MailchimpDetector" 54 | }, 55 | { 56 | "name": "NpmDetector" 57 | }, 58 | { 59 | "name": "OpenAIDetector" 60 | }, 61 | { 62 | "name": "PrivateKeyDetector" 63 | }, 64 | { 65 | "name": "PypiTokenDetector" 66 | }, 67 | { 68 | "name": "SendGridDetector" 69 | }, 70 | { 71 | "name": "SlackDetector" 72 | }, 73 | { 74 | "name": "SoftlayerDetector" 75 | }, 76 | { 77 | "name": "SquareOAuthDetector" 78 | }, 79 | { 80 | "name": "StripeDetector" 81 | }, 82 | { 83 | "name": "TelegramBotTokenDetector" 84 | }, 85 | { 86 | "name": "TwilioKeyDetector" 87 | } 88 | ], 89 | "filters_used": [ 90 | { 91 | "path": "detect_secrets.filters.allowlist.is_line_allowlisted" 92 | }, 93 | { 94 | "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", 95 | "min_level": 2 96 | }, 97 | { 98 | "path": "detect_secrets.filters.heuristic.is_indirect_reference" 99 | }, 100 | { 101 | "path": "detect_secrets.filters.heuristic.is_likely_id_string" 102 | }, 103 | { 104 | "path": "detect_secrets.filters.heuristic.is_lock_file" 105 | }, 106 | { 107 | 
"path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string" 108 | }, 109 | { 110 | "path": "detect_secrets.filters.heuristic.is_potential_uuid" 111 | }, 112 | { 113 | "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" 114 | }, 115 | { 116 | "path": "detect_secrets.filters.heuristic.is_sequential_string" 117 | }, 118 | { 119 | "path": "detect_secrets.filters.heuristic.is_swagger_file" 120 | }, 121 | { 122 | "path": "detect_secrets.filters.heuristic.is_templated_secret" 123 | } 124 | ], 125 | "results": {}, 126 | "generated_at": "2024-09-20T17:50:20Z" 127 | } 128 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome! 2 | Thank you for contributing to CDC's Open Source projects! If you have any 3 | questions or doubts, don't be afraid to send them our way. We appreciate all 4 | contributions, and we are looking forward to fostering an open, transparent, and 5 | collaborative environment. 6 | 7 | ## Package authorship 8 | These guidelines build on the recommendations in [R packages](https://r-pkgs.org/). 9 | 10 | * Maintainer: The CFA team member who is actively in charge of maintaining the 11 | package. 12 | * Authors: Any CFA team member who has taken a lead role in code development, 13 | review, testing, etc. 14 | * Contributors: Team members who have made smaller but substantial contributions 15 | to the codebase (40+ hours, including coding, code review, testing, etc.) 16 | * Smaller contributions: Smaller contributiosn of code to this repo or 17 | to its predecessor, cdcent/cfa-nnh-pipelines, including small bug fixes, 18 | issuees, or code review will not be granted package authorship in the 19 | description file, but may be acknowledged in NEWS.md or release notes if 20 | appropriate 21 | 22 | Before contributing, we encourage you to also read our [LICENSE](LICENSE), 23 | [README](README.md), and 24 | [code-of-conduct](code-of-conduct.md) 25 | files, also found in this repository. If you have any inquiries or questions not 26 | answered by the content of this repository, feel free to [contact us](mailto:surveillanceplatform@cdc.gov). 27 | 28 | ## Public Domain 29 | This project is in the public domain within the United States, and copyright and 30 | related rights in the work worldwide are waived through the [CC0 1.0 Universal public domain dedication](https://creativecommons.org/publicdomain/zero/1.0/). 31 | All contributions to this project will be released under the CC0 dedication. By 32 | submitting a pull request you are agreeing to comply with this waiver of 33 | copyright interest. 34 | 35 | ## Requesting Changes 36 | Our pull request/merging process is designed to give the CDC Surveillance Team 37 | and other in our space an opportunity to consider and discuss any suggested 38 | changes. This policy affects all CDC spaces, both on-line and off, and all users 39 | are expected to abide by it. 40 | 41 | ### Open an issue in the repository 42 | If you don't have specific language to submit but would like to suggest a change 43 | or have something addressed, you can open an issue in this repository. Team 44 | members will respond to the issue as soon as possible. 45 | 46 | ### Submit a pull request 47 | If you would like to contribute, please submit a pull request. In order for us 48 | to merge a pull request, it must: 49 | * Be at least seven days old. 
Pull requests may be held longer if necessary 50 | to give people the opportunity to assess it. 51 | * Receive a +1 from a majority of team members associated with the request. 52 | If there is significant dissent between the team, a meeting will be held to 53 | discuss a plan of action for the pull request. 54 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: CFAEpiNow2Pipeline 2 | Title: EpiNow2 wrapper for deployment to Azure Batch 3 | Version: 0.2.1.9000 4 | Authors@R: c( 5 | person("Zachary", "Susswein", , "utb2@cdc.gov", role = "aut", 6 | comment = c(ORCID = "0000-0002-4329-4833")), 7 | person("Katelyn", "Gostic", , "uep6@cdc.gov", role = "aut", 8 | comment = c(ORCID = "0000-0002-9369-6371")), 9 | person("Nathan", "McIntosh", , "ute2@cdc.gov", role = "aut"), 10 | person("Patrick", "Corbett", , "pyv3@cdc.gov", role = "aut"), 11 | person("Adam", "Howes", , "xwg3@cdc.gov", role = "aut", 12 | comment = c(ORCID = "0000-0003-2386-4031")), 13 | person("Micah", "Wiesner", , "zqm6@cdc.gov", role = c("aut", "cre")) 14 | ) 15 | Description: Add logging, metadata handling, and data handling 16 | functions to use EpiNow2 in a pipeline. This pipeline is optimized 17 | for the Center for Forecasting and Outbreak Analytics' use-case, 18 | fitting hundreds of models in parallel. 19 | License: Apache License (>= 2) 20 | Encoding: UTF-8 21 | Remotes: 22 | github::epiforecasts/EpiNow2@bcf297cf36a93cc56123bc3c9e8cebfb1421a962, 23 | github::stan-dev/cmdstanr 24 | Roxygen: list(markdown = TRUE) 25 | RoxygenNote: 7.3.2 26 | Suggests: 27 | primarycensored, 28 | testthat (>= 3.0.0), 29 | usethis, 30 | withr 31 | Config/testthat/edition: 3 32 | Imports: 33 | AzureRMR, 34 | AzureStor, 35 | cmdstanr, 36 | cli, 37 | data.table, 38 | DBI, 39 | dplyr, 40 | duckdb, 41 | EpiNow2 (>= 1.4.0), 42 | jsonlite, 43 | rlang, 44 | rstan, 45 | S7, 46 | lubridate, 47 | readxl, 48 | tidyr, 49 | tidybayes, 50 | optparse, 51 | Microsoft365R 52 | Additional_repositories: 53 | https://stan-dev.r-universe.dev 54 | URL: https://cdcgov.github.io/cfa-epinow2-pipeline/ 55 | Depends: 56 | R (>= 3.50) 57 | LazyData: true 58 | -------------------------------------------------------------------------------- /DISCLAIMER.md: -------------------------------------------------------------------------------- 1 | # DISCLAIMER 2 | Use of this service is limited only to **non-sensitive and publicly available 3 | data**. Users must not use, share, or store any kind of sensitive data like 4 | health status, provision or payment of healthcare, Personally Identifiable 5 | Information (PII) and/or Protected Health Information (PHI), etc. under **ANY** 6 | circumstance. 7 | 8 | Administrators for this service reserve the right to moderate all information 9 | used, shared, or stored with this service at any time. Any user that cannot 10 | abide by this disclaimer and Code of Conduct may be subject to action, up to 11 | and including revoking access to services. 12 | 13 | The material embodied in this software is provided to you "as-is" and without 14 | warranty of any kind, express, implied or otherwise, including without 15 | limitation, any warranty of fitness for a particular purpose. In no event shall 16 | the Centers for Disease Control and Prevention (CDC) or the United States (U.S.) 
17 | government be liable to you or anyone else for any direct, special, incidental, 18 | indirect or consequential damages of any kind, or any damages whatsoever, 19 | including without limitation, loss of profit, loss of use, savings or revenue, 20 | or the claims of third parties, whether or not CDC or the U.S. government has 21 | been advised of the possibility of such loss, however caused and on any theory 22 | of liability, arising out of or in connection with the possession, use or 23 | performance of this software. 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/rocker/geospatial:4.4.1 2 | 3 | # Will copy the package to the container preserving the directory structure 4 | RUN mkdir -p pkg 5 | 6 | COPY ./DESCRIPTION pkg/ 7 | 8 | # Installing missing dependencies 9 | RUN apt-get update && apt-get install -y --no-install-recommends pandoc-citeproc 10 | RUN install2.r pak 11 | # dependencies = TRUE means we install `suggests` too 12 | RUN Rscript -e 'pak::local_install_deps("pkg", upgrade = FALSE, dependencies = TRUE)' 13 | # The cmdstan version will need to be incrementally updated 14 | # Must also manually bump cmdstan version `.github/workflows` when updating 15 | RUN Rscript -e 'cmdstanr::install_cmdstan(version="2.36.0")' 16 | # This requires access to the Azure Container Registry 17 | # FROM ghcr.io/cdcgov/cfa-epinow2-pipeline:${TAG} 18 | 19 | # Will copy the package to the container preserving the directory structure 20 | COPY . pkg/ 21 | 22 | # Install the full package while leaving the tar.gz file in the 23 | # container for later use. 24 | RUN R CMD build --no-build-vignettes --no-manual pkg && \ 25 | R CMD INSTALL CFAEpiNow2Pipeline_*.tar.gz 26 | 27 | # Ensure the package is working properly 28 | RUN R CMD check --no-build-vignettes --no-manual CFAEpiNow2Pipeline_*.tar.gz 29 | 30 | CMD ["bash"] 31 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | REGISTRY=cfaprdbatchcr.azurecr.io/ 2 | IMAGE_NAME=cfa-epinow2-pipeline 3 | BRANCH=$(shell git branch --show-current) 4 | CONFIG_CONTAINER=rt-epinow2-config 5 | CNTR_MGR=docker 6 | ifeq ($(BRANCH), main) 7 | TAG=latest 8 | else 9 | TAG=$(BRANCH) 10 | endif 11 | 12 | CONFIG=test.json 13 | POOL="cfa-epinow2-$(TAG)" 14 | TIMESTAMP:=$(shell date -u +"%Y%m%d_%H%M%S") 15 | JOB:=Rt-estimation-$(TIMESTAMP) 16 | 17 | # The report date to use, in ISO format (YYYY-MM-DD). Default is today 18 | REPORT_DATE?=$(shell date -u +%F) 19 | 20 | .DEFAULT_GOAL := help 21 | 22 | help: 23 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 24 | 25 | pull: ## Login to Azure Container Registry and pull the latest container image 26 | az acr login --name 'cfaprdbatchcr' 27 | $(CNTR_MGR) pull $(REGISTRY)$(IMAGE_NAME):$(TAG) 28 | 29 | build: ## Build the Docker image with given tag 30 | $(CNTR_MGR) build -t $(REGISTRY)$(IMAGE_NAME):$(TAG) \ 31 | --build-arg TAG=$(TAG) -f Dockerfile . 
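# Example (illustrative): a typical local workflow on a feature branch is
#   make build && make push
# and then `make run CONFIG=<config>.json` (with a populated .env file) to exercise the
# pipeline inside the freshly built container.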
32 | 33 | tag: ## Tags the local image for pushing to the container registry 34 | $(CNTR_MGR) tag $(IMAGE_NAME):$(TAG) $(REGISTRY)$(IMAGE_NAME):$(TAG) 35 | 36 | config: ## Generates a configuration file for running the model 37 | uv run azure/generate_configs.py \ 38 | --disease="COVID-19,Influenza" \ 39 | --state=all \ 40 | --output-container=nssp-rt-v2 \ 41 | --job-id=$(JOB) \ 42 | --report-date-str=$(REPORT_DATE) 43 | 44 | rerun-config: ## Generate a configuration file to rerun a previous model 45 | uv run azure/generate_rerun_configs.py \ 46 | --output-container=nssp-rt-v2 \ 47 | --job-id=$(JOB) \ 48 | --report-date-str=$(REPORT_DATE) 49 | 50 | run-batch: ## Runs job.py on Azure Batch 51 | uv run --env-file .env \ 52 | azure/job.py \ 53 | --image_name="$(REGISTRY)$(IMAGE_NAME):$(TAG)" \ 54 | --config_container="$(CONFIG_CONTAINER)" \ 55 | --pool_id="$(POOL)" \ 56 | --job_id="$(JOB)" 57 | 58 | run-prod: config run-batch ## Calls config and run-batch 59 | 60 | rerun-prod: rerun-config run-batch ## Calls rerun-config and run-batch 61 | 62 | run: ## Run pipeline from R interactively in the container 63 | $(CNTR_MGR) run --mount type=bind,source=$(PWD),target=/mnt -it \ 64 | --env-file .env \ 65 | --rm $(REGISTRY)$(IMAGE_NAME):$(TAG) \ 66 | Rscript -e "CFAEpiNow2Pipeline::orchestrate_pipeline('$(CONFIG)', config_container = 'rt-epinow2-config', input_dir = '/mnt/input', output_dir = '/mnt')" 67 | 68 | up: ## Start an interactive bash shell in the container with project directory mounted 69 | $(CNTR_MGR) run --mount type=bind,source=$(PWD),target=/cfa-epinow2-pipeline -it \ 70 | --env-file .env \ 71 | --rm $(REGISTRY)$(IMAGE_NAME):$(TAG) /bin/bash 72 | 73 | push: ## Push the tagged image to the container registry 74 | $(CNTR_MGR) push $(REGISTRY)$(IMAGE_NAME):$(TAG) 75 | 76 | test-batch: ## Run GitHub Actions workflow and then job.py for testing on Azure Batch 77 | uv run azure/generate_configs.py \ 78 | --disease="COVID-19,Influenza" \ 79 | --state=NY \ 80 | --output-container=nssp-rt-testing \ 81 | --job-id=$(JOB) \ 82 | --report-date-str=$(REPORT_DATE) 83 | uv run --env-file .env \ 84 | azure/job.py \ 85 | --image_name="$(REGISTRY)$(IMAGE_NAME):$(TAG)" \ 86 | --config_container="$(CONFIG_CONTAINER)" \ 87 | --pool_id="$(POOL)" \ 88 | --job_id="$(JOB)" 89 | 90 | test: ## Run unit tests for the CFAEpiNow2Pipeline R package 91 | Rscript -e "testthat::test_local()" 92 | 93 | document: ## Generate roxygen2 documentation for the CFAEpiNow2Pipeline R package 94 | Rscript -e "roxygen2::roxygenize()" 95 | 96 | check: ## Perform R CMD check for the CFAEpiNow2Pipeline R package 97 | Rscript -e "rcmdcheck::rcmdcheck()" 98 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(Config) 4 | export(Data) 5 | export(DelayInterval) 6 | export(Exclusions) 7 | export(GenerationInterval) 8 | export(Parameters) 9 | export(RightTruncation) 10 | export(apply_exclusions) 11 | export(download_file_from_container) 12 | export(download_if_specified) 13 | export(execute_model_logic) 14 | export(extract_diagnostics) 15 | export(fetch_blob_container) 16 | export(fetch_credential_from_env_var) 17 | export(fit_model) 18 | export(format_delay_interval) 19 | export(format_generation_interval) 20 | export(format_right_truncation) 21 | export(format_stan_opts) 22 | export(low_case_count_diagnostic) 23 | export(orchestrate_pipeline) 24 | 
export(process_quantiles) 25 | export(process_samples) 26 | export(read_data) 27 | export(read_disease_parameters) 28 | export(read_exclusions) 29 | export(read_interval_pmf) 30 | export(read_json_into_config) 31 | export(write_model_outputs) 32 | export(write_output_dir_structure) 33 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # CFAEpiNow2Pipeline v0.2.0 2 | 3 | ## Features 4 | * Adding dependencies to install cmdstanr backend and using GH action 5 | * Convert drop cols value to character for point/state exclusions 6 | * Run `make test-batch` target locally 7 | * Update runner action version 8 | * Remove duplicate batch autoscale text file 9 | * Improve consistency in docs 10 | * Update version of deploy action 11 | * Update github checkout action from V2 to V4 12 | * Setting up dependabot yaml file 13 | * Remove out-of-date demo folder 14 | * Add automated check that docs are up to date 15 | * Rewrite README for simplification and clarity 16 | * Switch to the `air` code formatter 17 | * Replace remaining self-hosted runner workflows with ubuntu-latest 18 | * Fix mismatch between R code and documentation 19 | * Change code owner and include authors in R package 20 | * Change code owner 21 | * Add documentation to the Makefile 22 | * Fix mismatch between R code and documentation 23 | * Fix production diseases 24 | * Add RSV specifications 25 | * Create the config files locally to speed things up 26 | * Lock dependencies for creating the pool 27 | * Saving state exclusions to nssp-rt/state_exclusions 28 | * Automate tag deletion from ghcr.io 29 | * Editing of `SOP.md` 30 | * Pin r-version at 4.4.3 for CI/CD 31 | * Fix minor typos in `SOP.md`. 32 | * Swap from `Dockerfile-batch` to using an inline-metadata script, managed by `uv`. 33 | * Adding dynamic logic to re-query for configs in blob 34 | * Automate creation of outlier csv for nssp-elt-2/outliers 35 | * Fix 'latest' tag for CI 36 | * Updated path for read/write of data outliers 37 | * Updating makefile to represent unified Dockerfile approach (not two-step build) 38 | * Make sure we change "COVID-19/Omicron" to "COVID-19" when reading NSSP data. 39 | * Unified Dockerfile 40 | * Add instructions for data outliers reruns to the SOP. 41 | * Add ability to call `make rerun-prod` to rerun just the tasks that needed a data change. 42 | * Add output container as a new field in the config file. 43 | * Building with ubuntu-latest and using Container App runner for all else, remove azure-cli action 44 | * Adding exclusions documentation and Makefile support 45 | * Add the blob storage container, if provided 46 | * Adding make command to test Azure batch 47 | * Updating subnet ID and pool VM to 22.04 from 20.04 48 | * Write model diagnostics to an output file, correcting an oversight 49 | * Refactored GH Actions container build to cfa-actions 2-step build 50 | * Creating SOP.md to document weekly run procedures, including diagram 51 | * Allows unique job_ids for runs.
52 | * Makefile supports either docker or podman as arguments to setup & manage containers 53 | * Streamlined configurable container execution provided by included start.sh script 54 | * Container App Job execution tools added including job-template.yaml file for single task and Python script for bulk tasks 55 | * GitHub Actions workflow added to start Azure Container App Job 56 | * Minor changes in removing unused container tags from Azure CR 57 | * Reactivated DEBUG level logs from EpiNow2 so that sampler progress is visible 58 | * Added new test data and unit tests for point exclusions 59 | 60 | # CFAEpiNow2Pipeline v0.1.0 61 | 62 | This initial release establishes minimal feature parity with the internal EpiNow2 Rt modeling pipeline. It adds wrappers to integrate with internal data schemas and ingest pre-estimated model parameters (i.e., generation intervals, right-truncation). It defines an output schema and adds comprehensive logging. The repository also has functionality to set up and deploy to Azure Batch. 63 | 64 | ## Features 65 | 66 | * GitHub Actions to build Docker images on PR and merge to main, deploy Azure Batch environments off the built images, and tear down the environment (including images) on PR close. 67 | * Comprehensive documentation of pipeline code and validation of input data, parameters, and model run configs 68 | * Set up comprehensive logging of model runs and handle pipeline failures to preserve logs where possible 69 | * Automatically download and upload inputs and outputs from Azure Blob Storage 70 | * A new script for building the pool. Runnable from CLI or GHA. Requires `uv` be installed, and then `uv` handles the python and dependency management based on the inline script metadata. 71 | -------------------------------------------------------------------------------- /R/azure.R: -------------------------------------------------------------------------------- 1 | #' Download if specified 2 | #' 3 | #' @param blob_path The name of the blob to download 4 | #' @param blob_storage_container The name of the container to download from 5 | #' @param dir The directory to which to write the downloaded file 6 | #' @return The path of the file 7 | #' @family azure 8 | #' @export 9 | download_if_specified <- function( 10 | blob_path, 11 | blob_storage_container, 12 | dir 13 | ) { 14 | # Guard against null input erroring out file.exists() 15 | if (rlang::is_null(blob_path)) { 16 | local_path <- NULL 17 | } else { 18 | file_exists <- file.exists(file.path(dir, blob_path)) 19 | if (!rlang::is_null(blob_storage_container) && !file_exists) { 20 | container <- fetch_blob_container(blob_storage_container) 21 | local_path <- download_file_from_container( 22 | blob_storage_path = blob_path, 23 | local_file_path = file.path(dir, blob_path), 24 | storage_container = container 25 | ) 26 | } else { 27 | local_path <- file.path(dir, blob_path) 28 | } 29 | } 30 | local_path 31 | } 32 | 33 | #' Download specified blobs from Blob Storage and save them in a local dir 34 | #' 35 | #' @param blob_storage_path A character of a blob in `storage_container` 36 | #' @param local_file_path The local path to save the blob 37 | #' @param storage_container The blob storage container with `blob_storage_path` 38 | # 39 | #' @return Invisibly, `local_file_path` 40 | #' @family azure 41 | #' @export 42 | download_file_from_container <- function( 43 | blob_storage_path, 44 | local_file_path, 45 | storage_container 46 | ) { 47 | cli::cli_alert_info( 48 | "Downloading blob {.path {blob_storage_path}} to 
{.path {local_file_path}}" 49 |   ) 50 | 51 |   rlang::try_fetch( 52 |     { 53 |       dirs <- dirname(local_file_path) 54 | 55 |       if (!dir.exists(dirs)) { 56 |         cli::cli_alert("Creating directory {.path {dirs}}") 57 |         dir.create(dirs, recursive = TRUE) 58 |       } 59 | 60 |       AzureStor::download_blob( 61 |         container = storage_container, 62 |         src = blob_storage_path, 63 |         dest = local_file_path, 64 |         overwrite = TRUE 65 |       ) 66 |     }, 67 |     error = function(cnd) { 68 |       cli::cli_abort(c( 69 |         "Failed to download {.path {blob_storage_path}}", 70 |         ">" = "Does the blob exist in the container?" 71 |       )) 72 |     } 73 |   ) 74 | 75 |   cli::cli_alert_success( 76 |     "Blob {.path {blob_storage_path}} downloaded successfully" 77 |   ) 78 | 79 |   invisible(local_file_path) 80 | } 81 | 82 | #' Load Azure Blob container using credentials in environment variables 83 | #' 84 | #' This function depends on the following Azure credentials stored in 85 | #' environment variables: 86 | #' 87 | #' * `az_tenant_id`: an Azure Active Directory (AAD) tenant ID 88 | #' * `az_client_id`: the application (client) ID of the service principal 89 | #' * `az_service_principal`: the service principal's client secret (also 90 | #' sometimes called the `client_secret`) 91 | #' 92 | #' As a result it is an impure function, and should be used bearing that 93 | #' warning in mind. Each variable is obtained using 94 | #' [fetch_credential_from_env_var()] (which will return an error if the 95 | #' credential is not specified or empty). 96 | #' 97 | #' @param container_name The Azure Blob Storage container associated with the 98 | #' credentials 99 | #' @return A Blob endpoint 100 | #' @family azure 101 | #' @export 102 | fetch_blob_container <- function(container_name) { 103 |   cli::cli_alert_info( 104 |     "Attempting to connect to container {.var {container_name}}" 105 |   ) 106 |   cli::cli_alert_info("Loading Azure credentials from env vars") 107 |   # nolint start: object_name_linter 108 |   az_tenant_id <- fetch_credential_from_env_var("az_tenant_id") 109 |   az_client_id <- fetch_credential_from_env_var("az_client_id") 110 |   az_service_principal <- fetch_credential_from_env_var("az_service_principal") 111 |   # nolint end: object_name_linter 112 |   cli::cli_alert_success("Credentials loaded successfully") 113 | 114 |   cli::cli_alert_info("Authenticating with loaded credentials") 115 |   rlang::try_fetch( 116 |     { 117 |       # First, get a general-purpose token using SP flow 118 |       # Analogous to: 119 |       # az login --service-principal \ 120 |       # --username $az_client_id \ 121 |       # --password $az_service_principal \ 122 |       # --tenant $az_tenant_id 123 |       # NOTE: the SP is also sometimes called the `client_secret` 124 |       token <- AzureRMR::get_azure_token( 125 |         resource = "https://storage.azure.com", 126 |         tenant = az_tenant_id, 127 |         app = az_client_id, 128 |         password = az_service_principal 129 |       ) 130 |       # Then fetch a storage endpoint using the token. Follows flow from 131 |       # https://github.com/Azure/AzureStor. 132 |       # Note that we're using the ABS endpoint (the first example line) 133 |       # but following the AAD token flow from the AAD alternative at the 134 |       # end of the box. If we didn't replace the endpoint and used the 135 |       # example flow then it allows authentication to blob but throws 136 |       # a 409 when trying to download.
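      # Illustrative sketch of how the objects built below are consumed
      # downstream by this package's own helpers (container name and blob
      # paths are examples only):
      #   cont <- fetch_blob_container("rt-epinow2-config")
      #   download_file_from_container(
      #     blob_storage_path = "some/blob.json",
      #     local_file_path = "input/some/blob.json",
      #     storage_container = cont
      #   )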
137 |       endpoint <- AzureStor::storage_endpoint( 138 |         "https://cfaazurebatchprd.blob.core.windows.net", 139 |         token = token 140 |       ) 141 | 142 |       # Finally, set up instantiation of storage container generic 143 |       container <- AzureStor::storage_container(endpoint, container_name) 144 |     }, 145 |     error = function(cnd) { 146 |       cli::cli_abort( 147 |         "Failure authenticating connection to {.var {container_name}}", 148 |         parent = cnd 149 |       ) 150 |     } 151 |   ) 152 | 153 |   cli::cli_alert_success("Authenticated connection to {.var {container_name}}") 154 | 155 |   return(container) 156 | } 157 | 158 | #' Fetch Azure credential from environment variable 159 | #' 160 | #' And throw an informative error if credential is not found 161 | #' 162 | #' @param env_var A character, the credential to fetch 163 | #' 164 | #' @return The associated value 165 | #' @family azure 166 | #' @export 167 | fetch_credential_from_env_var <- function(env_var) { 168 |   credential <- Sys.getenv(env_var) 169 | 170 |   if (credential == "") { 171 |     cli::cli_abort( 172 |       c( 173 |         "Error loading Azure credentials from environment variables", 174 |         "!" = "Environment variable {.envvar {env_var}} not specified or empty" 175 |       ), 176 |       class = "CFA_Rt" 177 |     ) 178 |   } 179 | 180 |   return(credential) 181 | } 182 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Synthetic dataset of stochastic SIR system with known Rt 2 | #' 3 | #' A dataset from Gostic, Katelyn M., et al. "Practical considerations for 4 | #' measuring the effective reproductive number, Rt." PLoS Computational Biology 5 | #' 16.12 (2020): e1008409. The data are simulated from a stochastic SEIR 6 | #' compartmental model. 7 | #' 8 | #' This synthetic dataset has a number of desirable properties: 9 | #' 10 | #' 1. The force of infection changes depending on the Rt, allowing for sudden 11 | #' changes in the Rt. This allows for modeling of sudden changes in infection 12 | #' dynamics, which might otherwise be difficult to capture in an Rt estimation 13 | #' framework. 14 | #' 15 | #' 2. The realized Rt is known at each timepoint 16 | #' 17 | #' 3. The dataset incorporates a simple generation interval and a reporting 18 | #' delay. 19 | #' 20 | #' Gostic et al. benchmark the performance of a number of Rt estimation 21 | #' frameworks, providing practical guidance on how to use this dataset to 22 | #' evaluate Rt estimates. 23 | #' 24 | #' In practice, we've found that the amount of observation noise in the 25 | #' incidence and/or observed cases is often undesirably low for testing. Many 26 | #' empirical datasets are much noisier. As a result, models built with these 27 | #' settings in mind can perform poorly on this dataset or fail to converge. To 28 | #' the original dataset, we add a new column with the original incidence counts 29 | #' with additional observation noise: `obs_incidence`. We manually add 30 | #' observation noise with `rnbinom(299, mu = gostic_toy_rt[["obs_cases"]], size 31 | #' = 10)` and the random seed 123456 and store it in the `obs_incidence` column. 32 | #' 33 | #' @name gostic_toy_rt 34 | #' @format `gostic_toy_rt` A data frame with 301 rows and 12 columns: 35 | #' \describe{ 36 | #' \item{time}{Timestep of the discrete-time stochastic SEIR simulation} 37 | #' \item{date}{Added from the original Gostic, 2020 dataset. A date 38 | #' corresponding to the assigned `time`.
Arbitrarily starts on January 1st, 39 | #' 2023.} 40 | #' \item{S, E, I, R}{The realized state of the stochastic SEIR system} 41 | #' \item{dS, dEI, DIR}{The stochastic transition between compartments} 42 | #' \item{incidence}{The true incidence in the `I` compartment at time t} 43 | #' \item{obs_cases}{The observed number of cases at time t from 44 | #' forward-convolved incidence.} 45 | #' \item{obs_incidence}{Added from the original Gostic, 2020 dataset. The 46 | #' `incidence` column with added negative-binomial observation noise. 47 | #' Created with `set.seed(123456)` and the call 48 | #' `rnbinom(299, mu = gostic_toy_rt[["incidence"]], size = 10)` Useful for 49 | #' testing.} 50 | #' \item{true_r0}{The initial R0 of the system (i.e., 2)} 51 | #' \item{true_rt}{The known, true Rt of the epidemic system} 52 | #' } 53 | #' @source 54 | #' # nolint 55 | #' @family data 56 | "gostic_toy_rt" 57 | 58 | #' Generation interval corresponding to the sample `gostic_toy_rt` dataset 59 | #' 60 | #' Gostic et al., 2020 simulates data from a stochastic SEIR model. Residence 61 | #' time in both the E and the I compartments is exponentially distributed, with 62 | #' a mean of 4 days (or a rate/inverse-scale of 1/4). These residence times 63 | #' imply a gamma-distributed generation time distribution with a shape of 2 and 64 | #' a rate of 1/4. We convert the continuous gamma distribution into a PMF to use 65 | #' with `{RtGam}`. 66 | #' 67 | #' From this parametric specification, we produce a double-censored, 68 | #' left-truncated probability mass function of the generation interval 69 | #' distribution. We produce the PMF using `{epinowcast}`'s 70 | #' `simulate_double_censored_pmf()` with version 0.3.0. See 71 | #' https://doi.org/10.1101/2024.01.12.24301247 for more information on 72 | #' double-censoring biases and corrections. 73 | #' 74 | #' We correct the output from `simulate_double_censored_pmf()` to make it 75 | #' appropriate to use with `{EpiNow2}`. The function returns a numeric vector, 76 | #' with the position of the element corresponding to one day more than the 77 | #' length of the delay and value corresponding to the amount of discretized 78 | #' probability density in the bin. The vector does not necessarily sum to one. 79 | #' We drop the first element of the vector, which corresponds to a zero-day 80 | #' delay. The renewal framework, which underpins our model does not account for 81 | #' zero-day delays. We renormalize the left-truncated vector to sum to one so 82 | #' that it's a proper PMF. 83 | #' 84 | #' @name sir_gt_pmf 85 | #' @format `sir_gt_pmf` A numeric vector of length 26 that sums to one within 86 | #' numerical tolerance 87 | #' @family data 88 | "sir_gt_pmf" 89 | -------------------------------------------------------------------------------- /R/exclusions.R: -------------------------------------------------------------------------------- 1 | #' Convert case counts in matching rows to NA 2 | #' 3 | #' Mark selected points to be ignored in model fitting. This manual selection 4 | #' occurs externally to the pipeline and is passed to the pipeline in an 5 | #' exclusions file read with [read_exclusions()]. Mechanically, the exclusions 6 | #' are applied by converting specified points to NAs in the dataset. NAs are 7 | #' skipped in model fitting by EpiNow2, so matched rows are excluded from the 8 | #' model likelihood. 
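#'
#' As an illustration (hypothetical values), marking a single matched row:
#'
#' ```r
#' cases <- data.frame(
#'   report_date = "2023-01-02", reference_date = "2023-01-01",
#'   disease = "COVID-19", geo_value = "CA", confirm = 10
#' )
#' exclusions <- cases[, c("reference_date", "report_date", "geo_value", "disease")]
#' apply_exclusions(cases, exclusions)$confirm
#' #> [1] NA
#' ```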
9 | #' 10 | #' @param cases A dataframe returned by [read_data()] 11 | #' @param exclusions A dataframe returned by [read_exclusions()] 12 | #' 13 | #' @return A dataframe with the same rows and schema as `cases` where the value 14 | #' in the column `confirm` converted to NA in any rows that match a row in 15 | #' `exclusions` 16 | #' @family exclusions 17 | #' @export 18 | apply_exclusions <- function(cases, exclusions) { 19 | cli::cli_alert_info("Applying exclusions to case data") 20 | 21 | con <- DBI::dbConnect(duckdb::duckdb()) 22 | on.exit(DBI::dbDisconnect(con)) 23 | 24 | duckdb::duckdb_register(con, "cases", cases) 25 | duckdb::duckdb_register(con, "exclusions", exclusions) 26 | 27 | df <- DBI::dbGetQuery( 28 | con, 29 | " 30 | SELECT 31 | cases.report_date, 32 | cases.reference_date, 33 | cases.disease, 34 | cases.geo_value, 35 | CASE 36 | WHEN exclusions.reference_date IS NOT NULL THEN NULL 37 | ELSE cases.confirm 38 | END AS confirm 39 | FROM cases 40 | LEFT JOIN exclusions 41 | ON cases.reference_date = exclusions.reference_date 42 | AND cases.report_date = exclusions.report_date 43 | AND cases.geo_value = exclusions.geo_value 44 | AND cases.disease = exclusions.disease 45 | ORDER BY cases.reference_date 46 | " 47 | ) 48 | 49 | cli::cli_alert_info("{.val {sum(is.na(df[['confirm']]))}} exclusions applied") 50 | 51 | return(df) 52 | } 53 | 54 | #' Read exclusions from an external file 55 | #' 56 | #' Expects to read a CSV with required columns: 57 | #' * `reference_date` 58 | #' * `report_date` 59 | #' * `state` 60 | #' * `disease` 61 | #' 62 | #' These columns have the same meaning as in [read_data()]. Additional columns 63 | #' are allowed and will be ignored by the reader. 64 | #' 65 | #' @param path The path to the exclusions file in `.csv` format 66 | #' 67 | #' @return A dataframe with columns `reference_date`, `report_date`, 68 | #' `geo_value`, `disease` 69 | #' @family exclusions 70 | #' @export 71 | read_exclusions <- function(path) { 72 | check_file_exists(path) 73 | 74 | con <- DBI::dbConnect(duckdb::duckdb()) 75 | on.exit(DBI::dbDisconnect(con)) 76 | df <- rlang::try_fetch( 77 | DBI::dbGetQuery( 78 | con, 79 | " 80 | SELECT 81 | reference_date, 82 | report_date, 83 | state AS geo_value, 84 | disease 85 | FROM read_csv(?) 86 | ", 87 | params = list(path) 88 | ), 89 | error = function(con) { 90 | cli::cli_abort( 91 | c( 92 | "Error fetching exclusions from {.path {path}}", 93 | "Original error: {con}" 94 | ), 95 | class = "wrapped_invalid_query" 96 | ) 97 | } 98 | ) 99 | 100 | if (nrow(df) == 0) { 101 | cli::cli_abort( 102 | "No data matching returned from {.path {path}}", 103 | class = "empty_return" 104 | ) 105 | } 106 | 107 | cli::cli_alert_success("Exclusions file read") 108 | 109 | return(df) 110 | } 111 | -------------------------------------------------------------------------------- /R/fit_model.R: -------------------------------------------------------------------------------- 1 | #' Fit an `EpiNow2` model 2 | #' 3 | #' @param data, in the format returned by [read_data()] 4 | #' @param parameters As returned from [read_disease_parameters()] 5 | #' @param seed The random seed, used for both initialization by `EpiNow2` in R 6 | #' and sampling in Stan 7 | #' @param horizon The number of days, as an integer, to forecast 8 | #' @param priors A list of lists. The first level should contain the key `rt` 9 | #' with elements `mean` and `sd` and the key `gp` with element `alpha_sd`. 10 | #' @param sampler_opts A list. 
The Stan sampler options to be passed through 11 | #' EpiNow2. It has required keys: `cores`, `chains`, `iter_warmup`, 12 | #' `iter_sampling`, `max_treedepth`, and `adapt_delta`. 13 | #' 14 | #' @return A fitted model object of class `epinow` or, if model fitting fails, 15 | #' an NA is returned with a warning 16 | #' @family pipeline 17 | #' @export 18 | fit_model <- function( 19 | data, 20 | parameters, 21 | seed, 22 | horizon, 23 | priors, 24 | sampler_opts 25 | ) { 26 | # Priors ------------------------------------------------------------------ 27 | rt <- EpiNow2::rt_opts( 28 | list( 29 | mean = priors[["rt"]][["mean"]], 30 | sd = priors[["rt"]][["sd"]] 31 | ) 32 | ) 33 | gp <- EpiNow2::gp_opts( 34 | alpha_sd = priors[["gp"]][["alpha_sd"]] 35 | ) 36 | 37 | # Distributions ----------------------------------------------------------- 38 | generation_time <- format_generation_interval( 39 | parameters[["generation_interval"]] 40 | ) 41 | delays <- format_delay_interval( 42 | parameters[["delay_interval"]] 43 | ) 44 | truncation <- format_right_truncation( 45 | parameters[["right_truncation"]], 46 | data 47 | ) 48 | stan <- format_stan_opts( 49 | sampler_opts, 50 | seed 51 | ) 52 | df <- data.frame( 53 | confirm = data[["confirm"]], 54 | date = as.Date(data[["reference_date"]]) 55 | ) 56 | rlang::try_fetch( 57 | withr::with_seed(seed, { 58 | EpiNow2::epinow( 59 | df, 60 | generation_time = generation_time, 61 | delays = delays, 62 | truncation = truncation, 63 | horizon = horizon, 64 | rt = rt, 65 | gp = gp, 66 | stan = stan, 67 | verbose = TRUE, 68 | # Dump logs to console to be caught by pipeline's logging instead of 69 | # EpiNow2's default through futile.logger 70 | logs = EpiNow2::setup_logging( 71 | threshold = "INFO", 72 | file = NULL, 73 | mirror_to_console = TRUE, 74 | name = "EpiNow2" 75 | ), 76 | filter_leading_zeros = FALSE, 77 | ) 78 | }), 79 | error = function(cnd) { 80 | cli::cli_abort( 81 | "Call to EpiNow2::epinow() failed with an error", 82 | parent = cnd, 83 | class = "failing_fit" 84 | ) 85 | } 86 | ) 87 | } 88 | 89 | #' Format Stan options for input to EpiNow2 90 | #' 91 | #' Format configuration `sampler_opts` for input to `EpiNow2` via a call to 92 | #' [EpiNow2::stan_opts()]. 
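#'
#' For illustration only (these values are not recommended defaults), a
#' `sampler_opts` list with the expected shape is:
#' `list(cores = 4, chains = 4, iter_warmup = 250, iter_sampling = 500,
#' adapt_delta = 0.99, max_treedepth = 12)`.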
93 | #' 94 | #' @inheritParams fit_model 95 | #' @param seed A stochastic seed passed here to the Stan sampler and as the R 96 | #' PRNG seed for `EpiNow2` initialization 97 | #' 98 | #' @return A `stan_opts` object of arguments 99 | #' 100 | #' @family pipeline 101 | #' @export 102 | format_stan_opts <- function(sampler_opts, seed) { 103 |   expected_stan_args <- c( 104 |     "cores", 105 |     "chains", 106 |     "iter_warmup", 107 |     "iter_sampling", 108 |     "adapt_delta", 109 |     "max_treedepth" 110 |   ) 111 |   missing_keys <- !(expected_stan_args %in% names(sampler_opts)) 112 |   missing_elements <- vapply(sampler_opts[expected_stan_args], rlang::is_null, logical(1)) 113 |   if (any(missing_keys) || any(missing_elements)) { 114 |     cli::cli_abort(c( 115 |       "Missing expected keys/values in {.val sampler_opts}", 116 |       "Missing keys: {.val {expected_stan_args[missing_keys]}}", 117 |       "Missing values: {.val {expected_stan_args[missing_elements]}}" 118 |     )) 119 |   } 120 |   EpiNow2::stan_opts( 121 |     cores = sampler_opts[["cores"]], 122 |     chains = sampler_opts[["chains"]], 123 |     seed = seed, 124 |     warmup = sampler_opts[["iter_warmup"]], 125 |     samples = sampler_opts[["iter_sampling"]], 126 |     control = list( 127 |       adapt_delta = sampler_opts[["adapt_delta"]], 128 |       max_treedepth = sampler_opts[["max_treedepth"]] 129 |     ) 130 |   ) 131 | } 132 | -------------------------------------------------------------------------------- /R/read_data.R: -------------------------------------------------------------------------------- 1 | #' Read in the dataset of incident case counts 2 | #' 3 | #' Each row of the table corresponds to a single facility's cases for a 4 | #' reference-date/report-date/disease tuple. We want to aggregate these counts 5 | #' to the level of geographic aggregate/report-date/reference-date/disease. 6 | #' 7 | #' We handle two distinct cases for geographic aggregates: 8 | #' 9 | #' 1. A single state: Subset to facilities **in that state only** and aggregate 10 | #' up to the state level 11 | #' 2. The US overall: Aggregate over all facilities without any subsetting 12 | #' 13 | #' Note that we do _not_ apply exclusions here. The exclusions are applied 14 | #' later, after the aggregations. That means that for the US overall, we 15 | #' aggregate over points that might potentially be excluded at the state level. 16 | #' Our recourse in this case is to exclude the US overall aggregate point. 17 | #' 18 | #' @param data_path The path to the local file. This could contain a glob and 19 | #' must be in parquet format. 20 | #' @inheritParams Config 21 | #' 22 | #' @return A dataframe with one or more rows and columns `report_date`, 23 | #' `reference_date`, `geo_value`, `confirm` 24 | #' @family read_data 25 | #' @export 26 | read_data <- function( 27 |   data_path, 28 |   disease = c("COVID-19", "Influenza", "RSV", "test"), 29 |   geo_value, 30 |   report_date, 31 |   max_reference_date, 32 |   min_reference_date 33 | ) { 34 |   rlang::arg_match(disease) 35 |   # NOTE: this is a temporary workaround until we switch to the new API. I'm not 36 |   # sure if there's a better way to do this without a whole bunch of special 37 |   # casing -- which is its own code smell. I think this should really be handled 38 |   # upstream in the ETL job and standardize on "COVID-19", but that's beyond 39 |   # scope here and we need to do _something_ in the meantime so this runs.
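  # In other words: a config that requests "COVID-19" is looked up in the raw
  # parquet data under the current convention, "COVID-19/Omicron", and the SQL
  # below maps the value back to "COVID-19" in the returned data frame.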
40 |   disease_map <- c( 41 |     "COVID-19" = "COVID-19/Omicron", 42 |     "Influenza" = "Influenza", 43 |     "RSV" = "RSV", 44 |     "test" = "test" 45 |   ) 46 |   mapped_disease <- disease_map[[disease]] 47 | 48 |   check_file_exists(data_path) 49 | 50 |   parameters <- list( 51 |     data_path = data_path, 52 |     disease = mapped_disease, 53 |     min_ref_date = stringify_date(min_reference_date), 54 |     max_ref_date = stringify_date(max_reference_date), 55 |     report_date = stringify_date(report_date) 56 |   ) 57 | 58 |   # We need different queries for the states and the US overall. For US overall 59 |   # we need to aggregate over all the facilities in all the states. For the 60 |   # states, we need to aggregate over all the facilities in that one state 61 |   if (geo_value == "US") { 62 |     query <- " 63 |     SELECT 64 |       report_date, 65 |       reference_date, 66 |       CASE 67 |         WHEN disease = 'COVID-19/Omicron' THEN 'COVID-19' 68 |         ELSE disease 69 |       END AS disease, 70 |       -- We want to inject 'US' as our abbreviation here because the data is not agg'd 71 |       'US' AS geo_value, 72 |       sum(value) AS confirm 73 |     FROM read_parquet(?) 74 |     WHERE 1=1 75 |       AND disease = ? 76 |       AND metric = 'count_ed_visits' 77 |       AND reference_date >= ? :: DATE 78 |       AND reference_date <= ? :: DATE 79 |       AND report_date = ? :: DATE 80 |     GROUP BY reference_date, report_date, disease 81 |     ORDER BY reference_date 82 |     " 83 |   } else { 84 |     # We want just one state, so aggregate over facilities in that one state only 85 |     query <- " 86 |     SELECT 87 |       report_date, 88 |       reference_date, 89 |       CASE 90 |         WHEN disease = 'COVID-19/Omicron' THEN 'COVID-19' 91 |         ELSE disease 92 |       END AS disease, 93 |       geo_value AS geo_value, 94 |       sum(value) AS confirm, 95 |     FROM read_parquet(?) 96 |     WHERE 1=1 97 |       AND disease = ? 98 |       AND metric = 'count_ed_visits' 99 |       AND reference_date >= ? :: DATE 100 |       AND reference_date <= ? :: DATE 101 |       AND report_date = ? :: DATE 102 |       AND geo_value = ?
103 | GROUP BY geo_value, reference_date, report_date, disease 104 | ORDER BY reference_date 105 | " 106 | # Append `geo_value` to the query 107 | parameters <- c(parameters, list(geo_value = geo_value)) 108 | } 109 | 110 | con <- DBI::dbConnect(duckdb::duckdb()) 111 | on.exit(expr = DBI::dbDisconnect(con)) 112 | df <- rlang::try_fetch( 113 | DBI::dbGetQuery( 114 | con, 115 | statement = query, 116 | params = unname(parameters) 117 | ), 118 | error = function(con) { 119 | cli::cli_abort( 120 | c( 121 | "Error fetching data from {.path {data_path}}", 122 | "Using parameters:", 123 | "*" = "data_path: {.path {parameters[['data_path']]}}", 124 | "*" = "mapped_disease: {.val {parameters[['disease']]}}", 125 | "*" = "min_reference_date: {.val {parameters[['min_ref_date']]}}", 126 | "*" = "max_reference_date: {.val {parameters[['max_ref_date']]}}", 127 | "*" = "report_date: {.val {parameters[['report_date']]}}", 128 | "Original error: {con}" 129 | ), 130 | class = "wrapped_invalid_query" 131 | ) 132 | } 133 | ) 134 | 135 | # Guard against empty return 136 | if (nrow(df) == 0) { 137 | cli::cli_abort( 138 | c( 139 | "No data matching returned from {.path {data_path}}", 140 | "Using parameters {parameters}" 141 | ), 142 | class = "empty_return" 143 | ) 144 | } 145 | # Warn for incomplete return 146 | n_rows_expected <- as.Date(max_reference_date) - 147 | as.Date(min_reference_date) + 148 | 1 149 | if (nrow(df) != n_rows_expected) { 150 | expected_dates <- seq.Date( 151 | from = as.Date(min_reference_date), 152 | to = as.Date(max_reference_date), 153 | by = "day" 154 | ) 155 | missing_dates <- stringify_date( 156 | # Setdiff strips the date attribute from the objects; re-add it so that we 157 | # can pretty-format the date for printing 158 | as.Date( 159 | setdiff(expected_dates, df[["reference_date"]]) 160 | ) 161 | ) 162 | cli::cli_warn( 163 | c( 164 | "Incomplete number of rows returned", 165 | "Expected {.val {n_rows_expected}} rows", 166 | "Observed {.val {nrow(df)}} rows", 167 | "Missing reference date(s): {missing_dates}" 168 | ), 169 | class = "incomplete_return" 170 | ) 171 | } 172 | 173 | cli::cli_alert_success("Read {nrow(df)} rows from {.path {data_path}}") 174 | return(df) 175 | } 176 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | #' DuckDB date comparison fails if the dates are not in string format 2 | #' @noRd 3 | stringify_date <- function(date) { 4 | if (inherits(date, "Date")) { 5 | format(date, "%Y-%m-%d") 6 | } else { 7 | date 8 | } 9 | } 10 | 11 | check_file_exists <- function(data_path) { 12 | # Guard against file does not exist 13 | cli::cli_alert("Reading data from {.path {data_path}}") 14 | if (!file.exists(data_path)) { 15 | cli::cli_abort( 16 | "Cannot read data. 
File {.path {data_path}} doesn't exist", 17 | class = "file_not_found" 18 | ) 19 | } 20 | invisible(data_path) 21 | } 22 | 23 | #' If `x` is null or empty, return an empty string, otherwise `x` 24 | #' @noRd 25 | empty_str_if_non_existent <- function(x) { 26 | ifelse(rlang::is_empty(x), "", x) 27 | } 28 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://cdcgov.github.io/cfa-epinow2-pipeline/ 2 | template: 3 | bootstrap: 5 4 | 5 | reference: 6 | - title: Azure 7 | desc: Functions which manage interaction with Azure blob 8 | contents: 9 | - has_concept("azure") 10 | - title: Data 11 | desc: Example data included in the package 12 | contents: 13 | - has_concept("data") 14 | - title: Configuration 15 | desc: Manages the input of all configuration settings into the `EpiNow2` model 16 | contents: 17 | - has_concept("config") 18 | - title: Exclusions 19 | desc: Functions to handle exclusion of data from models 20 | contents: 21 | - has_concept("exclusions") 22 | - title: Diagnostics 23 | desc: Functions to calculate diagnostics from fitted `EpiNow2` model 24 | contents: 25 | - has_concept("diagnostics") 26 | - title: Parameter 27 | desc: Functions for parameter values that are input into the `EpiNow2` model 28 | contents: 29 | - has_concept("parameters") 30 | - title: Pipeline 31 | desc: Functions to orchestrate running of the pipeline including fitting the 32 | `EpiNow2` model 33 | contents: 34 | - has_concept("pipeline") 35 | - title: Read data 36 | desc: Functions for data that are input into the `EpiNow2` model 37 | contents: 38 | - has_concept("read_data") 39 | - title: Write output 40 | desc: Functions for post-processing and writing `EpiNow2` model output 41 | contents: 42 | - has_concept("write_output") 43 | -------------------------------------------------------------------------------- /air.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CDCgov/cfa-epinow2-pipeline/349cbea090a01498bb32a810ae357875359ff8b2/air.toml -------------------------------------------------------------------------------- /azure/generate_configs.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.13" 3 | # dependencies = [ 4 | # "cfa-config-generator", 5 | # "typer", 6 | # ] 7 | # 8 | # [tool.uv.sources] 9 | # cfa-config-generator = { git = "https://github.com/CDCgov/cfa-config-generator" } 10 | # /// 11 | 12 | 13 | from datetime import date, datetime, timedelta, timezone 14 | from typing import Annotated 15 | 16 | import typer 17 | from cfa_config_generator.utils.epinow2.driver_functions import generate_config 18 | 19 | 20 | def main( 21 | state: Annotated[ 22 | str, typer.Option(help="State to generate config for", show_default=False) 23 | ], 24 | disease: Annotated[ 25 | str, typer.Option(help="Disease to generate config for", show_default=False) 26 | ], 27 | job_id: Annotated[str, typer.Option(help="Job ID to use", show_default=False)], 28 | report_date_str: Annotated[ 29 | str, 30 | typer.Option( 31 | help="Report date in ISO format to generate config for", show_default=False 32 | ), 33 | ], 34 | output_container: Annotated[ 35 | str, 36 | typer.Option(help="Output container to upload config to", show_default=False), 37 | ], 38 | input_container: Annotated[ 39 | str, 40 | typer.Option(help="Input container to download 
config from"), 41 | ] = "nssp-etl", 42 | production_date_str: Annotated[ 43 | str, 44 | typer.Option( 45 | help="Production date in ISO format. Default is today", show_default=False 46 | ), 47 | ] = date.today().isoformat(), 48 | ): 49 | """ 50 | Generate and upload config files for the epinow2 pipeline. 51 | """ 52 | report_date: date = date.fromisoformat(report_date_str) 53 | production_date: date = date.fromisoformat(production_date_str) 54 | now: datetime = datetime.now(timezone.utc) 55 | 56 | # Make sure the job ID is not empty. 57 | if not job_id: 58 | raise ValueError("Job ID cannot be empty") 59 | 60 | # Generate and upload to blob for all states and diseases. 61 | generate_config( 62 | state=state, 63 | disease=disease, 64 | report_date=report_date, 65 | reference_dates=[ 66 | report_date - timedelta(days=1), 67 | report_date - timedelta(weeks=8), 68 | ], 69 | data_path=f"gold/{report_date.isoformat()}.parquet", 70 | data_container=input_container, 71 | production_date=production_date, 72 | job_id=job_id, 73 | as_of_date=now.isoformat(), 74 | output_container=output_container, 75 | ) 76 | 77 | 78 | if __name__ == "__main__": 79 | typer.run(main) 80 | -------------------------------------------------------------------------------- /azure/generate_rerun_configs.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.13" 3 | # dependencies = [ 4 | # "cfa-config-generator", 5 | # "typer", 6 | # ] 7 | # 8 | # [tool.uv.sources] 9 | # cfa-config-generator = { git = "https://github.com/CDCgov/cfa-config-generator" } 10 | # /// 11 | 12 | 13 | from datetime import date, datetime, timedelta, timezone 14 | from typing import Annotated 15 | 16 | import typer 17 | from cfa_config_generator.utils.epinow2.driver_functions import generate_rerun_config 18 | 19 | 20 | def main( 21 | job_id: Annotated[str, typer.Option(help="Job ID to use", show_default=False)], 22 | report_date_str: Annotated[ 23 | str, 24 | typer.Option( 25 | help="Report date in ISO format to generate config for", show_default=False 26 | ), 27 | ], 28 | output_container: Annotated[ 29 | str, 30 | typer.Option(help="Output container to upload config to", show_default=False), 31 | ], 32 | input_container: Annotated[ 33 | str, 34 | typer.Option(help="Input container to download config from"), 35 | ] = "nssp-etl", 36 | production_date_str: Annotated[ 37 | str, 38 | typer.Option( 39 | help="Production date in ISO format. Default is today", show_default=False 40 | ), 41 | ] = date.today().isoformat(), 42 | data_exclusions_path: Annotated[ 43 | str | None, 44 | typer.Option( 45 | help=( 46 | "Path to data exclusions file." 47 | " Default is to use the report date to generate the path." 48 | " You almost certainly do not want to change from this default." 49 | ), 50 | show_default=False, 51 | ), 52 | ] = None, 53 | ): 54 | """ 55 | Generate and upload config files for rerunning the epinow2 pipeline. 56 | """ 57 | report_date: date = date.fromisoformat(report_date_str) 58 | production_date: date = date.fromisoformat(production_date_str) 59 | now: datetime = datetime.now(timezone.utc) 60 | 61 | # Make sure the job ID is not empty. 62 | if not job_id: 63 | raise ValueError("Job ID cannot be empty") 64 | 65 | # Generate and upload to blob for all states and diseases. 
66 | generate_rerun_config( 67 | state="all", 68 | disease="all", 69 | report_date=report_date, 70 | reference_dates=[ 71 | report_date - timedelta(days=1), 72 | report_date - timedelta(weeks=8), 73 | ], 74 | data_path=f"gold/{report_date.isoformat()}.parquet", 75 | data_container=input_container, 76 | production_date=production_date, 77 | job_id=job_id, 78 | as_of_date=now.isoformat(), 79 | output_container=output_container, 80 | data_exclusions_path=data_exclusions_path, 81 | ) 82 | 83 | 84 | if __name__ == "__main__": 85 | typer.run(main) 86 | -------------------------------------------------------------------------------- /azure/job.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.13" 3 | # dependencies = [ 4 | # "azure-batch==14.2.0", 5 | # "azure-identity==1.21.0", 6 | # "azure-storage-blob==12.25.1", 7 | # "msrest==0.7.1", 8 | # ] 9 | # /// 10 | import datetime 11 | import os 12 | import time 13 | import uuid 14 | 15 | from msrest.authentication import BasicTokenAuthentication 16 | 17 | import azure.batch.models as batchmodels 18 | from azure.batch import BatchServiceClient 19 | from azure.identity import DefaultAzureCredential 20 | from azure.storage.blob import BlobServiceClient 21 | 22 | 23 | def main(image_name: str, config_container: str, pool_id: str, job_id: str): 24 | """ 25 | Submit a job 26 | 27 | Arguments 28 | ---------- 29 | image_name: str 30 | The name of the container image (and tag) to use for the job 31 | config_container: str 32 | The name of the storage container for the job to output to 33 | pool_id: str 34 | The name of the pool to use for the job 35 | job_id: str 36 | The name of the job to use for the job. 37 | """ 38 | blob_account = os.environ["BLOB_ACCOUNT"] 39 | blob_url = f"https://{blob_account}.blob.core.windows.net" 40 | batch_account = os.environ["BATCH_ACCOUNT"] 41 | batch_url = f"https://{batch_account}.eastus.batch.azure.com" 42 | 43 | # Authenticate with workaround because Batch is the one remaining 44 | # service that doesn't yet support Azure auth flow v2 :) :) 45 | # https://github.com/Azure/azure-sdk-for-python/issues/30468 46 | credential_v2 = DefaultAzureCredential() 47 | token = { 48 | "access_token": credential_v2.get_token( 49 | "https://batch.core.windows.net/.default" 50 | ).token 51 | } 52 | credential_v1 = BasicTokenAuthentication(token) 53 | 54 | batch_client = BatchServiceClient(credentials=credential_v1, batch_url=batch_url) 55 | 56 | ############# 57 | # Set up job 58 | batch_job_id = pool_id 59 | job = batchmodels.JobAddParameter( 60 | id=batch_job_id, pool_info=batchmodels.PoolInformation(pool_id=pool_id) 61 | ) 62 | 63 | try: 64 | batch_client.job.add(job) 65 | except batchmodels.BatchErrorException as err: 66 | if err.error.code != "JobExists": 67 | raise 68 | else: 69 | print("Job already exists. 
Using job object") 70 | 71 | ########## 72 | # Get tasks 73 | blob_service_client = BlobServiceClient(blob_url, credential_v2) 74 | container_client = blob_service_client.get_container_client( 75 | container=config_container 76 | ) 77 | 78 | task_configs: list[str] = [ 79 | b.name for b in container_client.list_blobs() if job_id in b.name 80 | ] 81 | if len(task_configs) > 0: 82 | print(f"Creating {len(task_configs)} tasks in job {job_id} on pool {pool_id}") 83 | elif len(task_configs) == 0: 84 | raise ValueError("No tasks found") 85 | 86 | ########### 87 | # Set up tasks on job 88 | task_container_settings = batchmodels.TaskContainerSettings( 89 | image_name=image_name, container_run_options="--rm --workdir /" 90 | ) 91 | task_env_settings = [ 92 | batchmodels.EnvironmentSetting( 93 | name="az_tenant_id", value=os.environ["AZURE_TENANT_ID"] 94 | ), 95 | batchmodels.EnvironmentSetting( 96 | name="az_client_id", value=os.environ["AZURE_CLIENT_ID"] 97 | ), 98 | batchmodels.EnvironmentSetting( 99 | name="az_service_principal", value=os.environ["AZURE_CLIENT_SECRET"] 100 | ), 101 | ] 102 | 103 | # Run task at the admin level to be able to read/write to mounted drives 104 | user_identity = batchmodels.UserIdentity( 105 | auto_user=batchmodels.AutoUserSpecification( 106 | scope=batchmodels.AutoUserScope.pool, 107 | elevation_level=batchmodels.ElevationLevel.admin, 108 | ) 109 | ) 110 | 111 | for config_path in task_configs: 112 | command = f"Rscript -e \"CFAEpiNow2Pipeline::orchestrate_pipeline('{config_path}', config_container = '{config_container}', input_dir = '/mnt/input', output_dir = '/mnt/output')\"" 113 | task = batchmodels.TaskAddParameter( 114 | id=str(uuid.uuid4()), 115 | command_line=command, 116 | container_settings=task_container_settings, 117 | environment_settings=task_env_settings, 118 | user_identity=user_identity, 119 | ) 120 | 121 | batch_client.task.add(batch_job_id, task) 122 | 123 | 124 | if __name__ == "__main__": 125 | from argparse import ArgumentParser 126 | 127 | parser = ArgumentParser( 128 | description="Submit a job to Azure Batch with the specified image and config container" 129 | ) 130 | parser.add_argument( 131 | "--image_name", 132 | type=str, 133 | help="The name of the container image (and tag) to use for the job", 134 | required=True, 135 | ) 136 | parser.add_argument( 137 | "--config_container", 138 | type=str, 139 | help="The name of the storage container for the job to output to", 140 | required=True, 141 | ) 142 | parser.add_argument( 143 | "--pool_id", 144 | type=str, 145 | help="The name of the pool to use for the job", 146 | required=True, 147 | ) 148 | parser.add_argument( 149 | "--job_id", 150 | type=str, 151 | help="The name of the job to use for the job. 
Defaults to pool_id", 152 | default=None, 153 | ) 154 | 155 | # Parse the args 156 | args = parser.parse_args() 157 | image_name: str = args.image_name 158 | config_container: str = args.config_container 159 | pool_id: str = args.pool_id 160 | # Use pool_id as job_id if not specified 161 | job_id: str = args.job_id or pool_id 162 | 163 | main( 164 | image_name=image_name, 165 | config_container=config_container, 166 | pool_id=pool_id, 167 | job_id=job_id, 168 | ) 169 | -------------------------------------------------------------------------------- /azure/requirements.txt: -------------------------------------------------------------------------------- 1 | adal==1.2.7 2 | azure-batch==14.2.0 3 | azure-common==1.1.28 4 | azure-core==1.32.0 5 | azure-identity==1.19.0 6 | azure-keyvault==4.2.0 7 | azure-keyvault-certificates==4.9.0 8 | azure-keyvault-keys==4.10.0 9 | azure-keyvault-secrets==4.9.0 10 | azure-mgmt-batch==18.0.0 11 | azure-mgmt-core==1.5.0 12 | azure-storage-blob==12.24.0 13 | certifi==2024.8.30 14 | cffi==1.17.1 15 | charset-normalizer==3.4.0 16 | cryptography==44.0.1 17 | idna==3.10 18 | isodate==0.7.2 19 | msal==1.31.1 20 | msal-extensions==1.2.0 21 | msrest==0.7.1 22 | msrestazure==0.6.4.post1 23 | oauthlib==3.2.2 24 | portalocker==2.10.1 25 | pycparser==2.22 26 | PyJWT==2.10.1 27 | python-dateutil==2.9.0.post0 28 | requests==2.32.3 29 | requests-oauthlib==2.0.0 30 | six==1.17.0 31 | toml==0.10.2 32 | typing_extensions==4.12.2 33 | urllib3==2.2.3 34 | -------------------------------------------------------------------------------- /code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Creating a Culture of Innovation 2 | We aspire to create a culture where people work joyfully, communicate openly 3 | about things that matter, and provide great services globally. We would like our 4 | team and communities (both government and private sector) to reflect on 5 | diversity of all kinds, not just the classes protected in law. Diversity fosters 6 | innovation. Diverse teams are creative teams. We need a diversity of perspective 7 | to create solutions for the challenges we face. 8 | 9 | This is our code of conduct (adapted from [18F's Code of Conduct](https://github.com/18F/code-of-conduct)). 10 | We follow all Equal Employment Opportunity laws and we expect everyone we work 11 | with to adhere to the [GSA Anti-harassment Policy](http://www.gsa.gov/portal/directive/d0/content/512516), 12 | even if they do not work for the Centers for Disease Control and Prevention or 13 | GSA. We expect every user to follow this code of conduct and the laws and 14 | policies mentioned above. 15 | 16 | ## Be Empowering 17 | Consider what you can do to encourage and support others. Make room for quieter 18 | voices to contribute. Offer support and enthusiasm for great ideas. Leverage the 19 | low cost of experimentation to support your colleagues' ideas, and take care to 20 | acknowledge the original source. Look for ways to contribute and collaborate, 21 | even in situations where you normally wouldn't. Share your knowledge and skills. 22 | Prioritize access for and input from those who are traditionally excluded from 23 | the civic process. 24 | 25 | ## Rules of Behavior 26 | * I understand that I must complete security awareness and records management 27 | training annually in order to comply with the latest security and records 28 | management policies. 
29 | * I understand that I must also follow the [Rules of Behavior for use of HHS Information Resources](http://www.hhs.gov/ocio/policy/hhs-rob.html) 30 | * I understand that I must not use, share, or store any kind of sensitive data 31 |   (health status, provision or payment of healthcare, PII, etc.) under ANY 32 |   circumstance. 33 | * I will not knowingly conceal, falsify, or remove information. 34 | * I understand that I can only use non-sensitive and/or publicly available 35 |   data. 36 | * I understand that all passwords I create to set up accounts need to comply 37 |   with CDC's password policy. 38 | * I understand that the stewards reserve the right to moderate all data at any 39 |   time. 40 | 41 | ## Boundaries 42 | Create boundaries to your own behavior and consider how you can create a safe 43 | space that helps prevent unacceptable behavior by others. We can't list all 44 | instances of unacceptable behavior, but we can provide examples to help guide 45 | our community in thinking through how to respond when we experience these types 46 | of behavior, whether directed at ourselves or others. 47 | 48 | If you are unsure if something is appropriate behavior, it probably is not. Each 49 | person we interact with can define where the line is for them. Impact matters 50 | more than intent. Ensuring that your behavior does not have a negative impact is 51 | your responsibility. Problems usually arise when we assume that our way of 52 | thinking or behavior is the norm for everyone. 53 | 54 | ### Here are some examples of unacceptable behavior 55 | * Negative or offensive remarks based on the protected classes as listed in the 56 |   GSA Anti-harassment Policy of race, religion, color, sex, national origin, 57 |   age, disability, genetic information, sexual orientation, gender identity, 58 |   parental status, marital status, and political affiliation as well as gender 59 |   expression, mental illness, socioeconomic status or backgrounds, 60 |   neuro(a)typicality, physical appearance, body size, or clothing. Consider 61 |   that calling attention to differences can feel alienating. 62 | * Sustained disruption of meetings, talks, or discussions, including chatrooms. 63 | * Patronizing language or behavior. 64 | * Aggressive behavior, such as unconstructive criticism, providing corrections 65 |   that do not improve the conversation (sometimes referred to as "well 66 |   actually's"), repeatedly interrupting or talking over someone else, feigning 67 |   surprise at someone's lack of knowledge or awareness about a topic, or subtle 68 |   prejudice. 69 | * Referring to people in a way that misidentifies their gender and/or rejects 70 |   the validity of their gender identity; for instance by using incorrect 71 |   pronouns or forms of address (misgendering). 72 | * Retaliating against anyone who files a formal complaint that someone has 73 |   violated these codes or laws. 74 | 75 | ## Background 76 | CDC Scientific Clearance is the process of obtaining approvals by appropriate 77 | CDC officials before a CDC information product is released to the public or 78 | CDC's external public health partners. Information products that require formal 79 | clearance include print, electronic, or oral materials that CDC employees 80 | author or co-author, whether published by CDC or outside CDC. CDC contractors 81 | developing content on behalf of CDC for the public or CDC's external public 82 | health partners are also required to put their content through the formal 83 | clearance process.
The collaborative functions related to the projects include 84 | blogs, wikis, forums, bug tracking sites, source control and 85 | others deemed as necessary. 86 | 87 | For those individuals within the CDC, adherence to the following policies are 88 | required: 89 | * CDC ["Clearance of Information Products Disseminated Outside CDC for Public Use"](http://www.cdc.gov/maso/Policy/PublicUse.pdf) 90 | * HHS ["Ensuring the Quality of Information Disseminated by HHS agencies"](http://aspe.hhs.gov/infoquality) 91 | 92 | All collaborative materials will be controlled by the rules contained within 93 | this document. This will allow for the real-time collaboration opportunities 94 | among CDC employees, CDC contractors and CDC public health partners. 95 | 96 | ## Credit 97 | This code of conduct was mainly adapted from [18F's Code of Conduct](https://github.com/18F/code-of-conduct) 98 | and the [CDC's Informatics Innovation Unit R&D Lab's code of conduct.](https://www.philab.cdc.gov/index.php/code-of-conduct/) 99 | 100 | ## Relevant Legal Considerations 101 | * [Laws enforced by the Equal Employment Opportunity Commission](http://www.eeoc.gov/laws/statutes/index.cfm) 102 | * [Types of discrimination prohibited by law](http://www.eeoc.gov/laws/types) 103 | * [New and proposed regulations](http://www.eeoc.gov/laws/regulations/index.cfm) 104 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: 2 | layout: "condensed_header, condensed_files, condensed_footer" # add "condensed_" to "header", "files" and "footer" 3 | hide_project_coverage: TRUE # set to true 4 | 5 | coverage: 6 | status: 7 | project: 8 | default: 9 | target: auto 10 | threshold: 1% 11 | informational: true 12 | patch: 13 | default: 14 | target: auto 15 | threshold: 1% 16 | informational: true 17 | -------------------------------------------------------------------------------- /container-app-jobs/README.md: -------------------------------------------------------------------------------- 1 | # Container App Job Tools 2 | 3 | This directory contains tools related to executing this pipeline in Azure as a Container App Job. 4 | 5 | ## job-template.yaml 6 | 7 | The *job-template-yaml* file can be passed to the Azure CLI to start a Container App Job from the command line. This allows a user to quickly kick off specific jobs from a WSL console. 8 | 9 | If not previously installed, refer to the documentation [here](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli-linux?pivots=apt) for installation instructions on the CLI itself. The command in Option 1 is the best way to accomplish this: 10 | 11 | ```bash 12 | curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash 13 | ``` 14 | 15 | Update the *job-template.yaml* file with the Azure tenant and client ids, as well as the config file to execute the job on. The job can then be started from the CLI with the following command: 16 | 17 | ```bash 18 | az containerapp job start --name 'cfa-epinow2-test-caj' --resource-group 'EXT-EDAV-CFA-PRD' --yaml job-template.yaml 19 | ``` 20 | 21 | This command will start the job and return metadata including the newly created job's id. Refer to the Azure portal in a browser to track status and results. 22 | 23 | ## blob-config-runner 24 | 25 | The *blob-config-runner* directory contains a Python tool that can start multiple jobs at once. 
It looks for files within a specified Azure Blob Storage container, presents them to the user for interactive selection, and runs a job on each once confirmed. 26 | 27 | This tool requires Python 3, which is already installed within WSL. A virtual environment using *venv* is recommended for execution, which can be installed with *apt*. To initialize the environment and necessary libraries, run the following command from the directory: 28 | 29 | ```bash 30 | python3 -m venv .venv 31 | .venv/bin/python3 -m pip install -r requirements.txt 32 | ``` 33 | 34 | Enter the *config.ini* file's client, tenant, and subscription id values within the Azure section. Update the container name and prefix as needed for this specific run. The env_vars section should not be updated, as these are used by the script to replace the values. The tool can now be run as follows: 35 | 36 | ```bash 37 | .venv/bin/python3 start-jobs.py 38 | ``` 39 | 40 | **Note:** This tool identifies config files by looking for a suffix of *-config.json*. This logic could be updated to instead look for tags or metadata, if files were appropriately identified as such within Azure. 41 | -------------------------------------------------------------------------------- /container-app-jobs/blob-config-runner/config.ini: -------------------------------------------------------------------------------- 1 | [env_vars] 2 | config_file_key = <<_config_file_>> 3 | tenant_id_key = <<_tenant_id_>> 4 | tenant_id_label = az_tenant_id 5 | client_id_key = <<_client_id_>> 6 | client_id_label = az_client_id 7 | sp_label = az_service_principal 8 | sp_ref = az-service-principal 9 | 10 | [azure] 11 | account_url = https://cfaazurebatchprd.blob.core.windows.net 12 | container_name = rt-epinow2-config 13 | prefix = Rt-estimation-20250124_172623/configs/ 14 | tenant_id_value = 15 | client_id_value = 16 | subscription_id = 17 | 18 | [caj] 19 | name = cfa-epinow2-test-caj 20 | resource_group = EXT-EDAV-CFA-PRD 21 | command = /pkg/start.sh 22 | image = cfaprdbatchcr.azurecr.io/cfa-epinow2-pipeline:latest 23 | cpu = 4.0 24 | memory = 8Gi 25 | -------------------------------------------------------------------------------- /container-app-jobs/blob-config-runner/requirements.txt: -------------------------------------------------------------------------------- 1 | azure-storage-blob 2 | azure-identity 3 | azure-mgmt-appcontainers 4 | -------------------------------------------------------------------------------- /container-app-jobs/job-template.yaml: -------------------------------------------------------------------------------- 1 | # Template file for starting an Azure Container App job running this workflow. 2 | # A Container App Job must be created and defined in Azure, and its name and 3 | # resource group passed to the command with this template to start the job execution. 4 | # Config file, tenant id, and client id need to be set before running. 
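# The <<_config_file_>>, <<_tenant_id_>>, and <<_client_id_>> placeholders can
# be filled in by hand or replaced automatically by the blob-config-runner
# script, which reads the same placeholder keys from its config.ini.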
5 | # Usage: 6 | # az containerapp job start --name <<_job_name_>> --resource-group <<_rg_>> --yaml job-template.yaml 7 | 8 | containers: 9 | - args: ["<<_config_file_>>"] 10 | command: 11 | - /pkg/start.sh 12 | env: 13 | - name: az_tenant_id 14 | value: <<_tenant_id_>> 15 | - name: az_client_id 16 | value: <<_client_id_>> 17 | - name: az_service_principal 18 | secretRef: az-service-principal # pragma: allowlist secret 19 | image: cfaprdbatchcr.azurecr.io/cfa-epinow2-pipeline:latest 20 | name: cfa-epinow2-test-caj 21 | resources: 22 | cpu: 4 23 | memory: 8Gi 24 | -------------------------------------------------------------------------------- /data-raw/convert_gostic_toy_rt_to_test_dataset.R: -------------------------------------------------------------------------------- 1 | load("data/gostic_toy_rt.rda") 2 | gostic_toy_rt[["reference_date"]] <- as.Date("2023-01-01") + 3 | gostic_toy_rt[["time"]] 4 | gostic_toy_rt[["report_date"]] <- max(gostic_toy_rt[["reference_date"]]) + 1 5 | 6 | con <- DBI::dbConnect(duckdb::duckdb()) 7 | 8 | duckdb::duckdb_register(con, "gostic_toy_rt", gostic_toy_rt) 9 | dbExecute( 10 | con, 11 | " 12 | COPY ( 13 | SELECT 14 | obs_incidence AS value, 15 | 'test' AS geo_value, 16 | 'test' AS disease, 17 | 'count_ed_visits' AS metric, 18 | reference_date, 19 | report_date 20 | FROM gostic_toy_rt 21 | ORDER BY reference_date 22 | LIMIT 150 23 | ) TO 24 | 'tests/testthat/data/test_data.parquet' (FORMAT PARQUET) 25 | ; 26 | " 27 | ) 28 | 29 | # Repeat for US overall 30 | dbExecute( 31 | con, 32 | " 33 | COPY ( 34 | SELECT 35 | obs_incidence AS value, 36 | 'US' AS geo_value, 37 | 'test' AS disease, 38 | 'count_ed_visits' AS metric, 39 | reference_date, 40 | report_date 41 | FROM gostic_toy_rt 42 | ORDER BY reference_date 43 | LIMIT 150 44 | ) TO 45 | 'tests/testthat/data/us_overall_test_data.parquet' (FORMAT PARQUET) 46 | ; 47 | " 48 | ) 49 | dbDisconnect(con) 50 | -------------------------------------------------------------------------------- /data-raw/sir_gt_pmf.R: -------------------------------------------------------------------------------- 1 | # E and I compartments both with exponentially distributed residence times 2 | # with a mean of 4 days. 
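# Their sum (the generation interval) is therefore gamma-distributed with
# shape = 2 and rate = 1/4, i.e. a mean of 8 days, which is what the
# parameters below encode.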
3 | shape <- 2 4 | rate <- 1 / 4 5 | 6 | sir_gt_pmf <- primarycensored::dpcens( 7 | 0:26, 8 | pgamma, 9 | shape = shape, 10 | rate = rate, 11 | D = 27 12 | ) # v0.4.0 13 | 14 | # Drop first element because GI can't have same-day transmission 15 | # and replace with a zero 16 | sir_gt_pmf <- c(0, sir_gt_pmf[2:27]) 17 | 18 | # Renormalize to a proper PMF 19 | while (abs(sum(sir_gt_pmf) - 1) > 1e-10) { 20 | sir_gt_pmf <- sir_gt_pmf / sum(sir_gt_pmf) 21 | } 22 | 23 | usethis::use_data(sir_gt_pmf, overwrite = TRUE) 24 | -------------------------------------------------------------------------------- /data/gostic_toy_rt.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CDCgov/cfa-epinow2-pipeline/349cbea090a01498bb32a810ae357875359ff8b2/data/gostic_toy_rt.rda -------------------------------------------------------------------------------- /data/sir_gt_pmf.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CDCgov/cfa-epinow2-pipeline/349cbea090a01498bb32a810ae357875359ff8b2/data/sir_gt_pmf.rda -------------------------------------------------------------------------------- /image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CDCgov/cfa-epinow2-pipeline/349cbea090a01498bb32a810ae357875359ff8b2/image.png -------------------------------------------------------------------------------- /man/Config.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/config.R 3 | \name{Config} 4 | \alias{Config} 5 | \title{Config Class} 6 | \usage{ 7 | Config( 8 | job_id = character(0), 9 | task_id = character(0), 10 | min_reference_date = character(0), 11 | max_reference_date = character(0), 12 | report_date = character(0), 13 | production_date = character(0), 14 | disease = character(0), 15 | geo_value = character(0), 16 | geo_type = character(0), 17 | seed = integer(0), 18 | horizon = integer(0), 19 | model = "EpiNow2", 20 | config_version = character(0), 21 | quantile_width = c(0.5, 0.95), 22 | data = Data(), 23 | priors = list(), 24 | parameters = Parameters(), 25 | sampler_opts = list(), 26 | exclusions = Exclusions(), 27 | output_container = character(0) 28 | ) 29 | } 30 | \arguments{ 31 | \item{job_id}{A string specifying the job.} 32 | 33 | \item{task_id}{A string specifying the task.} 34 | 35 | \item{min_reference_date}{A string representing the minimum reference 36 | date. Formatted as "YYYY-MM-DD".} 37 | 38 | \item{max_reference_date}{A string representing the maximum reference 39 | date. Formatted as "YYYY-MM-DD".} 40 | 41 | \item{report_date}{A string representing the report date. Formatted as 42 | "YYYY-MM-DD".} 43 | 44 | \item{production_date}{A string representing the production date. 45 | Formatted as "YYYY-MM-DD".} 46 | 47 | \item{disease}{A string specifying the disease being modeled. 
One of 48 | \code{"COVID-19"} or \code{"Influenza"} or \code{"RSV"}.} 49 | 50 | \item{geo_value}{An uppercase, two-character string specifying the geographic 51 | value, usually a state or \code{"US"} for national data.} 52 | 53 | \item{geo_type}{A string specifying the geographic type, usually "state".} 54 | 55 | \item{seed}{An integer for setting the random seed.} 56 | 57 | \item{horizon}{An integer specifying the forecasting horizon.} 58 | 59 | \item{model}{A string specifying the model to be used.} 60 | 61 | \item{config_version}{A numeric value specifying the configuration version.} 62 | 63 | \item{quantile_width}{A vector of numeric values representing the desired 64 | quantiles. Passed to \code{\link[tidybayes:reexports]{tidybayes::median_qi()}}.} 65 | 66 | \item{data}{An instance of \code{Data} class containing data configurations.} 67 | 68 | \item{priors}{A list of lists. The first level should contain the key \code{rt} 69 | with elements \code{mean} and \code{sd} and the key \code{gp} with element \code{alpha_sd}.} 70 | 71 | \item{parameters}{An instance of \code{Parameters} class containing parameter 72 | configurations.} 73 | 74 | \item{sampler_opts}{A list. The Stan sampler options to be passed through 75 | EpiNow2. It has required keys: \code{cores}, \code{chains}, \code{iter_warmup}, 76 | \code{iter_sampling}, \code{max_treedepth}, and \code{adapt_delta}.} 77 | 78 | \item{exclusions}{An instance of \code{Exclusions} class containing exclusion 79 | criteria.} 80 | 81 | \item{output_container}{An optional string specifying the output blob storage 82 | container.} 83 | } 84 | \description{ 85 | Represents the complete configuration for the pipeline. 86 | } 87 | \seealso{ 88 | Other config: 89 | \code{\link{Data}()}, 90 | \code{\link{Exclusions}()}, 91 | \code{\link{Interval}}, 92 | \code{\link{Parameters}()}, 93 | \code{\link{read_json_into_config}()} 94 | } 95 | \concept{config} 96 | -------------------------------------------------------------------------------- /man/Data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/config.R 3 | \name{Data} 4 | \alias{Data} 5 | \title{Data Class} 6 | \usage{ 7 | Data( 8 | path = character(0), 9 | blob_storage_container = character(0), 10 | report_date = character(0), 11 | reference_date = character(0) 12 | ) 13 | } 14 | \arguments{ 15 | \item{path}{A string specifying the path to the data Parquet file.} 16 | 17 | \item{blob_storage_container}{Optional. The name of the blob storage 18 | container to which the data file will be uploaded. If NULL, no upload will 19 | occur.} 20 | 21 | \item{report_date}{A list of strings representing report dates.} 22 | 23 | \item{reference_date}{A list of strings representing reference dates.} 24 | } 25 | \description{ 26 | Represents the data-related configurations. 
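As a minimal sketch of constructing this piece of the configuration (the path and dates below are hypothetical placeholders, not real inputs):

```r
# Hypothetical values for illustration only.
data_config <- CFAEpiNow2Pipeline::Data(
  path = "gold/2024-11-20.parquet",
  blob_storage_container = NULL,
  report_date = list("2024-11-20"),
  reference_date = list("2024-11-18", "2024-11-19")
)
```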
27 | } 28 | \seealso{ 29 | Other config: 30 | \code{\link{Config}()}, 31 | \code{\link{Exclusions}()}, 32 | \code{\link{Interval}}, 33 | \code{\link{Parameters}()}, 34 | \code{\link{read_json_into_config}()} 35 | } 36 | \concept{config} 37 | -------------------------------------------------------------------------------- /man/Exclusions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/config.R 3 | \name{Exclusions} 4 | \alias{Exclusions} 5 | \title{Exclusions Class} 6 | \usage{ 7 | Exclusions(path = character(0), blob_storage_container = character(0)) 8 | } 9 | \arguments{ 10 | \item{path}{A string specifying the path to a CSV file containing exclusion 11 | data. It should include at least the columns: \code{reference_date}, 12 | \code{report_date}, \code{state}, \code{disease}.} 13 | 14 | \item{blob_storage_container}{Optional. The name of the blob storage 15 | container to get it from. If NULL, will look locally.} 16 | } 17 | \description{ 18 | Represents exclusion criteria for the pipeline. 19 | } 20 | \seealso{ 21 | Other config: 22 | \code{\link{Config}()}, 23 | \code{\link{Data}()}, 24 | \code{\link{Interval}}, 25 | \code{\link{Parameters}()}, 26 | \code{\link{read_json_into_config}()} 27 | } 28 | \concept{config} 29 | -------------------------------------------------------------------------------- /man/Interval.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/config.R 3 | \name{Interval} 4 | \alias{Interval} 5 | \alias{GenerationInterval} 6 | \alias{DelayInterval} 7 | \alias{RightTruncation} 8 | \title{Interval Class} 9 | \usage{ 10 | Interval(path = character(0), blob_storage_container = character(0)) 11 | 12 | GenerationInterval(path = character(0), blob_storage_container = character(0)) 13 | 14 | DelayInterval(path = character(0), blob_storage_container = character(0)) 15 | 16 | RightTruncation(path = character(0), blob_storage_container = character(0)) 17 | } 18 | \arguments{ 19 | \item{path}{A string specifying the path to the generation interval CSV file.} 20 | 21 | \item{blob_storage_container}{Optional. The name of the blob storage 22 | container to get it from. If NULL, will look locally.} 23 | } 24 | \description{ 25 | Represents a generic interval. Meant to be subclassed. 26 | 27 | Represents the generation interval parameters. 28 | 29 | Represents the delay interval parameters. 30 | 31 | Represents the right truncation parameters. 
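A minimal sketch of pointing these classes at a local parameter file (the file name is a hypothetical placeholder):

```r
# Hypothetical path for illustration only.
gi_config  <- CFAEpiNow2Pipeline::GenerationInterval(path = "parameters.parquet")
dly_config <- CFAEpiNow2Pipeline::DelayInterval(path = "parameters.parquet")
rt_config  <- CFAEpiNow2Pipeline::RightTruncation(path = "parameters.parquet")
```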
32 | } 33 | \seealso{ 34 | Other config: 35 | \code{\link{Config}()}, 36 | \code{\link{Data}()}, 37 | \code{\link{Exclusions}()}, 38 | \code{\link{Parameters}()}, 39 | \code{\link{read_json_into_config}()} 40 | 41 | Other config: 42 | \code{\link{Config}()}, 43 | \code{\link{Data}()}, 44 | \code{\link{Exclusions}()}, 45 | \code{\link{Parameters}()}, 46 | \code{\link{read_json_into_config}()} 47 | 48 | Other config: 49 | \code{\link{Config}()}, 50 | \code{\link{Data}()}, 51 | \code{\link{Exclusions}()}, 52 | \code{\link{Parameters}()}, 53 | \code{\link{read_json_into_config}()} 54 | 55 | Other config: 56 | \code{\link{Config}()}, 57 | \code{\link{Data}()}, 58 | \code{\link{Exclusions}()}, 59 | \code{\link{Parameters}()}, 60 | \code{\link{read_json_into_config}()} 61 | } 62 | \concept{config} 63 | -------------------------------------------------------------------------------- /man/Parameters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/config.R 3 | \name{Parameters} 4 | \alias{Parameters} 5 | \title{Parameters Class} 6 | \usage{ 7 | Parameters( 8 | as_of_date = character(0), 9 | generation_interval = GenerationInterval(), 10 | delay_interval = DelayInterval(), 11 | right_truncation = RightTruncation() 12 | ) 13 | } 14 | \arguments{ 15 | \item{as_of_date}{A string representing the as-of date. Formatted as 16 | "YYYY-MM-DD".} 17 | 18 | \item{generation_interval}{An instance of \code{GenerationInterval} class.} 19 | 20 | \item{delay_interval}{An instance of \code{DelayInterval} class.} 21 | 22 | \item{right_truncation}{An instance of \code{RightTruncation} class.} 23 | } 24 | \description{ 25 | Holds all parameter-related configurations for the pipeline. 26 | } 27 | \seealso{ 28 | Other config: 29 | \code{\link{Config}()}, 30 | \code{\link{Data}()}, 31 | \code{\link{Exclusions}()}, 32 | \code{\link{Interval}}, 33 | \code{\link{read_json_into_config}()} 34 | } 35 | \concept{config} 36 | -------------------------------------------------------------------------------- /man/apply_exclusions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exclusions.R 3 | \name{apply_exclusions} 4 | \alias{apply_exclusions} 5 | \title{Convert case counts in matching rows to NA} 6 | \usage{ 7 | apply_exclusions(cases, exclusions) 8 | } 9 | \arguments{ 10 | \item{cases}{A dataframe returned by \code{\link[=read_data]{read_data()}}} 11 | 12 | \item{exclusions}{A dataframe returned by \code{\link[=read_exclusions]{read_exclusions()}}} 13 | } 14 | \value{ 15 | A dataframe with the same rows and schema as \code{cases} where the value 16 | in the column \code{confirm} converted to NA in any rows that match a row in 17 | \code{exclusions} 18 | } 19 | \description{ 20 | Mark selected points to be ignored in model fitting. This manual selection 21 | occurs externally to the pipeline and is passed to the pipeline in an 22 | exclusions file read with \code{\link[=read_exclusions]{read_exclusions()}}. Mechanically, the exclusions 23 | are applied by converting specified points to NAs in the dataset. NAs are 24 | skipped in model fitting by EpiNow2, so matched rows are excluded from the 25 | model likelihood. 
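A minimal sketch of this matching step on made-up data (not the package implementation, which operates on the dataframes returned by read_data() and read_exclusions()):

```r
# Toy inputs for illustration only.
cases <- data.frame(
  reference_date = as.Date("2023-01-01") + 0:2,
  report_date = as.Date("2023-01-04"),
  geo_value = "CA",
  disease = "COVID-19",
  confirm = c(12L, 340L, 15L)
)
exclusions <- data.frame(
  reference_date = as.Date("2023-01-02"),
  report_date = as.Date("2023-01-04"),
  geo_value = "CA",
  disease = "COVID-19"
)
keys <- c("reference_date", "report_date", "geo_value", "disease")
is_excluded <- do.call(paste, cases[keys]) %in% do.call(paste, exclusions[keys])
cases$confirm[is_excluded] <- NA  # NA rows are skipped by EpiNow2 in model fitting
```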
26 | } 27 | \seealso{ 28 | Other exclusions: 29 | \code{\link{read_exclusions}()} 30 | } 31 | \concept{exclusions} 32 | -------------------------------------------------------------------------------- /man/download_file_from_container.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/azure.R 3 | \name{download_file_from_container} 4 | \alias{download_file_from_container} 5 | \title{Download specified blobs from Blob Storage and save them in a local dir} 6 | \usage{ 7 | download_file_from_container( 8 | blob_storage_path, 9 | local_file_path, 10 | storage_container 11 | ) 12 | } 13 | \arguments{ 14 | \item{blob_storage_path}{A character of a blob in \code{storage_container}} 15 | 16 | \item{local_file_path}{The local path to save the blob} 17 | 18 | \item{storage_container}{The blob storage container with \code{blob_storage_path}} 19 | } 20 | \value{ 21 | Invisibly, \code{local_file_path} 22 | } 23 | \description{ 24 | Download specified blobs from Blob Storage and save them in a local dir 25 | } 26 | \seealso{ 27 | Other azure: 28 | \code{\link{download_if_specified}()}, 29 | \code{\link{fetch_blob_container}()}, 30 | \code{\link{fetch_credential_from_env_var}()} 31 | } 32 | \concept{azure} 33 | -------------------------------------------------------------------------------- /man/download_if_specified.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/azure.R 3 | \name{download_if_specified} 4 | \alias{download_if_specified} 5 | \title{Download if specified} 6 | \usage{ 7 | download_if_specified(blob_path, blob_storage_container, dir) 8 | } 9 | \arguments{ 10 | \item{blob_path}{The name of the blob to download} 11 | 12 | \item{blob_storage_container}{The name of the container to download from} 13 | 14 | \item{dir}{The directory to which to write the downloaded file} 15 | } 16 | \value{ 17 | The path of the file 18 | } 19 | \description{ 20 | Download if specified 21 | } 22 | \seealso{ 23 | Other azure: 24 | \code{\link{download_file_from_container}()}, 25 | \code{\link{fetch_blob_container}()}, 26 | \code{\link{fetch_credential_from_env_var}()} 27 | } 28 | \concept{azure} 29 | -------------------------------------------------------------------------------- /man/extract_diagnostics.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/diagnostics.R 3 | \name{extract_diagnostics} 4 | \alias{extract_diagnostics} 5 | \title{Extract diagnostic metrics from model fit and data} 6 | \usage{ 7 | extract_diagnostics(fit, data, job_id, task_id, disease, geo_value, model) 8 | } 9 | \arguments{ 10 | \item{fit}{The model fit object from \code{EpiNow2}} 11 | 12 | \item{data}{A data frame containing the input data used in the model fit.} 13 | 14 | \item{job_id}{A string specifying the job.} 15 | 16 | \item{task_id}{A string specifying the task.} 17 | 18 | \item{disease}{A string specifying the disease being modeled. 
One of 19 | \code{"COVID-19"} or \code{"Influenza"} or \code{"RSV"}.} 20 | 21 | \item{geo_value}{An uppercase, two-character string specifying the geographic 22 | value, usually a state or \code{"US"} for national data.} 23 | 24 | \item{model}{A string specifying the model to be used.} 25 | } 26 | \value{ 27 | A \code{data.frame} containing the extracted diagnostic metrics. The 28 | data frame includes the following columns: 29 | \itemize{ 30 | \item \code{diagnostic}: The name of the diagnostic metric. 31 | \item \code{value}: The value of the diagnostic metric. 32 | \item \code{job_id}: The unique identifier for the job. 33 | \item \code{task_id}: The unique identifier for the task. 34 | \item \code{disease,geo_value,model}: Metadata for downstream processing. 35 | } 36 | } 37 | \description{ 38 | This function extracts various diagnostic metrics from a fitted \code{EpiNow2} 39 | model and provided data. It checks for low case counts and computes 40 | diagnostics from the fitted model, including the mean acceptance 41 | statistic, divergent transitions, maximum tree depth, and Rhat values. 42 | Additionally, a combined flag is computed indicating if any diagnostics 43 | are outside an acceptable range. The results are returned as a data frame. 44 | } 45 | \details{ 46 | The following diagnostics are calculated: 47 | \itemize{ 48 | \item \code{mean_accept_stat}: The average acceptance statistic across 49 | all chains. 50 | \item \code{p_divergent}: The \emph{proportion} of divergent transitions across 51 | all samples. 52 | \item \code{n_divergent}: The \emph{number} of divergent transitions across 53 | all samples. 54 | \item \code{p_max_treedepth}: The proportion of samples that hit the 55 | maximum tree depth. 56 | \item \code{p_high_rhat}: The \emph{proportion} of parameters with Rhat values 57 | greater than 1.05, indicating potential convergence issues. 58 | \item \code{n_high_rhat}: The \emph{number} of parameters with Rhat values 59 | greater than 1.05, indicating potential convergence issues. 60 | \item \code{low_case_count_flag}: A flag indicating if there are low case 61 | counts in the data. See \code{low_case_count_diagnostic()} for more 62 | information on this diagnostic. 63 | \item \code{epinow2_diagnostic_flag}: A combined flag that indicates if 64 | any diagnostic metrics are outside an accepted range, as determined 65 | by the thresholds: (1) mean_accept_stat < 0.1, (2) p_divergent > 66 | 0.0075, (3) p_max_treedepth > 0.05, and (4) p_high_rhat > 0.0075. 
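As a rough sketch of how the combined flag follows from those thresholds (the diagnostic values below are made up; this is not the package's internal code):

```r
# Made-up diagnostic values for illustration.
diag_values <- list(
  mean_accept_stat = 0.87,
  p_divergent = 0.001,
  p_max_treedepth = 0.0,
  p_high_rhat = 0.0
)
epinow2_diagnostic_flag <- with(
  diag_values,
  mean_accept_stat < 0.1 ||
    p_divergent > 0.0075 ||
    p_max_treedepth > 0.05 ||
    p_high_rhat > 0.0075
)
epinow2_diagnostic_flag  # FALSE: all diagnostics are within the accepted range
```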
67 | } 68 | } 69 | \seealso{ 70 | Other diagnostics: 71 | \code{\link{low_case_count_diagnostic}()} 72 | } 73 | \concept{diagnostics} 74 | -------------------------------------------------------------------------------- /man/fetch_blob_container.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/azure.R 3 | \name{fetch_blob_container} 4 | \alias{fetch_blob_container} 5 | \title{Load Azure Blob container using credentials in environment variables} 6 | \usage{ 7 | fetch_blob_container(container_name) 8 | } 9 | \arguments{ 10 | \item{container_name}{The Azure Blob Storage container associated with the 11 | credentials} 12 | } 13 | \value{ 14 | A Blob endpoint 15 | } 16 | \description{ 17 | This function depends on the following Azure credentials stored in 18 | environment variables: 19 | } 20 | \details{ 21 | \itemize{ 22 | \item \code{az_tenant_id}: an Azure Active Directory (AAD) tenant ID 23 | \item \code{az_subscription_id}: an Azure subscription ID 24 | \item \code{az_resource_group}: The name of the Azure resource group 25 | \item \code{az_storage_account}: The name of the Azure storage account 26 | } 27 | 28 | As a result it is an impure function, and should be used bearing that 29 | warning in mind. Each variable is obtained using 30 | \code{\link[=fetch_credential_from_env_var]{fetch_credential_from_env_var()}} (which will return an error if the 31 | credential is not specified or empty). 32 | } 33 | \seealso{ 34 | Other azure: 35 | \code{\link{download_file_from_container}()}, 36 | \code{\link{download_if_specified}()}, 37 | \code{\link{fetch_credential_from_env_var}()} 38 | } 39 | \concept{azure} 40 | -------------------------------------------------------------------------------- /man/fetch_credential_from_env_var.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/azure.R 3 | \name{fetch_credential_from_env_var} 4 | \alias{fetch_credential_from_env_var} 5 | \title{Fetch Azure credential from environment variable} 6 | \usage{ 7 | fetch_credential_from_env_var(env_var) 8 | } 9 | \arguments{ 10 | \item{env_var}{A character, the credential to fetch} 11 | } 12 | \value{ 13 | The associated value 14 | } 15 | \description{ 16 | And throw an informative error if credential is not found 17 | } 18 | \seealso{ 19 | Other azure: 20 | \code{\link{download_file_from_container}()}, 21 | \code{\link{download_if_specified}()}, 22 | \code{\link{fetch_blob_container}()} 23 | } 24 | \concept{azure} 25 | -------------------------------------------------------------------------------- /man/fit_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fit_model.R 3 | \name{fit_model} 4 | \alias{fit_model} 5 | \title{Fit an \code{EpiNow2} model} 6 | \usage{ 7 | fit_model(data, parameters, seed, horizon, priors, sampler_opts) 8 | } 9 | \arguments{ 10 | \item{data, }{in the format returned by \code{\link[=read_data]{read_data()}}} 11 | 12 | \item{parameters}{As returned from \code{\link[=read_disease_parameters]{read_disease_parameters()}}} 13 | 14 | \item{seed}{The random seed, used for both initialization by \code{EpiNow2} in R 15 | and sampling in Stan} 16 | 17 | \item{horizon}{The number of days, as an integer, to forecast} 18 | 19 | \item{priors}{A 
list of lists. The first level should contain the key \code{rt} 20 | with elements \code{mean} and \code{sd} and the key \code{gp} with element \code{alpha_sd}.} 21 | 22 | \item{sampler_opts}{A list. The Stan sampler options to be passed through 23 | EpiNow2. It has required keys: \code{cores}, \code{chains}, \code{iter_warmup}, 24 | \code{iter_sampling}, \code{max_treedepth}, and \code{adapt_delta}.} 25 | } 26 | \value{ 27 | A fitted model object of class \code{epinow} or, if model fitting fails, 28 | an NA is returned with a warning 29 | } 30 | \description{ 31 | Fit an \code{EpiNow2} model 32 | } 33 | \seealso{ 34 | Other pipeline: 35 | \code{\link{format_stan_opts}()}, 36 | \code{\link{orchestrate_pipeline}()} 37 | } 38 | \concept{pipeline} 39 | -------------------------------------------------------------------------------- /man/format_stan_opts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fit_model.R 3 | \name{format_stan_opts} 4 | \alias{format_stan_opts} 5 | \title{Format Stan options for input to EpiNow2} 6 | \usage{ 7 | format_stan_opts(sampler_opts, seed) 8 | } 9 | \arguments{ 10 | \item{sampler_opts}{A list. The Stan sampler options to be passed through 11 | EpiNow2. It has required keys: \code{cores}, \code{chains}, \code{iter_warmup}, 12 | \code{iter_sampling}, \code{max_treedepth}, and \code{adapt_delta}.} 13 | 14 | \item{seed}{A stochastic seed passed here to the Stan sampler and as the R 15 | PRNG seed for \code{EpiNow2} initialization} 16 | } 17 | \value{ 18 | A \code{stan_opts} object of arguments 19 | } 20 | \description{ 21 | Format configuration \code{sampler_opts} for input to \code{EpiNow2} via a call to 22 | \code{\link[EpiNow2:stan_opts]{EpiNow2::stan_opts()}}. 23 | } 24 | \seealso{ 25 | Other pipeline: 26 | \code{\link{fit_model}()}, 27 | \code{\link{orchestrate_pipeline}()} 28 | } 29 | \concept{pipeline} 30 | -------------------------------------------------------------------------------- /man/gostic_toy_rt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{gostic_toy_rt} 5 | \alias{gostic_toy_rt} 6 | \title{Synthetic dataset of stochastic SIR system with known Rt} 7 | \format{ 8 | \code{gostic_toy_rt} A data frame with 301 rows and 12 columns: 9 | \describe{ 10 | \item{time}{Timestep of the discrete-time stochastic SEIR simulation} 11 | \item{date}{Added from the original Gostic, 2020 dataset. A date 12 | corresponding to the assigned \code{time}. Arbitrarily starts on January 1st, 13 | 2023.} 14 | \item{S, E, I, R}{The realized state of the stochastic SEIR system} 15 | \item{dS, dEI, DIR}{The stochastic transition between compartments} 16 | \item{incidence}{The true incidence in the \code{I} compartment at time t} 17 | \item{obs_cases}{The observed number of cases at time t from 18 | forward-convolved incidence.} 19 | \item{obs_incidence}{Added from the original Gostic, 2020 dataset. The 20 | \code{incidence} column with added negative-binomial observation noise. 
21 | Created with \code{set.seed(123456)} and the call 22 | \code{rnbinom(299, mu = gostic_toy_rt[["incidence"]], size = 10)} Useful for 23 | testing.} 24 | \item{true_r0}{The initial R0 of the system (i.e., 2)} 25 | \item{true_rt}{The known, true Rt of the epidemic system} 26 | } 27 | } 28 | \source{ 29 | \url{https://github.com/cobeylab/Rt_estimation/tree/d9d8977ba8492ac1a3b8287d2f470b313bfb9f1d} # nolint 30 | } 31 | \usage{ 32 | gostic_toy_rt 33 | } 34 | \description{ 35 | A dataset from Gostic, Katelyn M., et al. "Practical considerations for 36 | measuring the effective reproductive number, Rt." PLoS Computational Biology 37 | 16.12 (2020): e1008409. The data are simulated from a stochastic SEIR 38 | compartmental model. 39 | } 40 | \details{ 41 | This synthetic dataset has a number of desirable properties: 42 | \enumerate{ 43 | \item The force of infection changes depending on the Rt, allowing for sudden 44 | changes in the Rt. This allows for modeling of sudden changes in infection 45 | dynamics, which might otherwise be difficult to capture. Rt estimation 46 | framework 47 | \item The realized Rt is known at each timepoint 48 | \item The dataset incorporates a simple generation interval and a reporting 49 | delay. 50 | } 51 | 52 | Gostic et al. benchmark the performance of a number of Rt estimation 53 | frameworks, providing practical guidance on how to use this dataset to 54 | evaluate Rt estimates. 55 | 56 | In practice, we've found that the amount of observation noise in the 57 | incidence and/or observed cases is often undesirably low for testing. Many 58 | empirical datasets are much noisier. As a result, models built with these 59 | settings in mind can perform poorly on this dataset or fail to converge. To 60 | the original dataset, we add a new column with the original incidence counts 61 | with additional observation noise: \code{obs_incidence}. We manually add 62 | observation noise with \code{rnbinom(299, mu = gostic_toy_rt[["obs_cases"]], size = 10)} and the random seed 123456 and store it in the \code{obs_incidence} column. 63 | } 64 | \seealso{ 65 | Other data: 66 | \code{\link{sir_gt_pmf}} 67 | } 68 | \concept{data} 69 | \keyword{datasets} 70 | -------------------------------------------------------------------------------- /man/low_case_count_diagnostic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/diagnostics.R 3 | \name{low_case_count_diagnostic} 4 | \alias{low_case_count_diagnostic} 5 | \title{Calculate low case count diagnostic flag} 6 | \usage{ 7 | low_case_count_diagnostic(df) 8 | } 9 | \arguments{ 10 | \item{df}{A dataframe as returned by \code{\link[=read_data]{read_data()}}. The dataframe must 11 | include columns such as \code{reference_date} (a date vector) and \code{confirm} 12 | (the number of confirmed cases per day).} 13 | } 14 | \value{ 15 | A logical value (TRUE or FALSE) indicating whether either of the last 16 | two weeks in the dataset had fewer than 10 cases per week. 17 | } 18 | \description{ 19 | The diagnostic flag is TRUE if either of the \emph{last} two weeks of the dataset 20 | have fewer than an aggregate 10 cases per week. This aggregation excludes the 21 | count from confirmed outliers, which have been set to NA in the data. 22 | } 23 | \details{ 24 | This function assumes that the \code{df} input dataset has been 25 | "completed": that any implicit missingness has been made explicit. 
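A small sketch of the weekly check described above, on made-up data (not the package implementation; it assumes confirmed outliers have already been set to NA):

```r
# Two weeks of toy data: a normal week followed by a very low week.
df <- data.frame(
  reference_date = as.Date("2023-03-01") + 0:13,
  confirm = c(rep(20L, 7), 1L, 0L, NA, 2L, 1L, 0L, 3L)
)
df <- df[order(df$reference_date), ]
last_two_weeks <- tail(df, 14)
week <- rep(c("second_to_last", "last"), each = 7)
weekly_totals <- tapply(last_two_weeks$confirm, week, sum, na.rm = TRUE)
any(weekly_totals < 10)  # TRUE: the final week has only 7 aggregate cases
```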
26 | } 27 | \seealso{ 28 | Other diagnostics: 29 | \code{\link{extract_diagnostics}()} 30 | } 31 | \concept{diagnostics} 32 | -------------------------------------------------------------------------------- /man/opts_formatter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parameters.R 3 | \name{opts_formatter} 4 | \alias{opts_formatter} 5 | \alias{format_generation_interval} 6 | \alias{format_delay_interval} 7 | \alias{format_right_truncation} 8 | \title{Format PMFs for EpiNow2} 9 | \usage{ 10 | format_generation_interval(pmf) 11 | 12 | format_delay_interval(pmf) 13 | 14 | format_right_truncation(pmf, data) 15 | } 16 | \arguments{ 17 | \item{pmf}{As returned by \code{\link[=read_disease_parameters]{read_disease_parameters()}}. A PMF vector or an NA, 18 | if not applying the PMF to the model fit.} 19 | 20 | \item{data}{in the format returned by \code{\link[=read_data]{read_data()}}} 21 | } 22 | \value{ 23 | An \verb{EpiNow2::*_opts()} formatted object or NA with a message 24 | } 25 | \description{ 26 | Opinionated wrappers around \code{\link[EpiNow2:generation_time_opts]{EpiNow2::generation_time_opts()}}, 27 | \code{\link[EpiNow2:delay_opts]{EpiNow2::delay_opts()}}, or \code{\link[EpiNow2:dist_spec]{EpiNow2::dist_spec()}} which format the 28 | generation interval, delay, or right truncation parameters as an object ready 29 | for input to \code{EpiNow2}. 30 | } 31 | \details{ 32 | Delays or right truncation are optional and can be skipped by passing \code{pmf = NA}. 33 | } 34 | \seealso{ 35 | Other parameters: 36 | \code{\link{read_disease_parameters}()}, 37 | \code{\link{read_interval_pmf}()} 38 | } 39 | \concept{parameters} 40 | -------------------------------------------------------------------------------- /man/pipeline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pipeline.R 3 | \name{orchestrate_pipeline} 4 | \alias{orchestrate_pipeline} 5 | \alias{execute_model_logic} 6 | \title{Run an Rt Estimation Model Pipeline} 7 | \usage{ 8 | orchestrate_pipeline( 9 | config_path, 10 | config_container = NULL, 11 | input_dir = "/input", 12 | output_dir = "/output" 13 | ) 14 | 15 | execute_model_logic(config, input_dir, output_dir) 16 | } 17 | \arguments{ 18 | \item{config_path}{A string specifying the file path to the JSON 19 | configuration file.} 20 | 21 | \item{config_container}{Optional. The name of the blob storage container 22 | from which the config file will be downloaded.} 23 | 24 | \item{input_dir}{A string specifying the directory to read inputs from. If 25 | passing storage containers, this is where the files will be downloaded to.} 26 | 27 | \item{output_dir}{A string specifying the directory where output, logs, and 28 | other pipeline artifacts will be saved. Defaults to the root directory ("/").} 29 | 30 | \item{config}{A Config object containing configuration settings for the 31 | pipeline, including paths to data, exclusions, disease parameters, model 32 | settings, and other necessary inputs.} 33 | } 34 | \value{ 35 | The function returns a boolean, TRUE For pipeline success and FALSE 36 | otherwise. 
On success, the output 37 | directory will contain the following files: 38 | \itemize{ 39 | \item Model RDS file (\code{model.rds}) 40 | \item Sample output in Parquet format (\verb{<task_id>.parquet} in the \verb{samples/} 41 | directory) 42 | \item Summary output in Parquet format (\verb{<task_id>.parquet} in the \verb{summaries/} 43 | directory) 44 | \item Log file (\code{logs.txt}) in the task directory 45 | } 46 | 47 | Returns \code{TRUE} on success. Errors are caught by the outer pipeline 48 | logic and logged accordingly. 49 | } 50 | \description{ 51 | This function runs a complete pipeline for fitting an Rt estimation model, 52 | using the \code{EpiNow2} model, based on a configuration file. The pipeline 53 | processes the model, logs its progress, and handles errors by logging 54 | warnings and setting the pipeline status. Output and logs are written to 55 | the specified directories. Additionally, support for uploading logs and 56 | outputs to a blob storage container is planned. 57 | } 58 | \details{ 59 | The function reads the configuration from a JSON file and uses this to set 60 | up the job and task identifiers. It creates an output directory structure 61 | based on these IDs and starts logging the process in a file. The main 62 | pipeline process is handled by \code{execute_model_logic()}, with errors 63 | caught and logged as warnings. The function will log the success or 64 | failure of the run. 65 | 66 | Logs are written to a file in the output directory, and console output is 67 | also mirrored in this log file. Error handling is in place to capture any 68 | issues during the pipeline execution and ensure they are logged 69 | appropriately. 70 | 71 | During the execution of the pipeline, the following output files are 72 | expected to be generated: 73 | \itemize{ 74 | \item \strong{Model Output}: An RDS file of the fitted model is saved in the 75 | task-specific directory (\code{model.rds}). 76 | \item \strong{Samples}: Parquet files containing the model's sample outputs are saved 77 | in a \code{samples} subdirectory, named using the \code{task_id} (e.g., 78 | \code{task_id.parquet}). 79 | \item \strong{Summaries}: Parquet files summarizing the model's results are saved in 80 | a \code{summaries} subdirectory, also named using the \code{task_id} (e.g., 81 | \code{task_id.parquet}). 82 | \item \strong{Logs}: A \code{logs.txt} file is generated in the task directory, capturing 83 | both console and error messages. 84 | } 85 | 86 | The output directory structure will follow this format: 87 | 88 | \if{html}{\out{
}}\preformatted{<output_dir>/ 89 | └── <job_id>/ 90 | ├── samples/ 91 | │   └── <task_id>.parquet 92 | ├── summaries/ 93 | │   └── <task_id>.parquet 94 | └── tasks/ 95 | └── <task_id>/ 96 | ├── model.rds 97 | └── logs.txt 98 | }\if{html}{\out{
}} 99 | 100 | This function performs the core model fitting process within the Rt 101 | estimation pipeline, including reading data, applying exclusions, fitting 102 | the model, and writing outputs such as model samples, summaries, and logs. 103 | } 104 | \seealso{ 105 | Other pipeline: 106 | \code{\link{fit_model}()}, 107 | \code{\link{format_stan_opts}()} 108 | 109 | Other pipeline: 110 | \code{\link{fit_model}()}, 111 | \code{\link{format_stan_opts}()} 112 | } 113 | \concept{pipeline} 114 | -------------------------------------------------------------------------------- /man/read_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read_data.R 3 | \name{read_data} 4 | \alias{read_data} 5 | \title{Read in the dataset of incident case counts} 6 | \usage{ 7 | read_data( 8 | data_path, 9 | disease = c("COVID-19", "Influenza", "RSV", "test"), 10 | geo_value, 11 | report_date, 12 | max_reference_date, 13 | min_reference_date 14 | ) 15 | } 16 | \arguments{ 17 | \item{data_path}{The path to the local file. This could contain a glob and 18 | must be in parquet format.} 19 | 20 | \item{disease}{A string specifying the disease being modeled. One of 21 | \code{"COVID-19"} or \code{"Influenza"} or \code{"RSV"}.} 22 | 23 | \item{geo_value}{An uppercase, two-character string specifying the geographic 24 | value, usually a state or \code{"US"} for national data.} 25 | 26 | \item{report_date}{A string representing the report date. Formatted as 27 | "YYYY-MM-DD".} 28 | 29 | \item{max_reference_date}{A string representing the maximum reference 30 | date. Formatted as "YYYY-MM-DD".} 31 | 32 | \item{min_reference_date}{A string representing the minimum reference 33 | date. Formatted as "YYYY-MM-DD".} 34 | } 35 | \value{ 36 | A dataframe with one or more rows and columns \code{report_date}, 37 | \code{reference_date}, \code{geo_value}, \code{confirm} 38 | } 39 | \description{ 40 | Each row of the table corresponds to a single facilities' cases for a 41 | reference-date/report-date/disease tuple. We want to aggregate these counts 42 | to the level of geographic aggregate/report-date/reference-date/disease. 43 | } 44 | \details{ 45 | We handle two distinct cases for geographic aggregates: 46 | \enumerate{ 47 | \item A single state: Subset to facilities \strong{in that state only} and aggregate 48 | up to the state level 2. The US overall: Aggregate over all facilities 49 | without any subsetting 50 | } 51 | 52 | Note that we do \emph{not} apply exclusions here. The exclusions are applied 53 | later, after the aggregations. That means that for the US overall, we 54 | aggregate over points that might potentially be excluded at the state level. 55 | Our recourse in this case is to exclude the US overall aggregate point. 
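An illustrative call against the test file written by data-raw/convert_gostic_toy_rt_to_test_dataset.R; the dates below are placeholders and would need to match what is actually in that file:

```r
# Placeholder dates; adjust to the range present in the Parquet file.
cases <- CFAEpiNow2Pipeline::read_data(
  data_path = "tests/testthat/data/test_data.parquet",
  disease = "test",
  geo_value = "test",
  report_date = "2023-06-01",
  max_reference_date = "2023-05-31",
  min_reference_date = "2023-01-02"
)
head(cases)  # columns: report_date, reference_date, geo_value, confirm
```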
56 | } 57 | \concept{read_data} 58 | -------------------------------------------------------------------------------- /man/read_disease_parameters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parameters.R 3 | \name{read_disease_parameters} 4 | \alias{read_disease_parameters} 5 | \title{Read in disease process parameters from an external file or files} 6 | \usage{ 7 | read_disease_parameters( 8 | generation_interval_path, 9 | delay_interval_path, 10 | right_truncation_path, 11 | disease, 12 | as_of_date, 13 | geo_value, 14 | report_date 15 | ) 16 | } 17 | \arguments{ 18 | \item{generation_interval_path, delay_interval_path, right_truncation_path}{Path to a local file with the parameter PMF. See \code{\link[=read_interval_pmf]{read_interval_pmf()}} for 19 | details on the file schema. The parameters can be in the same file or a 20 | different file.} 21 | 22 | \item{disease}{A string specifying the disease being modeled. One of 23 | \code{"COVID-19"} or \code{"Influenza"} or \code{"RSV"}.} 24 | 25 | \item{as_of_date}{Use the parameters that were used in production on this 26 | date. Set for the current date for the most up-to-to date version of the 27 | parameters and set to an earlier date to use parameters from an earlier 28 | time period.} 29 | 30 | \item{geo_value}{An uppercase, two-character string specifying the geographic 31 | value, usually a state or \code{"US"} for national data.} 32 | 33 | \item{report_date}{An optional parameter to subset the query to a parameter 34 | on or before a particular \code{report_date}. Right now, the only parameter with 35 | report date-specific estimates is \code{right_truncation}. Note that this 36 | is similar to, but different from \code{as_of_date}. The \code{report_date} is used 37 | to select the particular value of a time-varying estimate. This estimate 38 | may itself be regenerated over time (e.g., as new data becomes available or 39 | with a methodological update). We can pull the estimate for date 40 | \code{report_date} as generated on date \code{as_of_date}.} 41 | } 42 | \value{ 43 | A named list with three PMFs. The list elements are named 44 | \code{generation_interval}, \code{delay_interval}, and \code{right_truncation}. If a path 45 | to a local file is not provided (NA or NULL), the corresponding parameter 46 | estimate will be NA in the returned list. 47 | } 48 | \description{ 49 | Read in disease process parameters from an external file or files 50 | } 51 | \details{ 52 | \code{generation_interval_path} is required because the generation 53 | interval is a required parameter for $R_t$ estimation. 54 | \code{delay_interval_path} and \code{right_truncation_path} are optional (but 55 | strongly suggested). 
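A hedged sketch of a call, with placeholder paths and dates:

```r
# Placeholder path and dates for illustration only.
params <- CFAEpiNow2Pipeline::read_disease_parameters(
  generation_interval_path = "parameters.parquet",
  delay_interval_path = "parameters.parquet",
  right_truncation_path = NULL, # optional; the returned element will be NA
  disease = "COVID-19",
  as_of_date = "2024-11-20",
  geo_value = "CA",
  report_date = "2024-11-20"
)
names(params) # "generation_interval" "delay_interval" "right_truncation"
```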
56 | } 57 | \seealso{ 58 | Other parameters: 59 | \code{\link{opts_formatter}}, 60 | \code{\link{read_interval_pmf}()} 61 | } 62 | \concept{parameters} 63 | -------------------------------------------------------------------------------- /man/read_exclusions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exclusions.R 3 | \name{read_exclusions} 4 | \alias{read_exclusions} 5 | \title{Read exclusions from an external file} 6 | \usage{ 7 | read_exclusions(path) 8 | } 9 | \arguments{ 10 | \item{path}{The path to the exclusions file in \code{.csv} format} 11 | } 12 | \value{ 13 | A dataframe with columns \code{reference_date}, \code{report_date}, 14 | \code{geo_value}, \code{disease} 15 | } 16 | \description{ 17 | Expects to read a CSV with required columns: 18 | \itemize{ 19 | \item \code{reference_date} 20 | \item \code{report_date} 21 | \item \code{state} 22 | \item \code{disease} 23 | } 24 | } 25 | \details{ 26 | These columns have the same meaning as in \code{\link[=read_data]{read_data()}}. Additional columns 27 | are allowed and will be ignored by the reader. 28 | } 29 | \seealso{ 30 | Other exclusions: 31 | \code{\link{apply_exclusions}()} 32 | } 33 | \concept{exclusions} 34 | -------------------------------------------------------------------------------- /man/read_interval_pmf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parameters.R 3 | \name{read_interval_pmf} 4 | \alias{read_interval_pmf} 5 | \title{Read parameter PMF into memory} 6 | \usage{ 7 | read_interval_pmf( 8 | path, 9 | disease = c("COVID-19", "Influenza", "RSV", "test"), 10 | as_of_date, 11 | parameter = c("generation_interval", "delay", "right_truncation"), 12 | geo_value = NA, 13 | report_date = NA 14 | ) 15 | } 16 | \arguments{ 17 | \item{path}{A path to a local file} 18 | 19 | \item{disease}{A string specifying the disease being modeled. One of 20 | \code{"COVID-19"} or \code{"Influenza"} or \code{"RSV"}.} 21 | 22 | \item{as_of_date}{Use the parameters that were used in production on this 23 | date. Set for the current date for the most up-to-to date version of the 24 | parameters and set to an earlier date to use parameters from an earlier 25 | time period.} 26 | 27 | \item{parameter}{One of "generation interval", "delay", or "right-truncation"} 28 | 29 | \item{geo_value}{An uppercase, two-character string specifying the geographic 30 | value, usually a state or \code{"US"} for national data.} 31 | 32 | \item{report_date}{An optional parameter to subset the query to a parameter 33 | on or before a particular \code{report_date}. Right now, the only parameter with 34 | report date-specific estimates is \code{right_truncation}. Note that this 35 | is similar to, but different from \code{as_of_date}. The \code{report_date} is used 36 | to select the particular value of a time-varying estimate. This estimate 37 | may itself be regenerated over time (e.g., as new data becomes available or 38 | with a methodological update). We can pull the estimate for date 39 | \code{report_date} as generated on date \code{as_of_date}.} 40 | } 41 | \value{ 42 | A PMF vector 43 | } 44 | \description{ 45 | Using DuckDB from a parquet file. 
The function expects the file to be in SCD2 46 | format with column names: 47 | \itemize{ 48 | \item parameter 49 | \item geo_value 50 | \item disease 51 | \item start_date 52 | \item end_date 53 | \item value 54 | } 55 | } 56 | \details{ 57 | start_date and end_date specify the date range for which the value was used. 58 | end_date may be NULL (e.g. for the current value used in production). value 59 | must contain a pmf vector whose values are all positive and sum to 1. all 60 | other fields must be consistent with the specifications of the function 61 | arguments described below, which are used to query from the .parquet file. 62 | 63 | SCD2 format is shorthand for slowly changing dimension type 2. This format is 64 | normalized to track change over time: 65 | https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row 66 | } 67 | \seealso{ 68 | Other parameters: 69 | \code{\link{opts_formatter}}, 70 | \code{\link{read_disease_parameters}()} 71 | } 72 | \concept{parameters} 73 | -------------------------------------------------------------------------------- /man/read_json_into_config.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/config.R 3 | \name{read_json_into_config} 4 | \alias{read_json_into_config} 5 | \title{Read JSON Configuration into Config Object} 6 | \usage{ 7 | read_json_into_config(config_path, optional_fields) 8 | } 9 | \arguments{ 10 | \item{config_path}{A string specifying the path to the JSON configuration 11 | file.} 12 | 13 | \item{optional_fields}{A list of strings specifying the optional fields in 14 | the JSON file. If a field is not present in the JSON file, and is marked as 15 | optional, it will be set to either the empty type (e.g. \code{chr(0)}), or NULL. 16 | If a field is not present in the JSON file, and is not marked as optional, an 17 | error will be thrown.} 18 | } 19 | \value{ 20 | An instance of the \code{Config} class populated with the data from the 21 | JSON file. 22 | } 23 | \description{ 24 | Reads a JSON file from the specified path and converts it into a \code{Config} 25 | object. 26 | } 27 | \seealso{ 28 | Other config: 29 | \code{\link{Config}()}, 30 | \code{\link{Data}()}, 31 | \code{\link{Exclusions}()}, 32 | \code{\link{Interval}}, 33 | \code{\link{Parameters}()} 34 | } 35 | \concept{config} 36 | -------------------------------------------------------------------------------- /man/sample_processing_functions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/write_output.R 3 | \name{sample_processing_functions} 4 | \alias{sample_processing_functions} 5 | \alias{process_samples} 6 | \alias{process_quantiles} 7 | \title{Process posterior samples from a Stan fit object (raw draws).} 8 | \usage{ 9 | process_samples(fit, geo_value, model, disease) 10 | 11 | process_quantiles(fit, geo_value, model, disease, quantile_width) 12 | } 13 | \arguments{ 14 | \item{fit}{An \code{EpiNow2} fit object with posterior estimates.} 15 | 16 | \item{geo_value}{An uppercase, two-character string specifying the geographic 17 | value, usually a state or \code{"US"} for national data.} 18 | 19 | \item{model}{A string specifying the model to be used.} 20 | 21 | \item{disease}{A string specifying the disease being modeled. 
One of 22 | \code{"COVID-19"} or \code{"Influenza"} or \code{"RSV"}.} 23 | 24 | \item{quantile_width}{A vector of numeric values representing the desired 25 | quantiles. Passed to \code{\link[tidybayes:reexports]{tidybayes::median_qi()}}.} 26 | } 27 | \value{ 28 | A data.table of posterior draws or quantiles, merged and processed. 29 | } 30 | \description{ 31 | Extracts raw posterior samples from a Stan fit object and post-processes 32 | them, including merging with a fact table and standardizing the parameter 33 | names. If calling \verb{[process_quantiles()]} the 50\% and 95\% intervals are 34 | returned in \code{tidybayes} format. 35 | } 36 | \seealso{ 37 | Other write_output: 38 | \code{\link{write_model_outputs}()}, 39 | \code{\link{write_output_dir_structure}()} 40 | } 41 | \concept{write_output} 42 | -------------------------------------------------------------------------------- /man/sir_gt_pmf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sir_gt_pmf} 5 | \alias{sir_gt_pmf} 6 | \title{Generation interval corresponding to the sample \code{gostic_toy_rt} dataset} 7 | \format{ 8 | \code{sir_gt_pmf} A numeric vector of length 26 that sums to one within 9 | numerical tolerance 10 | } 11 | \usage{ 12 | sir_gt_pmf 13 | } 14 | \description{ 15 | Gostic et al., 2020 simulates data from a stochastic SEIR model. Residence 16 | time in both the E and the I compartments is exponentially distributed, with 17 | a mean of 4 days (or a rate/inverse-scale of 1/4). These residence times 18 | imply a gamma-distributed generation time distribution with a shape of 2 and 19 | a rate of 1/4. We convert the continuous gamma distribution into a PMF to use 20 | with \code{{RtGam}}. 21 | } 22 | \details{ 23 | From this parametric specification, we produce a double-censored, 24 | left-truncated probability mass function of the generation interval 25 | distribution. We produce the PMF using \code{{epinowcast}}'s 26 | \code{simulate_double_censored_pmf()} with version 0.3.0. See 27 | https://doi.org/10.1101/2024.01.12.24301247 for more information on 28 | double-censoring biases and corrections. 29 | 30 | We correct the output from \code{simulate_double_censored_pmf()} to make it 31 | appropriate to use with \code{{EpiNow2}}. The function returns a numeric vector, 32 | with the position of the element corresponding to one day more than the 33 | length of the delay and value corresponding to the amount of discretized 34 | probability density in the bin. The vector does not necessarily sum to one. 35 | We drop the first element of the vector, which corresponds to a zero-day 36 | delay. The renewal framework, which underpins our model does not account for 37 | zero-day delays. We renormalize the left-truncated vector to sum to one so 38 | that it's a proper PMF. 
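A quick check of the normalization property described above (illustrative):

```r
data(sir_gt_pmf, package = "CFAEpiNow2Pipeline")
isTRUE(all.equal(sum(sir_gt_pmf), 1))  # TRUE: a proper PMF within numerical tolerance
```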
39 | } 40 | \seealso{ 41 | Other data: 42 | \code{\link{gostic_toy_rt}} 43 | } 44 | \concept{data} 45 | \keyword{datasets} 46 | -------------------------------------------------------------------------------- /man/write_model_outputs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/write_output.R 3 | \name{write_model_outputs} 4 | \alias{write_model_outputs} 5 | \title{Write model outputs to specified directories} 6 | \usage{ 7 | write_model_outputs( 8 | fit, 9 | samples, 10 | summaries, 11 | output_dir, 12 | job_id, 13 | task_id, 14 | metadata = list(), 15 | diagnostics 16 | ) 17 | } 18 | \arguments{ 19 | \item{fit}{An \code{EpiNow2} fit object with posterior estimates.} 20 | 21 | \item{samples}{A data.table as returned by \code{\link[=process_samples]{process_samples()}}} 22 | 23 | \item{summaries}{A data.table as returned by \code{\link[=process_quantiles]{process_quantiles()}}} 24 | 25 | \item{output_dir}{A string specifying the directory where output, logs, and 26 | other pipeline artifacts will be saved. Defaults to the root directory ("/").} 27 | 28 | \item{job_id}{A string specifying the job.} 29 | 30 | \item{task_id}{A string specifying the task.} 31 | 32 | \item{metadata}{List. Additional metadata to be included in the output. The 33 | paths to the samples, summaries, and model output will be added to the 34 | metadata list.} 35 | 36 | \item{diagnostics}{A data.table as returned by \code{\link[=extract_diagnostics]{extract_diagnostics()}}} 37 | } 38 | \value{ 39 | Invisible NULL. The function is called for its side effects. 40 | } 41 | \description{ 42 | Processes the model fit, extracts samples and quantiles, 43 | and writes them to the appropriate directories. 44 | } 45 | \seealso{ 46 | Other write_output: 47 | \code{\link{sample_processing_functions}}, 48 | \code{\link{write_output_dir_structure}()} 49 | } 50 | \concept{write_output} 51 | -------------------------------------------------------------------------------- /man/write_output_dir_structure.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/write_output.R 3 | \name{write_output_dir_structure} 4 | \alias{write_output_dir_structure} 5 | \title{Create output directory structure for a given job and task.} 6 | \usage{ 7 | write_output_dir_structure(output_dir, job_id, task_id) 8 | } 9 | \arguments{ 10 | \item{output_dir}{A string specifying the directory where output, logs, and 11 | other pipeline artifacts will be saved. Defaults to the root directory ("/").} 12 | 13 | \item{job_id}{A string specifying the job.} 14 | 15 | \item{task_id}{A string specifying the task.} 16 | } 17 | \value{ 18 | The path to the base output directory (invisible). 19 | } 20 | \description{ 21 | This function generates the necessary directory structure for storing output 22 | files related to a job and its tasks, including directories for raw samples 23 | and summarized quantiles. 
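For instance, a sketch of creating the skeleton under a temporary directory (the job and task IDs are placeholders):

```r
# Placeholder job and task IDs for illustration only.
base <- CFAEpiNow2Pipeline::write_output_dir_structure(
  output_dir = tempdir(),
  job_id = "example-job",
  task_id = "example-task"
)
list.files(base, recursive = TRUE, include.dirs = TRUE)
```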
24 | } 25 | \seealso{ 26 | Other write_output: 27 | \code{\link{sample_processing_functions}}, 28 | \code{\link{write_model_outputs}()} 29 | } 30 | \concept{write_output} 31 | -------------------------------------------------------------------------------- /rules_of_behavior.md: -------------------------------------------------------------------------------- 1 | # Rules of Behavior and Posting Guidelines for the Use of GitHub as a Third-Party Web Application 2 | 3 | ## Purpose 4 | 5 | These rules of behavior establish the privacy and information security requirements for the use of Third Party Web Applications (TPWAs) in conjunction with the CDC GitHub.com organizations established for open source projects. These rules of behavior were developed to ensure that CDC and its confidential information and technologies are not compromised, as well as protecting general CDC interests and services from risks associated with the use of TPWAs while allowing for the increased efficiencies and cost savings that come with appropriate use of third party services. 6 | 7 | ## Scope 8 | 9 | These rules of behavior and its related guidance apply to federal employees, contractors, and all external collaborators who will access GitHub from CDC directly or use them with non-sensitive data obtained from CDC. All engagement with TPWAs related to the GitHub will be governed by these rules of behavior, as well as to the Rules of Behavior for the Use of HHS Information Services. 10 | 11 | ## Ownership 12 | 13 | CDC assigns three stewards in charge of rules and policy compliance: a Business Steward, a Security Steward, and a Technical Steward. The business and security stewards are responsible for establishing policy and providing approval, while the technical steward fulfills requests from users. Users requesting access to GitHub that have not been approved yet need to assign a main and a backup point of contact (POC) with the business steward, as well as provide a justification to the security steward. 14 | 15 | The security steward is responsible for the security of the GitHub usage as a TPWA and its impact on the CDC network and compliance with CDC security policies. All users, including POCs, are responsible for adherence to this policy and associated processes. Where there is not a rule of behavior that provides explicit guidance, users must do their best to safeguard CDC and its network and services from security risks. 16 | 17 | ## Rules of Behavior 18 | 19 | All new users of GitHub must read and acknowledge these rules before using any of the approved TPWAs. This acknowledgment must be completed annually, and establishes agreement from part of the user to adhere to these rules. 20 | 21 | * I understand that I must complete security awareness and records management training annually in order to comply with the latest security and records management policies. 22 | * I understand that I must also follow the Rules of Behavior for use of HHS Information Resources. 23 | * I understand that I must not use, share, or store any kind of sensitive data (health status, provision or payment of healthcare, pictures, PII, etc.) with TPWAs under ANY circumstance. 24 | * I will not knowingly conceal, falsify or remove information.This includes editing or removing the template language provided when a Github repository is created. 25 | * I understand that I can only use non-sensitive and/or publicly available data in GitHub. If you are unsure of what constitutes non-sensitive information, please see guidance below. 
26 | * I understand that all passwords I create to set up GitHub accounts need to comply with CDC’s password policy. 27 | * I understand that the steward reserves the right to moderate all data at any time. 28 | * I understand my responsibilities to protect systems and data as specified by CDC policies. 29 | 30 | ## Guidance Regarding Non-Sensitive and Publicly Available Information 31 | 32 | In support of program collaboration in the use of GitHub, portions of some GitHub projects are either currently open to the public or may become open to the public in the future. The following guidelines will inform and assist the user in determining that the information to be posted on GitHub is not sensitive. The bottom line is that if the content you are posting is not appropriate to post for public access, it should not be posted on GitHub. 33 | 34 | Before posting information that involves other CDC programs, employees, etc. to GitHub, it is important that the poster ensures they receive approval from the relevant CDC entity to post the information. 35 | 36 | Questions to consider before posting information include: 37 | | Question | Answer | Guidance | | --- | --- | --- | 38 | | Do I have reservations about anyone viewing this information? | Yes | Do not post. | 39 | | Were individuals informed that this information would be posted on GitHub? | No | Do not post. | 40 | | Does this information contain details or descriptions of CDC security systems or other sensitive infrastructures? | Yes | Do not post. | 41 | | Does this information reflect program efforts to engage and inform external partners and the public? | No | Do not post. | 42 | 43 | Examples of information which has been deemed not sensitive and may be posted on GitHub include the following. 44 | 45 | * Source Code 46 | * Use cases 47 | * User stories/requirements 48 | * Process flows 49 | * Program pain points 50 | * Software Service Descriptions 51 | 52 | Sensitive information, which should not be posted, includes (but is not limited to) the following. 53 | 54 | * Information directly attributed to an individual in a sensitive manner 55 | * The names or pictures of individuals 56 | * Protected health information 57 | * Project management material. This includes posting or discussing security documentation, implementation plans, communications regarding project specifics, etc. 58 | * Opinions related to programs or tools, specifically those that may have an adverse impact 59 | * Non-public Links to CDC SharePoint or other internal references 60 | * Non-public Details on CDC internal infrastructure 61 | 62 | If there’s any question on whether information may be sensitive (such as detailed interview notes or specific references provided during a program interview), further guidance should be sought from the security steward prior to posting the information on any GitHub. 63 | 64 | ## Enforcement 65 | 66 | Users looking to use GitHub who are unable to follow these rules of behavior will not have authorization to do so. Any users that violate these rules of behavior or CDC security policies may be subject to action, up to and including revoking access to GitHub. Technical and security stewards have the right to enforce these rules of behavior based on violations at any time.
67 | 68 | ## References 69 | 70 | * [Policy for Managing the Use of Third-Party Websites and Applications](https://www.hhs.gov/about/agencies/asa/ocio/cybersecurity/policy-social-media-technologies/index.html) 71 | * [Rules of Behavior for Use of HHS Information Resources](http://www.hhs.gov/ocio/policy/hhs-rob.html) 72 | * [Security and Awareness Training](http://sat.cdc.gov/) (requires login) 73 | -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is a wrapper script around the CFAEpiNow2Pipeline::orchestrate_pipeline command that checks 4 | # environment variables and executes the command. This provides a simple mechanism that can be specified 5 | # as a container startup command, allowing the same build to be executed with different configurations 6 | # and inputs. 7 | 8 | # Config file always differs and needs to be specified as a parameter. Azure tenant id, client id, 9 | # and service principal are required as environment variables. If any are not present, print a message 10 | # and exit. 11 | if [[ -z "$1" ]]; then 12 | echo "No config file specified - please provide as argument to this script." 13 | elif [[ -z "${az_tenant_id}" ]]; then 14 | echo "No Azure Tenant ID specified - please set az_tenant_id environment variable." 15 | elif [[ -z "${az_client_id}" ]]; then 16 | echo "No Azure Client ID specified - please set az_client_id environment variable." 17 | elif [[ -z "${az_service_principal}" ]]; then 18 | echo "No Azure Service Principal specified - please set az_service_principal environment variable." 19 | else 20 | # check for other environment variables, using defaults if not set 21 | CFG_CNTR="${CFG_CNTR:-rt-epinow2-config}" 22 | INPUT_DIR="${INPUT_DIR:-/mnt/input}" 23 | OUTPUT_DIR="${OUTPUT_DIR:-/mnt}" 24 | OUTPUT_CNTR="${OUTPUT_CNTR:-zs-test-pipeline-update}" 25 | 26 | # build the string 27 | EXEC_STR="CFAEpiNow2Pipeline::orchestrate_pipeline('$1', config_container='$CFG_CNTR', input_dir='$INPUT_DIR', output_dir='$OUTPUT_DIR', output_container='$OUTPUT_CNTR')" 28 | 29 | # print it, also visible and filterable in Azure logs 30 | echo "Executing pipeline: $EXEC_STR" 31 | 32 | # execute 33 | Rscript -e "$EXEC_STR" 34 | fi 35 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 
5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/testing-design.html#sec-tests-files-overview 7 | # * https://testthat.r-lib.org/articles/special-files.html 8 | 9 | library(testthat) 10 | library(CFAEpiNow2Pipeline) 11 | 12 | test_check("CFAEpiNow2Pipeline") 13 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/fit_model.md: -------------------------------------------------------------------------------- 1 | # Right truncation longer than data throws error 2 | 3 | Removing right-truncation PMF elements after 2 4 | Right truncation PMF longer than the data 5 | PMF length: 3 6 | Data length: 2 7 | PMF can only be up to the length of the data 8 | 9 | # Missing keys throws error 10 | 11 | Code 12 | format_stan_opts(list(), random_seed) 13 | Condition 14 | Error in `format_stan_opts()`: 15 | ! Missing expected keys/values in "sampler_opts" 16 | Missing keys: "cores", "chains", "iter_warmup", "iter_sampling", "adapt_delta", and "max_treedepth" 17 | Missing values: 18 | 19 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/parameters.md: -------------------------------------------------------------------------------- 1 | # NULL `reference_date` prints in output 2 | 3 | Code 4 | pmf <- check_returned_pmf(pmf_df = pmf_df, parameter = parameter, disease = disease, 5 | as_of_date = as_of_date, geo_value = geo_value, report_date = report_date, 6 | path = path) 7 | Message 8 | Using right-truncation estimate for date "NA" 9 | Queried last available estimate from "2023-01-15" or earlier 10 | Subject to parameters available as of "2023-01-01" 11 | 12 | # GI with nonzero first element throws warning 13 | 14 | Code 15 | fixed <- format_generation_interval(pmf) 16 | Condition 17 | Warning: 18 | First element of GI PMF is not 0 19 | x Renewal equation assumes no same-day transmission 20 | ! Auto-fixing by prepending a 0. Consider left-truncating instead? 
21 | > New PMF: 0, 0.0478174439101374, 0.0760979101401105, 0.0895274782138445, 0.0932924246386663, 0.0910112663029942, 0.0851745750679048, 0.0774669281292755, 0.0690016173717581, 0.0604909602604732, 0.0523692179334625, 0.0448807538374044, 0.0381427961649933, 0.0321897258102522, 0.0270039920145235, 0.0225374046222701, 0.0187255476449921, 0.0154973154449738, ..., 0.00308673656614286, and 0.00250027133286461 22 | 23 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/read_data.md: -------------------------------------------------------------------------------- 1 | # Incomplete return throws warning 2 | 3 | Incomplete number of rows returned 4 | Expected 23 rows 5 | Observed 21 rows 6 | Missing reference date(s): 2022-12-31 and 2023-01-01 7 | 8 | -------------------------------------------------------------------------------- /tests/testthat/data/2025-04-02_test.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CDCgov/cfa-epinow2-pipeline/349cbea090a01498bb32a810ae357875359ff8b2/tests/testthat/data/2025-04-02_test.parquet -------------------------------------------------------------------------------- /tests/testthat/data/CA_COVID-19.json: -------------------------------------------------------------------------------- 1 | { 2 | "job_id": "Rt-estimation-2024-11-26T14-38-24-622e8cc8ac3611efbe8d5a0f1d07309c", 3 | "task_id": "622e8cc8ac3611efbe8d5a0f1d07309c_CA_COVID-19_1732653504", 4 | "min_reference_date": "2024-10-01", 5 | "max_reference_date": "2024-11-25", 6 | "disease": "COVID-19", 7 | "geo_value": "CA", 8 | "geo_type": "state", 9 | "report_date": "2024-11-26", 10 | "production_date": "2024-11-26", 11 | "parameters": { 12 | "as_of_date": "2024-11-26", 13 | "generation_interval": { 14 | "path": "test_parameters.parquet", 15 | "blob_storage_container": null 16 | }, 17 | "delay_interval": { 18 | "path": null, 19 | "blob_storage_container": null 20 | }, 21 | "right_truncation": { 22 | "path": null, 23 | "blob_storage_container": null 24 | } 25 | }, 26 | "data": { 27 | "path": "CA_test.parquet", 28 | "blob_storage_container": null 29 | }, 30 | "seed": 42, 31 | "horizon": 14, 32 | "priors": { 33 | "rt": { 34 | "mean": 1.0, 35 | "sd": 0.2 36 | }, 37 | "gp": { 38 | "alpha_sd": 0.01 39 | } 40 | }, 41 | "sampler_opts": { 42 | "cores": 1, 43 | "chains": 1, 44 | "iter_warmup": 50, 45 | "iter_sampling": 50, 46 | "adapt_delta": 0.99, 47 | "max_treedepth": 12 48 | }, 49 | "exclusions": { 50 | "path": null 51 | }, 52 | "config_version": "1.0", 53 | "quantile_width": [ 54 | 0.5, 55 | 0.95 56 | ], 57 | "model": "EpiNow2" 58 | } 59 | -------------------------------------------------------------------------------- /tests/testthat/data/CA_test.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CDCgov/cfa-epinow2-pipeline/349cbea090a01498bb32a810ae357875359ff8b2/tests/testthat/data/CA_test.parquet -------------------------------------------------------------------------------- /tests/testthat/data/bad_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "job_id": "6183da58-89bc-455f-8562-4f607257a876", 3 | "task_id": "bc0c3eb3-7158-4631-a2a9-86b97357f97e", 4 | "disease": "test", 5 | "geo_value": "test", 6 | "geo_type": "test", 7 | "min_reference_date": "2023-01-02", 8 | "max_reference_date": "2023-01-07", 9 | "report_date": "2023-10-28", 10 | "quantile_width": 
[0.5, 0.95], 11 | "model": "EpiNow2_test", 12 | "parameters": { 13 | "as_of_date": "2023-10-28", 14 | "generation_interval": { 15 | "path": "data/test_parameters.parquet", 16 | "blob_storage_container": null 17 | }, 18 | "delay_interval": { 19 | "path": null, 20 | "blob_storage_container": null 21 | }, 22 | "right_truncation": { 23 | "path": null, 24 | "blob_storage_container": null 25 | } 26 | }, 27 | "data": { 28 | "path": "data/test_data.parquet", 29 | "blob_storage_container": null 30 | }, 31 | "exclusions": { 32 | "path": "data/test_exclusions.csv", 33 | "blob_storage_container": null 34 | }, 35 | "seed": 42, 36 | "horizon": 14, 37 | "priors": { 38 | "rt": { 39 | "mean": 1.0, 40 | "sd": 0.2 41 | }, 42 | "gp": { 43 | "alpha_sd": 0.01 44 | } 45 | }, 46 | "sampler_opts": { 47 | "cores": 1, 48 | "chains": 1, 49 | "iter_warmup": 50, 50 | "iter_sampling": -50, 51 | "adapt_delta": 0.99, 52 | "max_treedepth": 12 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tests/testthat/data/sample_config_no_exclusion.json: -------------------------------------------------------------------------------- 1 | { 2 | "job_id": "6183da58-89bc-455f-8562-4f607257a876", 3 | "task_id": "bc0c3eb3-7158-4631-a2a9-86b97357f97e", 4 | "disease": "test", 5 | "geo_value": "test", 6 | "geo_type": "test", 7 | "min_reference_date": "2023-01-02", 8 | "max_reference_date": "2023-01-07", 9 | "report_date": "2023-10-28", 10 | "production_date": "2024-10-28", 11 | "quantile_width": [0.5, 0.95], 12 | "model": "EpiNow2_test", 13 | "parameters": { 14 | "as_of_date": "2023-10-28", 15 | "generation_interval": { 16 | "path": "data/test_parameters.parquet", 17 | "blob_storage_container": null 18 | }, 19 | "delay_interval": { 20 | "path": null, 21 | "blob_storage_container": null 22 | }, 23 | "right_truncation": { 24 | "path": null, 25 | "blob_storage_container": null 26 | } 27 | }, 28 | "data": { 29 | "path": "data/test_data.parquet", 30 | "blob_storage_container": null 31 | }, 32 | "seed": 42, 33 | "horizon": 14, 34 | "priors": { 35 | "rt": { 36 | "mean": 1.0, 37 | "sd": 0.2 38 | }, 39 | "gp": { 40 | "alpha_sd": 0.01 41 | } 42 | }, 43 | "sampler_opts": { 44 | "cores": 1, 45 | "chains": 1, 46 | "iter_warmup": 50, 47 | "iter_sampling": 50, 48 | "adapt_delta": 0.99, 49 | "max_treedepth": 12 50 | }, 51 | "config_version": "0.1.0" 52 | } 53 | -------------------------------------------------------------------------------- /tests/testthat/data/sample_config_with_exclusion.json: -------------------------------------------------------------------------------- 1 | { 2 | "job_id": "6183da58-89bc-455f-8562-4f607257a876", 3 | "task_id": "bc0c3eb3-7158-4631-a2a9-86b97357f97e", 4 | "disease": "test", 5 | "geo_value": "test", 6 | "geo_type": "test", 7 | "min_reference_date": "2023-01-02", 8 | "max_reference_date": "2023-01-07", 9 | "report_date": "2023-10-28", 10 | "production_date": "2024-10-28", 11 | "quantile_width": [0.5, 0.95], 12 | "model": "EpiNow2_test", 13 | "parameters": { 14 | "as_of_date": "2023-10-28", 15 | "generation_interval": { 16 | "path": "test_parameters.parquet", 17 | "blob_storage_container": null 18 | }, 19 | "delay_interval": { 20 | "path": null, 21 | "blob_storage_container": null 22 | }, 23 | "right_truncation": { 24 | "path": null, 25 | "blob_storage_container": null 26 | } 27 | }, 28 | "data": { 29 | "path": "test_data.parquet", 30 | "blob_storage_container": null 31 | }, 32 | "exclusions": { 33 | "path": "test_exclusions.csv", 34 | "blob_storage_container": null 35 | }, 
36 | "seed": 42, 37 | "horizon": 14, 38 | "priors": { 39 | "rt": { 40 | "mean": 1.0, 41 | "sd": 0.2 42 | }, 43 | "gp": { 44 | "alpha_sd": 0.01 45 | } 46 | }, 47 | "sampler_opts": { 48 | "cores": 1, 49 | "chains": 1, 50 | "iter_warmup": 50, 51 | "iter_sampling": 50, 52 | "adapt_delta": 0.99, 53 | "max_treedepth": 12 54 | }, 55 | "config_version": "0.1.0" 56 | } 57 | -------------------------------------------------------------------------------- /tests/testthat/data/sample_fit.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CDCgov/cfa-epinow2-pipeline/349cbea090a01498bb32a810ae357875359ff8b2/tests/testthat/data/sample_fit.rds -------------------------------------------------------------------------------- /tests/testthat/data/test_big_exclusions.csv: -------------------------------------------------------------------------------- 1 | reference_date,report_date,state,disease 2 | 2025-04-01,2025-04-02,AL,COVID-19 3 | 2025-04-01,2025-04-02,AK,COVID-19 4 | 2025-04-01,2025-04-02,AZ,COVID-19 5 | 2025-04-01,2025-04-02,CA,COVID-19 6 | 2025-04-01,2025-04-02,CO,COVID-19 7 | 2025-04-01,2025-04-02,CT,COVID-19 8 | 2025-04-01,2025-04-02,DE,COVID-19 9 | 2025-04-01,2025-04-02,DC,COVID-19 10 | 2025-04-01,2025-04-02,FL,COVID-19 11 | 2025-04-01,2025-04-02,HI,COVID-19 12 | 2025-04-01,2025-04-02,ID,COVID-19 13 | 2025-04-01,2025-04-02,IL,COVID-19 14 | 2025-04-01,2025-04-02,IN,COVID-19 15 | 2025-04-01,2025-04-02,IA,COVID-19 16 | 2025-04-01,2025-04-02,KS,COVID-19 17 | 2025-04-01,2025-04-02,KY,COVID-19 18 | 2025-04-01,2025-04-02,LA,COVID-19 19 | 2025-04-01,2025-04-02,ME,COVID-19 20 | 2025-04-01,2025-04-02,MD,COVID-19 21 | 2025-04-01,2025-04-02,MA,COVID-19 22 | 2025-04-01,2025-04-02,MI,COVID-19 23 | 2025-04-01,2025-04-02,MN,COVID-19 24 | 2025-04-01,2025-04-02,MS,COVID-19 25 | 2025-04-01,2025-04-02,MO,COVID-19 26 | 2025-04-01,2025-04-02,MT,COVID-19 27 | 2025-04-01,2025-04-02,NE,COVID-19 28 | 2025-04-01,2025-04-02,NV,COVID-19 29 | 2025-04-01,2025-04-02,NH,COVID-19 30 | 2025-04-01,2025-04-02,NJ,COVID-19 31 | 2025-04-01,2025-04-02,NM,COVID-19 32 | 2025-04-01,2025-04-02,NY,COVID-19 33 | 2025-04-01,2025-04-02,ND,COVID-19 34 | 2025-03-31,2025-04-02,OH,COVID-19 35 | 2025-04-01,2025-04-02,OH,COVID-19 36 | 2025-04-01,2025-04-02,OK,COVID-19 37 | 2025-04-01,2025-04-02,OR,COVID-19 38 | 2025-04-01,2025-04-02,PA,COVID-19 39 | 2025-04-01,2025-04-02,RI,COVID-19 40 | 2025-04-01,2025-04-02,SD,COVID-19 41 | 2025-04-01,2025-04-02,TN,COVID-19 42 | 2025-04-01,2025-04-02,TX,COVID-19 43 | 2025-04-01,2025-04-02,US,COVID-19 44 | 2025-03-31,2025-04-02,UT,COVID-19 45 | 2025-04-01,2025-04-02,UT,COVID-19 46 | 2025-04-01,2025-04-02,VT,COVID-19 47 | 2025-04-01,2025-04-02,VA,COVID-19 48 | 2025-04-01,2025-04-02,WA,COVID-19 49 | 2025-04-01,2025-04-02,WV,COVID-19 50 | 2025-04-01,2025-04-02,WI,COVID-19 51 | 2025-04-01,2025-04-02,AL,Influenza 52 | 2025-04-01,2025-04-02,AK,Influenza 53 | 2025-04-01,2025-04-02,AZ,Influenza 54 | 2025-04-01,2025-04-02,CA,Influenza 55 | 2025-04-01,2025-04-02,CO,Influenza 56 | 2025-04-01,2025-04-02,CT,Influenza 57 | 2025-04-01,2025-04-02,DE,Influenza 58 | 2025-04-01,2025-04-02,DC,Influenza 59 | 2025-04-01,2025-04-02,FL,Influenza 60 | 2025-04-01,2025-04-02,HI,Influenza 61 | 2025-04-01,2025-04-02,ID,Influenza 62 | 2025-04-01,2025-04-02,IL,Influenza 63 | 2025-04-01,2025-04-02,IN,Influenza 64 | 2025-04-01,2025-04-02,IA,Influenza 65 | 2025-04-01,2025-04-02,KS,Influenza 66 | 2025-04-01,2025-04-02,KY,Influenza 67 | 2025-04-01,2025-04-02,LA,Influenza 68 | 
2025-04-01,2025-04-02,ME,Influenza 69 | 2025-04-01,2025-04-02,MD,Influenza 70 | 2025-04-01,2025-04-02,MA,Influenza 71 | 2025-04-01,2025-04-02,MI,Influenza 72 | 2025-04-01,2025-04-02,MN,Influenza 73 | 2025-04-01,2025-04-02,MS,Influenza 74 | 2025-04-01,2025-04-02,MO,Influenza 75 | 2025-04-01,2025-04-02,MT,Influenza 76 | 2025-04-01,2025-04-02,NE,Influenza 77 | 2025-04-01,2025-04-02,NV,Influenza 78 | 2025-04-01,2025-04-02,NH,Influenza 79 | 2025-04-01,2025-04-02,NJ,Influenza 80 | 2025-04-01,2025-04-02,NM,Influenza 81 | 2025-04-01,2025-04-02,NY,Influenza 82 | 2025-04-01,2025-04-02,ND,Influenza 83 | 2025-03-31,2025-04-02,OH,Influenza 84 | 2025-04-01,2025-04-02,OH,Influenza 85 | 2025-04-01,2025-04-02,OK,Influenza 86 | 2025-04-01,2025-04-02,OR,Influenza 87 | 2025-04-01,2025-04-02,PA,Influenza 88 | 2025-04-01,2025-04-02,RI,Influenza 89 | 2025-04-01,2025-04-02,SD,Influenza 90 | 2025-04-01,2025-04-02,TN,Influenza 91 | 2025-04-01,2025-04-02,TX,Influenza 92 | 2025-04-01,2025-04-02,US,Influenza 93 | 2025-03-31,2025-04-02,UT,Influenza 94 | 2025-04-01,2025-04-02,UT,Influenza 95 | 2025-04-01,2025-04-02,VT,Influenza 96 | 2025-04-01,2025-04-02,VA,Influenza 97 | 2025-04-01,2025-04-02,WA,Influenza 98 | 2025-04-01,2025-04-02,WV,Influenza 99 | 2025-04-01,2025-04-02,WI,Influenza 100 | -------------------------------------------------------------------------------- /tests/testthat/data/test_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CDCgov/cfa-epinow2-pipeline/349cbea090a01498bb32a810ae357875359ff8b2/tests/testthat/data/test_data.parquet -------------------------------------------------------------------------------- /tests/testthat/data/test_exclusions.csv: -------------------------------------------------------------------------------- 1 | reference_date,report_date,state,disease 2 | 2023-01-07,2023-10-28,test,test 3 | -------------------------------------------------------------------------------- /tests/testthat/data/test_parameters.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CDCgov/cfa-epinow2-pipeline/349cbea090a01498bb32a810ae357875359ff8b2/tests/testthat/data/test_parameters.parquet -------------------------------------------------------------------------------- /tests/testthat/data/us_overall_test_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CDCgov/cfa-epinow2-pipeline/349cbea090a01498bb32a810ae357875359ff8b2/tests/testthat/data/us_overall_test_data.parquet -------------------------------------------------------------------------------- /tests/testthat/data/v_bad_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "job_id": "6183da58-89bc-455f-8562-4f607257a876", 3 | "task_id": "bc0c3eb3-7158-4631-a2a9-86b97357f97e" 4 | } 5 | -------------------------------------------------------------------------------- /tests/testthat/helper-expect_pipeline_files_written.R: -------------------------------------------------------------------------------- 1 | expect_pipeline_files_written <- function( 2 | output_dir, 3 | job_id, 4 | task_id, 5 | check_logs = TRUE 6 | ) { 7 | ######## 8 | # Assert output files all exist 9 | job_path <- file.path(output_dir, job_id) 10 | task_path <- file.path(job_path, "tasks", task_id) 11 | 12 | # Samples 13 | expect_true( 14 | file.exists( 15 | file.path( 16 | job_path, 17 | "samples", 18 | 
paste0(task_id, ".parquet") 19 | ) 20 | ) 21 | ) 22 | # Summaries 23 | expect_true( 24 | file.exists( 25 | file.path( 26 | job_path, 27 | "summaries", 28 | paste0(task_id, ".parquet") 29 | ) 30 | ) 31 | ) 32 | # Model 33 | expect_true(file.exists(file.path(task_path, "model.rds"))) 34 | # Logs 35 | if (check_logs) { 36 | expect_true(file.exists(file.path(task_path, "logs.txt"))) 37 | } 38 | # Non-empty metadata 39 | metadata_path <- file.path(task_path, "metadata.json") 40 | expect_true(file.exists(metadata_path)) 41 | metadata <- jsonlite::read_json(metadata_path) 42 | expect_gt(length(metadata), 0) 43 | 44 | # Check that each field passes `rlang::is_atomic()` 45 | for (field in names(metadata)) { 46 | expect_true(rlang::is_atomic(metadata[[field]])) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/testthat/helper-write_exclusion.R: -------------------------------------------------------------------------------- 1 | write_exclusions <- function() { 2 | exclusions <- data.frame( 3 | reference_date = as.Date("2023-01-07"), 4 | report_date = as.Date("2023-10-28"), 5 | state = "test", 6 | disease = "test" 7 | ) 8 | con <- DBI::dbConnect(duckdb::duckdb()) 9 | duckdb::duckdb_register(con, "exclusions", exclusions) 10 | DBI::dbExecute( 11 | con, 12 | "COPY (SELECT * FROM exclusions) 13 | TO 'data/test_exclusions.csv'" 14 | ) 15 | } 16 | -------------------------------------------------------------------------------- /tests/testthat/helper-write_parameter_file.R: -------------------------------------------------------------------------------- 1 | write_sample_parameters_file <- function( 2 | value, 3 | path, 4 | state, 5 | param, 6 | disease, 7 | parameter, 8 | start_date, 9 | end_date, 10 | geo_value, 11 | reference_date 12 | ) { 13 | Sys.sleep(0.05) 14 | df <- data.frame( 15 | start_date = as.Date(start_date), 16 | disease = disease, 17 | parameter = parameter, 18 | end_date = end_date, 19 | geo_value = geo_value, 20 | value = I(list(value)), 21 | reference_date = reference_date 22 | ) 23 | 24 | con <- DBI::dbConnect(duckdb::duckdb()) 25 | on.exit(DBI::dbDisconnect(con), add = TRUE) 26 | 27 | duckdb::duckdb_register(con, "test_table", df) 28 | sql <- "COPY (SELECT * FROM test_table) TO ?path" 29 | query <- DBI::sqlInterpolate( 30 | DBI::ANSI(), 31 | sql, 32 | path = DBI::dbQuoteIdentifier(DBI::ANSI(), path) 33 | ) 34 | 35 | # Retry a few times because DuckDB throws std::exception intermittently. 36 | # This seems like a bug in DuckDB coming from on.exit not always closing the 37 | # connection in case of error and/or the many layers of filesystem runner 38 | # involved in writing this temp file. Rather than think too hard about it, 39 | # this is the sledgehammer approach. 40 | attempt <- 0 41 | success <- NULL 42 | while (attempt < 5 && is.null(success)) { 43 | attempt <- attempt + 1 44 | try(success <- DBI::dbExecute(con, query)) 45 | } 46 | 47 | invisible(path) 48 | } 49 | -------------------------------------------------------------------------------- /tests/testthat/test-diagnostics.R: -------------------------------------------------------------------------------- 1 | test_that("Fitted model extracts diagnostics", { 2 | # Arrange 3 | data_path <- test_path("data/test_data.parquet") 4 | con <- DBI::dbConnect(duckdb::duckdb()) 5 | data <- DBI::dbGetQuery( 6 | con, 7 | " 8 | SELECT 9 | report_date, 10 | reference_date, 11 | disease, 12 | geo_value AS state_abb, 13 | value AS confirm 14 | FROM read_parquet(?) 
15 | WHERE reference_date <= '2023-01-22'", 16 | params = list(data_path) 17 | ) 18 | DBI::dbDisconnect(con) 19 | fit_path <- test_path("data", "sample_fit.rds") 20 | fit <- readRDS(fit_path) 21 | 22 | # Expected diagnostics 23 | expected <- data.frame( 24 | diagnostic = c( 25 | "mean_accept_stat", 26 | "p_divergent", 27 | "n_divergent", 28 | "p_max_treedepth", 29 | "p_high_rhat", 30 | "n_high_rhat", 31 | "diagnostic_flag", 32 | "low_case_count_flag" 33 | ), 34 | value = c( 35 | 0.94240233, 36 | 0.00000000, 37 | 0.00000000, 38 | 0.00000000, 39 | 0.00000000, 40 | 0.00000000, 41 | 0.00000000, 42 | 0.00000000 43 | ), 44 | job_id = rep("test", 8), 45 | task_id = rep("test", 8), 46 | disease = rep("test", 8), 47 | geo_value = rep("test", 8), 48 | model = rep("test", 8), 49 | stringsAsFactors = FALSE 50 | ) 51 | actual <- extract_diagnostics( 52 | fit, 53 | data, 54 | "test", 55 | "test", 56 | "test", 57 | "test", 58 | "test" 59 | ) 60 | 61 | testthat::expect_equal( 62 | actual, 63 | expected 64 | ) 65 | }) 66 | 67 | test_that("Cases below threshold returns TRUE", { 68 | # Arrange 69 | true_df <- data.frame( 70 | reference_date = seq.Date( 71 | from = as.Date("2023-01-01"), 72 | by = "day", 73 | length.out = 14 74 | ), 75 | confirm = c(9, rep(0, 12), 9) 76 | ) 77 | 78 | # Act 79 | diagnostic <- low_case_count_diagnostic(true_df) 80 | 81 | # Assert 82 | expect_true(diagnostic) 83 | }) 84 | 85 | test_that("Cases above threshold returns FALSE", { 86 | # Arrange 87 | false_df <- data.frame( 88 | reference_date = seq.Date( 89 | from = as.Date("2023-01-01"), 90 | by = "day", 91 | length.out = 14 92 | ), 93 | confirm = rep(10, 14) 94 | ) 95 | 96 | # Act 97 | diagnostic <- low_case_count_diagnostic(false_df) 98 | 99 | # Assert 100 | expect_false(diagnostic) 101 | }) 102 | 103 | 104 | test_that("Only the last two weeks are evalated", { 105 | # Arrange 106 | # 3 weeks, first week would pass but last week does not 107 | df <- data.frame( 108 | reference_date = seq.Date( 109 | from = as.Date("2023-01-01"), 110 | by = "day", 111 | length.out = 21 112 | ), 113 | # Week 1: 700, Week 2: 700, Week 3: 0 114 | confirm = c(rep(100, 14), rep(0, 7)) 115 | ) 116 | 117 | # Act 118 | diagnostic <- low_case_count_diagnostic(df) 119 | 120 | # Assert 121 | expect_true(diagnostic) 122 | }) 123 | 124 | test_that("Old approach's negative is now positive", { 125 | # Arrange 126 | df <- data.frame( 127 | reference_date = seq.Date( 128 | from = as.Date("2023-01-01"), 129 | by = "day", 130 | length.out = 14 131 | ), 132 | # Week 1: 21, Week 2: 0 133 | confirm = c(rep(3, 7), rep(0, 7)) 134 | ) 135 | 136 | # Act 137 | diagnostic <- low_case_count_diagnostic(df) 138 | 139 | # Assert 140 | expect_true(diagnostic) 141 | }) 142 | 143 | test_that("NAs are evalated as 0", { 144 | # Arrange 145 | df <- data.frame( 146 | reference_date = seq.Date( 147 | from = as.Date("2023-01-01"), 148 | by = "day", 149 | length.out = 14 150 | ), 151 | # Week 1: 6 (not NA!), Week 2: 700 152 | confirm = c(NA_real_, rep(1, 6), rep(100, 7)) 153 | ) 154 | 155 | # Act 156 | diagnostic <- low_case_count_diagnostic(df) 157 | 158 | # Assert 159 | expect_true(diagnostic) 160 | }) 161 | -------------------------------------------------------------------------------- /tests/testthat/test-exclusions.R: -------------------------------------------------------------------------------- 1 | test_that("Can apply exclusions on happy path", { 2 | exclusions <- data.frame( 3 | reference_date = as.Date("2023-01-06"), 4 | report_date = as.Date("2023-10-28"), 5 | geo_value = 
"test", 6 | disease = "test" 7 | ) 8 | data_path <- test_path("data", "test_data.parquet") 9 | con <- DBI::dbConnect(duckdb::duckdb()) 10 | data <- DBI::dbGetQuery( 11 | con, 12 | " 13 | SELECT 14 | report_date, 15 | reference_date, 16 | disease, 17 | geo_value, 18 | value AS confirm 19 | FROM read_parquet(?)", 20 | params = list(data_path) 21 | ) 22 | DBI::dbDisconnect(con) 23 | 24 | # Apply exclusion by hand 25 | expected <- data 26 | expected[ 27 | expected[["reference_date"]] == "2023-01-06", 28 | ][["confirm"]] <- NA 29 | 30 | # Act 31 | actual <- apply_exclusions( 32 | cases = data, 33 | exclusions = exclusions 34 | ) 35 | 36 | expect_equal(actual, expected) 37 | }) 38 | 39 | test_that("Can read exclusions on happy path", { 40 | expected <- data.frame( 41 | reference_date = as.Date("2023-01-01"), 42 | report_date = as.Date("2023-01-02"), 43 | geo_value = "test", 44 | disease = "test" 45 | ) 46 | 47 | con <- DBI::dbConnect(duckdb::duckdb()) 48 | on.exit(DBI::dbDisconnect(con)) 49 | duckdb::duckdb_register(con, "exclusions", expected) 50 | 51 | withr::with_tempdir({ 52 | DBI::dbExecute( 53 | con, 54 | " 55 | COPY ( 56 | SELECT 57 | reference_date, 58 | report_date, 59 | geo_value AS state, 60 | disease 61 | FROM exclusions 62 | ) TO 'test.csv'" 63 | ) 64 | 65 | actual <- read_exclusions("test.csv") 66 | }) 67 | 68 | expect_equal(actual, expected) 69 | }) 70 | 71 | test_that("Empty read errors", { 72 | expected <- data.frame( 73 | reference_date = character(), 74 | report_date = character(), 75 | state = character(), 76 | disease = character() 77 | ) 78 | 79 | con <- DBI::dbConnect(duckdb::duckdb()) 80 | on.exit(DBI::dbDisconnect(con)) 81 | duckdb::duckdb_register(con, "exclusions", expected) 82 | 83 | withr::with_tempdir({ 84 | DBI::dbExecute(con, "COPY (FROM exclusions) TO 'test.csv'") 85 | 86 | expect_error(read_exclusions("test.csv"), class = "empty_return") 87 | }) 88 | }) 89 | 90 | test_that("Missing file errors", { 91 | expect_error( 92 | read_exclusions(path = "not_a_real_path"), 93 | class = "file_not_found" 94 | ) 95 | }) 96 | 97 | test_that("Bad query errors", { 98 | expect_error( 99 | read_exclusions(path = "test-exclusions.R"), 100 | class = "wrapped_invalid_query" 101 | ) 102 | }) 103 | 104 | test_that("Works as expected on large exclusions file", { 105 | # Read in the large exclusions file 106 | excl_path <- test_path("data", "test_big_exclusions.csv") 107 | exclusions <- read_exclusions(excl_path) 108 | 109 | # Load some sample case data 110 | data_path <- test_path("data", "2025-04-02_test.parquet") 111 | cases <- read_data( 112 | data_path, 113 | disease = "COVID-19", 114 | geo_value = "OH", 115 | report_date = "2025-04-02", 116 | max_reference_date = "2025-04-02", 117 | min_reference_date = "1970-01-01" 118 | ) 119 | 120 | # Apply the exclusions 121 | got <- apply_exclusions(cases, exclusions) 122 | 123 | # Check that the exclusions were applied as expected 124 | expect_equal( 125 | got$confirm[179:181], 126 | c(54, NA, NA) 127 | ) 128 | }) 129 | -------------------------------------------------------------------------------- /tests/testthat/test-fit_model.R: -------------------------------------------------------------------------------- 1 | test_that("Minimal model fit all params runs", { 2 | # Parameters 3 | parameters <- list( 4 | generation_interval = sir_gt_pmf, 5 | delay_interval = c(0.2, 0.8), 6 | right_truncation = c(0.7, 0.3) 7 | ) 8 | # Data -- 5 points only 9 | data_path <- test_path("data", "test_data.parquet") 10 | con <- DBI::dbConnect(duckdb::duckdb()) 
11 | data <- DBI::dbGetQuery( 12 | con, 13 | " 14 | SELECT 15 | report_date, 16 | reference_date, 17 | disease, 18 | geo_value AS state_abb, 19 | value AS confirm 20 | FROM read_parquet(?) 21 | ORDER BY reference_date 22 | LIMIT 5 23 | ", 24 | params = list(data_path) 25 | ) 26 | DBI::dbDisconnect(con) 27 | # Priors 28 | priors <- list( 29 | rt = list( 30 | mean = 1, 31 | sd = 0.2 32 | ), 33 | gp = list( 34 | alpha_sd = 0.05 35 | ) 36 | ) 37 | # Sampler 38 | sampler_opts <- list( 39 | cores = 1, 40 | chains = 1, 41 | adapt_delta = 0.8, 42 | max_treedepth = 10, 43 | iter_warmup = 25, 44 | iter_sampling = 25 45 | ) 46 | 47 | fit <- fit_model( 48 | data = data, 49 | parameters = parameters, 50 | seed = 12345, 51 | horizon = 0, 52 | priors = priors, 53 | sampler = sampler_opts 54 | ) 55 | 56 | expect_s3_class(fit, "epinow") 57 | }) 58 | 59 | test_that("Minimal model fit with no right trunc or delay runs", { 60 | # Parameters 61 | parameters <- list( 62 | generation_interval = sir_gt_pmf, 63 | delay_interval = NA, 64 | right_truncation = NA 65 | ) 66 | # Data -- 5 points only 67 | data_path <- test_path("data", "test_data.parquet") 68 | con <- DBI::dbConnect(duckdb::duckdb()) 69 | data <- DBI::dbGetQuery( 70 | con, 71 | " 72 | SELECT 73 | report_date, 74 | reference_date, 75 | disease, 76 | geo_value AS state_abb, 77 | value AS confirm 78 | FROM read_parquet(?) 79 | ORDER BY reference_date 80 | LIMIT 5 81 | ", 82 | params = list(data_path) 83 | ) 84 | DBI::dbDisconnect(con) 85 | # Priors 86 | priors <- list( 87 | rt = list( 88 | mean = 1, 89 | sd = 0.2 90 | ), 91 | gp = list( 92 | alpha_sd = 0.05 93 | ) 94 | ) 95 | # Sampler 96 | sampler_opts <- list( 97 | cores = 1, 98 | chains = 1, 99 | adapt_delta = 0.8, 100 | max_treedepth = 10, 101 | iter_warmup = 25, 102 | iter_sampling = 25 103 | ) 104 | 105 | fit <- fit_model( 106 | data = data, 107 | parameters = parameters, 108 | seed = 12345, 109 | horizon = 0, 110 | priors = priors, 111 | sampler = sampler_opts 112 | ) 113 | 114 | expect_s3_class(fit, "epinow") 115 | }) 116 | 117 | test_that("Bad params w/ failing fit issues warning and returns NA", { 118 | # Parameterization is same as above except Stan argument `iter_warmup` is 119 | # negative, which is an illegal parameterizaion. As a result, EpiNow2 starts 120 | # the Stan sampler but it terminates unexpectedly with an error, which is the 121 | # desired testing condition. 122 | 123 | # Parameters 124 | parameters <- list( 125 | generation_interval = sir_gt_pmf, 126 | delay_interval = NA, 127 | right_truncation = NA 128 | ) 129 | # Data -- 5 points only 130 | data_path <- test_path("data", "test_data.parquet") 131 | con <- DBI::dbConnect(duckdb::duckdb()) 132 | data <- DBI::dbGetQuery( 133 | con, 134 | " 135 | SELECT 136 | report_date, 137 | reference_date, 138 | disease, 139 | geo_value AS state_abb, 140 | value AS confirm 141 | FROM read_parquet(?) 
142 | ORDER BY reference_date 143 | LIMIT 5 144 | ", 145 | params = list(data_path) 146 | ) 147 | DBI::dbDisconnect(con) 148 | # Priors 149 | priors <- list( 150 | rt = list( 151 | mean = 1, 152 | sd = 0.2 153 | ), 154 | gp = list( 155 | alpha_sd = 0.05 156 | ) 157 | ) 158 | # Sampler 159 | sampler_opts <- list( 160 | cores = 1, 161 | chains = 1, 162 | adapt_delta = 0.8, 163 | max_treedepth = 10, 164 | iter_warmup = -25, 165 | iter_sampling = 25 166 | ) 167 | 168 | expect_error( 169 | fit <- fit_model( 170 | data = data, 171 | parameters = parameters, 172 | seed = 12345, 173 | horizon = 0, 174 | priors = priors, 175 | sampler = sampler_opts 176 | ), 177 | class = "failing_fit" 178 | ) 179 | }) 180 | 181 | test_that("Right truncation longer than data throws error", { 182 | data <- data.frame(x = c(1, 2)) 183 | right_truncation_pmf <- c(0.1, 0.2, 0.7) 184 | 185 | expect_snapshot_warning( 186 | format_right_truncation( 187 | right_truncation_pmf, 188 | data 189 | ) 190 | ) 191 | }) 192 | 193 | test_that("Missing GI throws error", { 194 | expect_error(format_generation_interval(NA), class = "Missing_GI") 195 | }) 196 | 197 | test_that("Missing keys throws error", { 198 | random_seed <- 12345 199 | expect_snapshot(format_stan_opts(list(), random_seed), error = TRUE) 200 | }) 201 | -------------------------------------------------------------------------------- /tests/testthat/test-pipeline.R: -------------------------------------------------------------------------------- 1 | test_that("Bad config throws warning and returns failure", { 2 | # Arrange 3 | config_path <- test_path("data", "bad_config.json") 4 | config <- jsonlite::read_json(config_path) 5 | # Read from locally 6 | output_container <- NULL 7 | output_dir <- "pipeline_test" 8 | input_dir <- "." 9 | on.exit(unlink(output_dir, recursive = TRUE)) 10 | 11 | # Act 12 | expect_warning( 13 | pipeline_success <- orchestrate_pipeline( 14 | config_path = config_path, 15 | input_dir = input_dir, 16 | output_dir = output_dir 17 | ), 18 | class = "Bad_config" 19 | ) 20 | expect_false(pipeline_success) 21 | }) 22 | 23 | test_that("Pipeline run produces expected outputs with NO exclusions", { 24 | # Arrange 25 | config_path <- test_path("data", "sample_config_no_exclusion.json") 26 | config <- jsonlite::read_json(config_path) 27 | # Read from locally 28 | output_container <- NULL 29 | output_dir <- "pipeline_test" 30 | input_dir <- "." 
31 | on.exit(unlink(output_dir, recursive = TRUE)) 32 | 33 | # Act 34 | pipeline_success <- orchestrate_pipeline( 35 | config_path = config_path, 36 | input_dir = input_dir, 37 | output_dir = output_dir 38 | ) 39 | expect_true(pipeline_success) 40 | expect_pipeline_files_written( 41 | output_dir, 42 | config[["job_id"]], 43 | config[["task_id"]] 44 | ) 45 | }) 46 | 47 | test_that("Pipeline run produces expected outputs with exclusions", { 48 | # Arrange 49 | input_dir <- test_path("data") 50 | config_path <- "sample_config_with_exclusion.json" 51 | config <- jsonlite::read_json(file.path(input_dir, config_path)) 52 | # Read from locally 53 | output_container <- NULL 54 | output_dir <- "pipeline_test" 55 | on.exit(unlink(output_dir, recursive = TRUE)) 56 | 57 | # Act 58 | pipeline_success <- orchestrate_pipeline( 59 | config_path = config_path, 60 | input_dir = input_dir, 61 | output_dir = output_dir 62 | ) 63 | 64 | ######## 65 | # Assert output files all exist 66 | expect_pipeline_files_written( 67 | output_dir, 68 | config[["job_id"]], 69 | config[["task_id"]] 70 | ) 71 | expect_true(pipeline_success) 72 | }) 73 | 74 | test_that("Process pipeline produces expected outputs and returns success", { 75 | # Arrange 76 | input_dir <- "data" 77 | config_path <- file.path(input_dir, "sample_config_with_exclusion.json") 78 | config <- read_json_into_config( 79 | config_path, 80 | c("exclusions", "output_container") 81 | ) 82 | # Read from locally 83 | output_dir <- "pipeline_test" 84 | on.exit(unlink(output_dir, recursive = TRUE)) 85 | 86 | # Act 87 | pipeline_success <- execute_model_logic( 88 | config = config, 89 | input_dir = input_dir, 90 | output_dir = output_dir 91 | ) 92 | expect_true(pipeline_success) 93 | 94 | ######## 95 | # Assert output files all exist 96 | expect_pipeline_files_written( 97 | output_dir, 98 | config@job_id, 99 | config@task_id, 100 | # Don't check logs here, bc logs are set up by orchestrate_pipeline(), but 101 | # this test is just for execute_model_logic() which is called after logs are 102 | # set up in orchestrate_pipeline(). 103 | check_logs = FALSE 104 | ) 105 | }) 106 | 107 | test_that("Runs on config from generator as of 2024-11-26", { 108 | # Arrange 109 | config_path <- "CA_COVID-19.json" 110 | input_dir <- test_path("data") 111 | config <- read_json_into_config( 112 | file.path(input_dir, config_path), 113 | c("exclusions", "output_container") 114 | ) 115 | # Read from locally 116 | output_dir <- test_path("pipeline_test") 117 | on.exit(unlink(output_dir, recursive = TRUE)) 118 | 119 | # Act 120 | pipeline_success <- execute_model_logic( 121 | config = config, 122 | output_dir = output_dir, 123 | input_dir = input_dir 124 | ) 125 | expect_true(pipeline_success) 126 | 127 | ######## 128 | # Assert output files all exist 129 | expect_pipeline_files_written( 130 | output_dir, 131 | config@job_id, 132 | config@task_id, 133 | # Do not check for log output here, bc logs get created in 134 | # `orchestrate_pipeline()`, and this test only calls `execute_model_logic()` 135 | # which gets called after the log files have been created. 
136 | check_logs = FALSE 137 | ) 138 | }) 139 | 140 | test_that("Warning and exit for bad config file", { 141 | # Arrange 142 | config_path <- test_path("v_bad_config.json") 143 | # Read from locally 144 | input_dir <- test_path("data") 145 | output_dir <- test_path("bad_output") 146 | on.exit(unlink(output_dir, recursive = TRUE)) 147 | 148 | # Act 149 | expect_warning( 150 | pipeline_success <- orchestrate_pipeline( 151 | config_path = config_path, 152 | input_dir = input_dir, 153 | output_dir = output_dir 154 | ), 155 | class = "Bad_config" 156 | ) 157 | expect_false(pipeline_success) 158 | }) 159 | -------------------------------------------------------------------------------- /tests/testthat/test-read_data.R: -------------------------------------------------------------------------------- 1 | test_that("Data read for one state works on happy path", { 2 | data_path <- test_path("data/test_data.parquet") 3 | con <- DBI::dbConnect(duckdb::duckdb()) 4 | expected <- DBI::dbGetQuery( 5 | con, 6 | " 7 | SELECT 8 | report_date, 9 | reference_date, 10 | disease, 11 | geo_value AS geo_value, 12 | value AS confirm 13 | FROM read_parquet(?) 14 | WHERE reference_date <= '2023-01-22'", 15 | params = list(data_path) 16 | ) 17 | DBI::dbDisconnect(con) 18 | 19 | actual <- read_data( 20 | data_path, 21 | disease = "test", 22 | geo_value = "test", 23 | report_date = "2023-10-28", 24 | min_reference_date = as.Date("2023-01-02"), 25 | max_reference_date = "2023-01-22" 26 | ) 27 | 28 | expect_equal(actual, expected) 29 | }) 30 | 31 | test_that("Data read for US overall works on happy path", { 32 | data_path <- test_path("data/us_overall_test_data.parquet") 33 | con <- DBI::dbConnect(duckdb::duckdb()) 34 | expected <- DBI::dbGetQuery( 35 | con, 36 | " 37 | SELECT 38 | report_date, 39 | reference_date, 40 | disease, 41 | geo_value AS geo_value, 42 | value AS confirm 43 | FROM read_parquet(?) 
44 | WHERE reference_date <= '2023-01-22'", 45 | params = list(data_path) 46 | ) 47 | DBI::dbDisconnect(con) 48 | 49 | actual <- read_data( 50 | data_path, 51 | disease = "test", 52 | geo_value = "US", 53 | report_date = "2023-10-28", 54 | min_reference_date = "2023-01-02", 55 | max_reference_date = "2023-01-22" 56 | ) 57 | 58 | expect_equal(actual, expected) 59 | }) 60 | 61 | test_that("Reading a file that doesn't exist fails", { 62 | data_path <- "not_a_real_file" 63 | expect_error( 64 | read_data( 65 | data_path, 66 | disease = "test", 67 | geo_value = "not_a_real_state", 68 | report_date = "2023-10-28", 69 | min_reference_date = "2023-01-02", 70 | max_reference_date = "2023-01-22" 71 | ), 72 | class = "file_not_found" 73 | ) 74 | }) 75 | 76 | test_that("A query with no matching return fails", { 77 | data_path <- test_path("data/us_overall_test_data.parquet") 78 | expect_error( 79 | read_data( 80 | data_path, 81 | disease = "test", 82 | geo_value = "not_a_real_state", 83 | report_date = "2023-10-28", 84 | min_reference_date = "2023-01-02", 85 | max_reference_date = "2023-01-22" 86 | ), 87 | class = "empty_return" 88 | ) 89 | }) 90 | 91 | test_that("An invalid query throws a wrapped error", { 92 | # point the query at a non-parquet file 93 | data_path <- test_path("test-read_data.R") 94 | expect_error( 95 | read_data( 96 | data_path, 97 | disease = "test", 98 | geo_value = "not_a_real_state", 99 | report_date = "2023-10-28", 100 | min_reference_date = "2023-01-02", 101 | max_reference_date = "2023-01-22" 102 | ), 103 | class = "wrapped_invalid_query" 104 | ) 105 | }) 106 | 107 | test_that("Incomplete return throws warning", { 108 | data_path <- test_path("data/test_data.parquet") 109 | 110 | # Two missing dates 111 | expect_snapshot_warning( 112 | read_data( 113 | data_path, 114 | disease = "test", 115 | geo_value = "test", 116 | report_date = "2023-10-28", 117 | min_reference_date = "2022-12-31", 118 | max_reference_date = "2023-01-22" 119 | ), 120 | class = "incomplete_return" 121 | ) 122 | }) 123 | 124 | test_that("Replace COVID-19/Omicron with COVID-19, one state", { 125 | data_path <- test_path("data/CA_test.parquet") 126 | 127 | actual <- read_data( 128 | data_path, 129 | disease = "COVID-19", 130 | geo_value = "CA", 131 | report_date = "2024-11-26", 132 | min_reference_date = as.Date("2024-06-01"), 133 | max_reference_date = "2024-11-25" 134 | ) 135 | 136 | # Expect that there should be no "COVID-19/Omicron" in the data, 137 | # only "COVID-19" 138 | expect_false("COVID-19/Omicron" %in% actual$disease) 139 | expect_true(all(actual$disease == "COVID-19")) 140 | }) 141 | 142 | 143 | test_that("Replace COVID-19/Omicron with COVID-19, US", { 144 | data_path <- test_path("data/CA_test.parquet") 145 | 146 | actual <- read_data( 147 | data_path, 148 | disease = "COVID-19", 149 | geo_value = "US", 150 | report_date = "2024-11-26", 151 | min_reference_date = as.Date("2024-06-01"), 152 | max_reference_date = "2024-11-25" 153 | ) 154 | 155 | # Expect that there should be no "COVID-19/Omicron" in the data, 156 | # only "COVID-19" 157 | expect_false("COVID-19/Omicron" %in% actual$disease) 158 | expect_true(all(actual$disease == "COVID-19")) 159 | }) 160 | -------------------------------------------------------------------------------- /thanks.md: -------------------------------------------------------------------------------- 1 | # Thanks and Acknowledgements 2 | 3 | Starting this file way too late, but wanted to recognize contributions made by people who helped this repo. 
There are many more than this, but I should have started this file years ago. 4 | 5 | * Chris Sandlin [@cssandlin](https://github.com/cssandlin) 6 | * Drewry Morris [@drewry](https://github.com/drewry) 7 | -------------------------------------------------------------------------------- /utils/Rt_review_exclusions.R: -------------------------------------------------------------------------------- 1 | option_list <- list( 2 | optparse::make_option( 3 | c("-d", "--dates"), 4 | type = "character", 5 | default = gsub( 6 | "-", 7 | "", 8 | lubridate::today(tzone = "UTC") 9 | ), 10 | help = "Reports Date in yyyymmdd format", 11 | metavar = "character" 12 | ) 13 | ) 14 | opt_parser <- optparse::OptionParser(option_list = option_list) 15 | opt <- optparse::parse_args(opt_parser) 16 | # Get All Files Names to Download and Parse 17 | date_names <- opt$dates 18 | 19 | 20 | read_process_excel_func <- function( 21 | sheet_name, 22 | pathogen, 23 | file_name, 24 | report_date 25 | ) { 26 | df <- readxl::read_excel( 27 | paste0(file_name), # path where saved 28 | sheet = sheet_name, 29 | skip = 3, 30 | col_names = c( 31 | "state", 32 | "dates_affected", 33 | "observed volume", 34 | "expected volume", 35 | "initial_thoughts", 36 | "state_abb", 37 | "review_1_decision", 38 | "reviewer_2_decision", 39 | "final_decision", 40 | "drop_dates", 41 | "additional_reasoning" 42 | ) 43 | ) 44 | df <- df |> dplyr::mutate(drop_dates = as.character(drop_dates)) 45 | df <- data.frame(tidyr::separate_rows(df, 10, sep = "\\|")) |> 46 | dplyr::filter(!is.na(state)) |> 47 | dplyr::mutate( 48 | report_date = report_date, 49 | pathogen = pathogen 50 | ) |> 51 | dplyr::select( 52 | "report_date", 53 | "state", 54 | "state_abb", 55 | "pathogen", 56 | "review_1_decision", 57 | "reviewer_2_decision", 58 | "final_decision", 59 | "drop_dates" 60 | ) 61 | return(df) 62 | } 63 | 64 | 65 | create_pt_excl_from_rt_xslx <- function(dates) { 66 | # Connect to Sharepoint via Microsoft365R library 67 | # Provide team name here 68 | site <- Microsoft365R::get_sharepoint_site( 69 | auth_type = "device_code", 70 | "OD-OCoS-Center for Forecasting and Outbreak Analytics" 71 | ) 72 | drv <- site$get_drive("Documents") # Set drive to Documents (vs Wiki) 73 | rt_review_path <- file.path( 74 | "General", 75 | "02 - Predict", 76 | "Real Time Monitoring (RTM) Branch", 77 | "Nowcasting and Natural History", 78 | "Rt", 79 | "NSSP-Rt", 80 | "Rt_Review_Notes", 81 | "Review_Decisions" 82 | ) 83 | 84 | for (report_date in dates) { 85 | fname <- paste0("Rt_Review_", report_date, ".xlsx") 86 | drv$get_item(file.path(rt_review_path, fname))$download( 87 | dest = paste0(fname), 88 | overwrite = TRUE 89 | ) 90 | # read and process the COVID sheet 91 | covid_df <- read_process_excel_func( 92 | sheet_name = "Rt_Review_COVID", 93 | pathogen = "covid", 94 | file_name = fname, 95 | report_date = report_date 96 | ) 97 | # read and process the Influenza sheet 98 | influenza_df <- read_process_excel_func( 99 | sheet_name = "Rt_Review_Influenza", 100 | pathogen = "influenza", 101 | file_name = fname, 102 | report_date = report_date 103 | ) 104 | # Overall Rt_review machine readable format 105 | combined_df <- rbind(covid_df, influenza_df) 106 | if (file.exists(paste0(fname))) { 107 | # Delete file if it exists 108 | file.remove(paste0(fname)) 109 | } 110 | # Further processing 111 | combined_df <- combined_df |> 112 | dplyr::mutate( 113 | reference_date = lubridate::ymd(drop_dates), 114 | report_date = lubridate::ymd(report_date), 115 | geo_value = state_abb, 116 | 
pathogen = dplyr::case_when( 117 | pathogen == "influenza" ~ "Influenza", 118 | pathogen == "covid" ~ "COVID-19", 119 | .default = as.character(pathogen) 120 | ) 121 | ) 122 | 123 | # point exclusions in outlier.csv format 124 | point_exclusions <- combined_df |> 125 | dplyr::filter(!is.na(drop_dates)) |> 126 | dplyr::mutate( 127 | raw_confirm = NA, 128 | clean_confirm = NA 129 | ) |> 130 | dplyr::select( 131 | reference_date, 132 | report_date, 133 | "state" = "geo_value", 134 | "disease" = "pathogen" 135 | ) 136 | container_name <- "nssp-etl" 137 | cont <- CFAEpiNow2Pipeline::fetch_blob_container(container_name) 138 | 139 | cli::cli_alert_info( 140 | "saving {lubridate::ymd(report_date)}.csv in 141 | {container_name}/outliers-v2" 142 | ) 143 | AzureStor::storage_write_csv( 144 | cont = cont, 145 | object = point_exclusions, 146 | file = file.path( 147 | "outliers-v2", 148 | paste0(lubridate::ymd(report_date), ".csv") 149 | ) 150 | ) 151 | 152 | #### State exclusions ##### 153 | state_exclusions <- combined_df |> 154 | dplyr::filter( 155 | final_decision %in% 156 | c( 157 | "Exclude State (Data)", 158 | "Exclude State (Model)", 159 | "Exclude State" 160 | ) 161 | ) |> 162 | dplyr::mutate( 163 | type = dplyr::case_when( 164 | final_decision == "Exclude State (Data)" ~ "Data", 165 | final_decision == "Exclude State (Model)" ~ "Model" 166 | ) 167 | ) |> 168 | dplyr::select(state_abb, pathogen, type) 169 | 170 | container_name <- "nssp-etl" 171 | cont <- CFAEpiNow2Pipeline::fetch_blob_container(container_name) 172 | file <- paste0(lubridate::ymd(report_date), "_state_exclusions.csv") 173 | cli::cli_alert_info( 174 | "saving {file} in {container_name}/state_exclusions" 175 | ) 176 | AzureStor::storage_write_csv( 177 | cont = cont, 178 | object = state_exclusions, 179 | file = file.path( 180 | "state_exclusions", 181 | file 182 | ) 183 | ) 184 | 185 | #### Temp old-pipeline csv generator##### 186 | # Save a version in temp folder. 187 | # Need to copy and paste this in current blank outlier csv file 188 | # Can get rid of this once we end old pipeline support 189 | point_exclusions <- combined_df |> 190 | dplyr::filter(!is.na(drop_dates)) |> 191 | dplyr::mutate( 192 | raw_confirm = NA, 193 | clean_confirm = NA 194 | ) |> 195 | dplyr::select( 196 | reference_date, 197 | report_date, 198 | "geo_value", 199 | "pathogen" 200 | ) |> 201 | dplyr::mutate( 202 | geo_value = tolower(geo_value), 203 | pathogen = dplyr::case_when( 204 | pathogen == "Influenza" ~ "flu", 205 | pathogen == "COVID-19" ~ "covid", 206 | .default = as.character(pathogen) 207 | ) 208 | ) 209 | cli::cli_alert_info( 210 | "saving {lubridate::ymd(report_date)}.csv in 211 | {container_name}/temp_outliers_for_old" 212 | ) 213 | AzureStor::storage_write_csv( 214 | cont = cont, 215 | object = point_exclusions, 216 | file = file.path( 217 | "temp_outliers_for_old", 218 | paste0(lubridate::ymd(report_date), ".csv") 219 | ) 220 | ) 221 | } 222 | } 223 | 224 | 225 | create_pt_excl_from_rt_xslx(dates = date_names) 226 | --------------------------------------------------------------------------------
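A minimal sketch of how `utils/Rt_review_exclusions.R` might be invoked from the command line, assuming the `CFAEpiNow2Pipeline` package and its dependencies are installed and that SharePoint (Microsoft365R device-code) and Azure blob credentials are available in the session. The date shown is only a hypothetical example of the `yyyymmdd` format the `--dates` option expects; it is not a date prescribed by the repository.

```bash
# Hypothetical invocation: pull the Rt review workbook for a single report date
# and write the point/state exclusion CSVs to blob storage, per the script above.
Rscript utils/Rt_review_exclusions.R --dates 20250402
```

If `--dates` is omitted, the script defaults to today's UTC date, as set in the `optparse` option definition.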